diff --git a/linear_bow.ipynb b/linear_bow.ipynb index ff0ff6f..b8db19b 100644 --- a/linear_bow.ipynb +++ b/linear_bow.ipynb @@ -140,79 +140,63 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ + "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", - "X = []\n", - "Y1 = []\n", - "Y2 = []\n", - "for review in reviews:\n", - " X.append(feature(review))\n", - " Y1.append(review[\"funny\"])\n", - " Y2.append(review[\"helpful_n\"])\n", "\n", - "X = np.array(X)\n", - "Y1 = np.array(Y1)\n", - "Y2 = np.array(Y2)" + "X_train, X_test, Y_funny_train, Y_funny_test, Y_helpful_train, Y_helpful_test = [], [], [], [], [], []\n", + "\n", + "train, test = train_test_split(reviews, test_size=0.25, random_state=0)\n", + "\n", + "for review in train:\n", + " X_train.append(feature(review))\n", + " #Y1.append(review[\"funny\"])\n", + " if review[\"funny\"] == 0:\n", + " Y_funny_train.append(0)\n", + " else:\n", + " Y_funny_train.append(np.log(review[\"funny\"]))\n", + " #Y2.append(review[\"helpful_n\"])\n", + " if review[\"helpful_n\"] == 0:\n", + " Y_helpful_train.append(0)\n", + " else:\n", + " Y_helpful_train.append(np.log(review[\"helpful_n\"]))\n", + "\n", + "for review in test:\n", + " X_test.append(feature(review))\n", + " #Y1.append(review[\"funny\"])\n", + " if review[\"funny\"] == 0:\n", + " Y_funny_test.append(0)\n", + " else:\n", + " Y_funny_test.append(np.log(review[\"funny\"]))\n", + " #Y2.append(review[\"helpful_n\"])\n", + " if review[\"helpful_n\"] == 0:\n", + " Y_helpful_test.append(0)\n", + " else:\n", + " Y_helpful_test.append(np.log(review[\"helpful_n\"]))\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "baseline 294.7309048565537 4.604634941766926\n" + "baseline 0.17818192454918605 0.557911382661004\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", - "guess_mean1 = np.mean(Y1)\n", - "guess_mean2 = np.mean(Y2)\n", + "guess_mean_funny = np.mean(Y_funny_train)\n", + "guess_mean_helpful = np.mean(Y_helpful_train)\n", "\n", - "print(\"baseline\", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_absolute_error(Y2, [guess_mean2]*len(Y2)))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.001 282.2541467007739 4.179655704428717\n", - "0.01 282.25415246942805 4.179600740282743\n", - "0.1 282.2546345232787 4.179072864682249\n", - "1 282.2692492511399 4.175349141167781\n", - "10 282.3721909589884 4.147935437500891\n", - "100 283.13132181376034 3.9883973026815065\n", - "1000 286.86570062121467 3.620101916467935\n" - ] - } - ], - "source": [ - "from sklearn import linear_model\n", - "\n", - "for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n", - "\n", - " model1 = linear_model.Ridge(C, fit_intercept=True)\n", - " model1.fit(X, Y1)\n", - "\n", - " model2 = linear_model.Ridge(C, fit_intercept=True)\n", - " model2.fit(X, Y2)\n", - "\n", - " predictions1 = model1.predict(X)\n", - " predictions2 = model1.predict(X)\n", - "\n", - " print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))" + "print(\"baseline\", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))" ] }, { @@ -224,26 +208,65 @@ "name": "stdout", "output_type": "stream", "text": [ - "1 294.7309048565537 3.2338225122785453\n", - "10 294.7309048565537 3.2338225122785453\n", - "100 294.7309048565537 3.2338225122785453\n", - "1000 294.7309048565537 3.2338225122785453\n" + "0.01 0.24665170508912013 0.7702414041912456\n", + "0.1 0.24578924150085898 0.7681419094613451\n", + "1 0.24248804203997093 0.7584811772506682\n", + "10 0.24888382029075776 0.7518311372299598\n", + "100 0.23060394844562843 0.6419885405134674\n" ] } ], "source": [ - "for C in [1, 10, 100, 1000]:\n", + "from sklearn import linear_model\n", "\n", - " model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", - " model1.fit(X, Y1)\n", + "Cs = [0.01, 0.1, 1, 10, 100]\n", "\n", - " model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", - " model2.fit(X, Y2)\n", + "for C in Cs:\n", "\n", - " predictions1 = model1.predict(X)\n", - " predictions2 = model1.predict(X)\n", + " model1 = linear_model.Ridge(C, fit_intercept=True)\n", + " model1.fit(X_train, Y_funny_train)\n", "\n", - " print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))" + " model2 = linear_model.Ridge(C, fit_intercept=True)\n", + " model2.fit(X_train, Y_helpful_train)\n", + "\n", + " pred_funny_test = model1.predict(X_test)\n", + " pred_helpful_test = model2.predict(X_test)\n", + "\n", + " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.01 0.17730058785614386 0.539258189636067\n", + "0.1 0.17818192454918605 0.543156420319067\n", + "1 0.17818192454918605 0.557911382661004\n", + "10 0.17818192454918605 0.557911382661004\n", + "100 0.17818192454918605 0.557911382661004\n" + ] + } + ], + "source": [ + "Cs = [0.01, 0.1, 1, 10, 100]\n", + "\n", + "for C in Cs:\n", + "\n", + " model1 = linear_model.Lasso(C, fit_intercept=True)\n", + " model1.fit(X_train, Y_funny_train)\n", + "\n", + " model2 = linear_model.Lasso(C, fit_intercept=True)\n", + " model2.fit(X_train, Y_helpful_train)\n", + "\n", + " pred_funny_test = model1.predict(X_test)\n", + " pred_helpful_test = model2.predict(X_test)\n", + "\n", + " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" ] }, { diff --git a/preliminary.ipynb b/preliminary.ipynb index e350a06..2328591 100644 --- a/preliminary.ipynb +++ b/preliminary.ipynb @@ -225,14 +225,6 @@ "plt.xlabel(\"Num. Total Ratings\")\n", "plt.title(\"Num. Helpful Ratings vs Num. Total Ratings\")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79ea84a9", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {