try linear models (marginal improvement)

This commit is contained in:
Arthur Lu 2023-11-29 16:47:49 -08:00
parent 5b9a700123
commit 45986a9568
2 changed files with 87 additions and 72 deletions

View File

@ -140,79 +140,63 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import numpy as np\n",
"X = []\n",
"Y1 = []\n",
"Y2 = []\n",
"for review in reviews:\n",
" X.append(feature(review))\n",
" Y1.append(review[\"funny\"])\n",
" Y2.append(review[\"helpful_n\"])\n",
"\n",
"X = np.array(X)\n",
"Y1 = np.array(Y1)\n",
"Y2 = np.array(Y2)"
"def log_target(count):\n",
"    \"\"\"Map a raw count to its modelling target: 0 stays 0, anything else becomes np.log(count).\"\"\"\n",
"    # NOTE(review): log(1) is also 0, so counts of 0 and 1 end up with the same target -- confirm this is intended.\n",
"    return 0 if count == 0 else np.log(count)\n",
"\n",
"\n",
"def build_split(rows):\n",
"    \"\"\"Return (features, funny targets, helpful targets) as three parallel lists for the given reviews.\"\"\"\n",
"    features, funny_targets, helpful_targets = [], [], []\n",
"    for row in rows:\n",
"        features.append(feature(row))\n",
"        funny_targets.append(log_target(row[\"funny\"]))\n",
"        helpful_targets.append(log_target(row[\"helpful_n\"]))\n",
"    return features, funny_targets, helpful_targets\n",
"\n",
"\n",
"# Hold out 25% of the reviews; the fixed random_state keeps the split reproducible.\n",
"train, test = train_test_split(reviews, test_size=0.25, random_state=0)\n",
"\n",
"X_train, Y_funny_train, Y_helpful_train = build_split(train)\n",
"X_test, Y_funny_test, Y_helpful_test = build_split(test)\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"baseline 294.7309048565537 4.604634941766926\n"
"baseline 0.17818192454918605 0.557911382661004\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"guess_mean1 = np.mean(Y1)\n",
"guess_mean2 = np.mean(Y2)\n",
"guess_mean_funny = np.mean(Y_funny_train)\n",
"guess_mean_helpful = np.mean(Y_helpful_train)\n",
"\n",
"print(\"baseline\", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_absolute_error(Y2, [guess_mean2]*len(Y2)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.001 282.2541467007739 4.179655704428717\n",
"0.01 282.25415246942805 4.179600740282743\n",
"0.1 282.2546345232787 4.179072864682249\n",
"1 282.2692492511399 4.175349141167781\n",
"10 282.3721909589884 4.147935437500891\n",
"100 283.13132181376034 3.9883973026815065\n",
"1000 286.86570062121467 3.620101916467935\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n",
"\n",
" model1 = linear_model.Ridge(C, fit_intercept=True)\n",
" model1.fit(X, Y1)\n",
"\n",
" model2 = linear_model.Ridge(C, fit_intercept=True)\n",
" model2.fit(X, Y2)\n",
"\n",
" predictions1 = model1.predict(X)\n",
" predictions2 = model1.predict(X)\n",
"\n",
" print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
"print(\"baseline\", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))"
]
},
{
@ -224,26 +208,65 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1 294.7309048565537 3.2338225122785453\n",
"10 294.7309048565537 3.2338225122785453\n",
"100 294.7309048565537 3.2338225122785453\n",
"1000 294.7309048565537 3.2338225122785453\n"
"0.01 0.24665170508912013 0.7702414041912456\n",
"0.1 0.24578924150085898 0.7681419094613451\n",
"1 0.24248804203997093 0.7584811772506682\n",
"10 0.24888382029075776 0.7518311372299598\n",
"100 0.23060394844562843 0.6419885405134674\n"
]
}
],
"source": [
"for C in [1, 10, 100, 1000]:\n",
"from sklearn import linear_model\n",
"\n",
" model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
" model1.fit(X, Y1)\n",
"Cs = [0.01, 0.1, 1, 10, 100]\n",
"\n",
" model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
" model2.fit(X, Y2)\n",
"for C in Cs:\n",
"\n",
" predictions1 = model1.predict(X)\n",
" predictions2 = model1.predict(X)\n",
" model1 = linear_model.Ridge(C, fit_intercept=True)\n",
" model1.fit(X_train, Y_funny_train)\n",
"\n",
" print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
" model2 = linear_model.Ridge(C, fit_intercept=True)\n",
" model2.fit(X_train, Y_helpful_train)\n",
"\n",
" pred_funny_test = model1.predict(X_test)\n",
" pred_helpful_test = model2.predict(X_test)\n",
"\n",
" print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.01 0.17730058785614386 0.539258189636067\n",
"0.1 0.17818192454918605 0.543156420319067\n",
"1 0.17818192454918605 0.557911382661004\n",
"10 0.17818192454918605 0.557911382661004\n",
"100 0.17818192454918605 0.557911382661004\n"
]
}
],
"source": [
"# Fit one Lasso model per target at each regularisation strength and print the held-out MSE of both.\n",
"Cs = [0.01, 0.1, 1, 10, 100]\n",
"\n",
"for C in Cs:\n",
"    # C is passed as Lasso's alpha, so larger values mean a stronger L1 penalty.\n",
"    model1 = linear_model.Lasso(alpha=C, fit_intercept=True).fit(X_train, Y_funny_train)\n",
"    model2 = linear_model.Lasso(alpha=C, fit_intercept=True).fit(X_train, Y_helpful_train)\n",
"\n",
"    pred_funny_test = model1.predict(X_test)\n",
"    pred_helpful_test = model2.predict(X_test)\n",
"\n",
"    mse_funny = mean_squared_error(Y_funny_test, pred_funny_test)\n",
"    mse_helpful = mean_squared_error(Y_helpful_test, pred_helpful_test)\n",
"    print(C, mse_funny, mse_helpful)"
]
},
{

View File

@ -225,14 +225,6 @@
"plt.xlabel(\"Num. Total Ratings\")\n",
"plt.title(\"Num. Helpful Ratings vs Num. Total Ratings\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79ea84a9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {