try linear models (marginal improvement)
commit 45986a9568
parent 5b9a700123
linear_bow.ipynb | 151
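A reading aid before the diff: the changes below replace the old fit-on-everything cells with a held-out evaluation on log-transformed targets. The following is a minimal sketch of that pipeline for the "funny" target only, assuming the reviews list and feature(review) vectorizer defined in earlier cells of the notebook; log_or_zero is a hypothetical helper name for the zero-guarded log that the new cell writes out inline.

import numpy as np
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def log_or_zero(count):
    # Zero-guarded natural log used for both targets: 0 stays 0,
    # any positive count becomes log(count).
    return 0.0 if count == 0 else np.log(count)

# 75/25 split of the raw review dicts, as in the new data-prep cell.
train, test = train_test_split(reviews, test_size=0.25, random_state=0)
X_train = np.array([feature(r) for r in train])
X_test = np.array([feature(r) for r in test])
Y_funny_train = np.array([log_or_zero(r["funny"]) for r in train])
Y_funny_test = np.array([log_or_zero(r["funny"]) for r in test])

# Baseline: always predict the training-set mean of the log target.
baseline = np.full(len(Y_funny_test), Y_funny_train.mean())
print("baseline", mean_squared_error(Y_funny_test, baseline))

# Sweep the regularization strength on the test split; the notebook repeats
# the same loop with linear_model.Lasso.
for alpha in [0.01, 0.1, 1, 10, 100]:
    model = linear_model.Ridge(alpha, fit_intercept=True)
    model.fit(X_train, Y_funny_train)
    print(alpha, mean_squared_error(Y_funny_test, model.predict(X_test)))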
@@ -140,79 +140,63 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"import numpy as np\n",
"X = []\n",
"Y1 = []\n",
"Y2 = []\n",
"for review in reviews:\n",
" X.append(feature(review))\n",
" Y1.append(review[\"funny\"])\n",
" Y2.append(review[\"helpful_n\"])\n",
"\n",
"X = np.array(X)\n",
"Y1 = np.array(Y1)\n",
"Y2 = np.array(Y2)"
"X_train, X_test, Y_funny_train, Y_funny_test, Y_helpful_train, Y_helpful_test = [], [], [], [], [], []\n",
"\n",
"train, test = train_test_split(reviews, test_size=0.25, random_state=0)\n",
"\n",
"for review in train:\n",
" X_train.append(feature(review))\n",
" #Y1.append(review[\"funny\"])\n",
" if review[\"funny\"] == 0:\n",
" Y_funny_train.append(0)\n",
" else:\n",
" Y_funny_train.append(np.log(review[\"funny\"]))\n",
" #Y2.append(review[\"helpful_n\"])\n",
" if review[\"helpful_n\"] == 0:\n",
" Y_helpful_train.append(0)\n",
" else:\n",
" Y_helpful_train.append(np.log(review[\"helpful_n\"]))\n",
"\n",
"for review in test:\n",
" X_test.append(feature(review))\n",
" #Y1.append(review[\"funny\"])\n",
" if review[\"funny\"] == 0:\n",
" Y_funny_test.append(0)\n",
" else:\n",
" Y_funny_test.append(np.log(review[\"funny\"]))\n",
" #Y2.append(review[\"helpful_n\"])\n",
" if review[\"helpful_n\"] == 0:\n",
" Y_helpful_test.append(0)\n",
" else:\n",
" Y_helpful_test.append(np.log(review[\"helpful_n\"]))\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"baseline 294.7309048565537 4.604634941766926\n"
"baseline 0.17818192454918605 0.557911382661004\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"guess_mean1 = np.mean(Y1)\n",
"guess_mean2 = np.mean(Y2)\n",
"guess_mean_funny = np.mean(Y_funny_train)\n",
"guess_mean_helpful = np.mean(Y_helpful_train)\n",
"\n",
"print(\"baseline\", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_absolute_error(Y2, [guess_mean2]*len(Y2)))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.001 282.2541467007739 4.179655704428717\n",
"0.01 282.25415246942805 4.179600740282743\n",
"0.1 282.2546345232787 4.179072864682249\n",
"1 282.2692492511399 4.175349141167781\n",
"10 282.3721909589884 4.147935437500891\n",
"100 283.13132181376034 3.9883973026815065\n",
"1000 286.86570062121467 3.620101916467935\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n",
"\n",
" model1 = linear_model.Ridge(C, fit_intercept=True)\n",
" model1.fit(X, Y1)\n",
"\n",
" model2 = linear_model.Ridge(C, fit_intercept=True)\n",
" model2.fit(X, Y2)\n",
"\n",
" predictions1 = model1.predict(X)\n",
" predictions2 = model1.predict(X)\n",
"\n",
" print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
"print(\"baseline\", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))"
]
},
{
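Because both targets above are natural logs of counts, the baseline and model scores in this diff are mean squared errors in log space, not in raw rating counts. A rough, illustrative way to read the baseline figure printed above (my own arithmetic, not part of the notebook):

import numpy as np

mse_funny_baseline = 0.17818192454918605  # printed by the baseline cell above
rmse_log = np.sqrt(mse_funny_baseline)    # about 0.42 in log units
factor = np.exp(rmse_log)                 # about 1.5x typical multiplicative error on counts
print(rmse_log, factor)                   # loose reading only: zero counts were mapped to 0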
@@ -224,26 +208,65 @@
"name": "stdout",
"output_type": "stream",
"text": [
"1 294.7309048565537 3.2338225122785453\n",
"10 294.7309048565537 3.2338225122785453\n",
"100 294.7309048565537 3.2338225122785453\n",
"1000 294.7309048565537 3.2338225122785453\n"
"0.01 0.24665170508912013 0.7702414041912456\n",
"0.1 0.24578924150085898 0.7681419094613451\n",
"1 0.24248804203997093 0.7584811772506682\n",
"10 0.24888382029075776 0.7518311372299598\n",
"100 0.23060394844562843 0.6419885405134674\n"
]
}
],
"source": [
"for C in [1, 10, 100, 1000]:\n",
"from sklearn import linear_model\n",
"\n",
" model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
" model1.fit(X, Y1)\n",
"Cs = [0.01, 0.1, 1, 10, 100]\n",
"\n",
" model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
" model2.fit(X, Y2)\n",
"for C in Cs:\n",
"\n",
" predictions1 = model1.predict(X)\n",
" predictions2 = model1.predict(X)\n",
" model1 = linear_model.Ridge(C, fit_intercept=True)\n",
" model1.fit(X_train, Y_funny_train)\n",
"\n",
" print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
" model2 = linear_model.Ridge(C, fit_intercept=True)\n",
" model2.fit(X_train, Y_helpful_train)\n",
"\n",
" pred_funny_test = model1.predict(X_test)\n",
" pred_helpful_test = model2.predict(X_test)\n",
"\n",
" print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.01 0.17730058785614386 0.539258189636067\n",
"0.1 0.17818192454918605 0.543156420319067\n",
"1 0.17818192454918605 0.557911382661004\n",
"10 0.17818192454918605 0.557911382661004\n",
"100 0.17818192454918605 0.557911382661004\n"
]
}
],
"source": [
"Cs = [0.01, 0.1, 1, 10, 100]\n",
"\n",
"for C in Cs:\n",
"\n",
" model1 = linear_model.Lasso(C, fit_intercept=True)\n",
" model1.fit(X_train, Y_funny_train)\n",
"\n",
" model2 = linear_model.Lasso(C, fit_intercept=True)\n",
" model2.fit(X_train, Y_helpful_train)\n",
"\n",
" pred_funny_test = model1.predict(X_test)\n",
" pred_helpful_test = model2.predict(X_test)\n",
"\n",
" print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))"
]
},
{
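The Ridge and Lasso sweeps above choose the regularization strength by inspecting test-set MSE directly. An alternative sketch, not what the notebook does: scikit-learn's RidgeCV and LassoCV pick alpha by cross-validation on the training split alone, so the test set is only touched once for the final score. This reuses X_train, Y_funny_train, X_test, and Y_funny_test from the cells above.

from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error

alphas = [0.01, 0.1, 1, 10, 100]

# Cross-validated alpha selection on the training data only.
ridge = RidgeCV(alphas=alphas).fit(X_train, Y_funny_train)
print("ridge", ridge.alpha_, mean_squared_error(Y_funny_test, ridge.predict(X_test)))

lasso = LassoCV(alphas=alphas, cv=5).fit(X_train, Y_funny_train)
print("lasso", lasso.alpha_, mean_squared_error(Y_funny_test, lasso.predict(X_test)))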
@@ -225,14 +225,6 @@
"plt.xlabel(\"Num. Total Ratings\")\n",
"plt.title(\"Num. Helpful Ratings vs Num. Total Ratings\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "79ea84a9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {