small improvement by adding review length as feature
This commit is contained in:
parent
91ee829e60
commit
4cf85a15dd
@ -2,7 +2,7 @@
|
|||||||
"cells": [
|
"cells": [
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 31,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -15,7 +15,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 32,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -24,33 +24,22 @@
|
|||||||
" yield eval(l)\n",
|
" yield eval(l)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data = list(parseData(\"australian_user_reviews.json.gz\"))\n",
|
"data = list(parseData(\"australian_user_reviews.json.gz\"))\n",
|
||||||
"\n",
|
|
||||||
"dm = [[0,0],[0,0]]\n",
|
|
||||||
"\n",
|
|
||||||
"users = set()\n",
|
"users = set()\n",
|
||||||
"games = set()\n",
|
|
||||||
"\n",
|
|
||||||
"nodate = 0\n",
|
|
||||||
"\n",
|
|
||||||
"reviews = []\n",
|
"reviews = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for user in data:\n",
|
"for user in data:\n",
|
||||||
" if user[\"user_id\"] in users:\n",
|
" if user[\"user_id\"] in users:\n",
|
||||||
" #print(f\"ducplicate user skipped: {user['user_id']}\")\n",
|
|
||||||
" pass\n",
|
" pass\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" users.add(user[\"user_id\"])\n",
|
" users.add(user[\"user_id\"])\n",
|
||||||
" for review in user[\"reviews\"]:\n",
|
" for review in user[\"reviews\"]:\n",
|
||||||
" games.add(review[\"item_id\"])\n",
|
|
||||||
" funny = review[\"funny\"]\n",
|
" funny = review[\"funny\"]\n",
|
||||||
" hasfunny = int(funny != \"\")\n",
|
|
||||||
" if funny == \"\":\n",
|
" if funny == \"\":\n",
|
||||||
" review[\"funny\"] = 0\n",
|
" review[\"funny\"] = 0\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" review[\"funny\"] = int(re.findall(\"\\d+\", funny)[0])\n",
|
" review[\"funny\"] = int(re.findall(\"\\d+\", funny)[0])\n",
|
||||||
" \n",
|
" \n",
|
||||||
" helpful = review[\"helpful\"]\n",
|
" helpful = review[\"helpful\"]\n",
|
||||||
" hashelpful = int(helpful != \"No ratings yet\")\n",
|
|
||||||
" if helpful == \"No ratings yet\":\n",
|
" if helpful == \"No ratings yet\":\n",
|
||||||
" review[\"helpful_n\"] = 0\n",
|
" review[\"helpful_n\"] = 0\n",
|
||||||
" review[\"helpful_total\"] = 0\n",
|
" review[\"helpful_total\"] = 0\n",
|
||||||
@ -62,13 +51,11 @@
|
|||||||
" review[\"helpful_n\"] = float(nums[0])\n",
|
" review[\"helpful_n\"] = float(nums[0])\n",
|
||||||
" review[\"helpful_total\"] = float(nums[1])\n",
|
" review[\"helpful_total\"] = float(nums[1])\n",
|
||||||
" \n",
|
" \n",
|
||||||
" dm[hasfunny][hashelpful] += 1\n",
|
|
||||||
"\n",
|
|
||||||
" try:\n",
|
" try:\n",
|
||||||
" post_datetime = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n",
|
" post_datetime = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n",
|
||||||
" review[\"posted\"] = post_datetime\n",
|
" review[\"posted\"] = post_datetime\n",
|
||||||
" except:\n",
|
" except:\n",
|
||||||
" nodate += 1\n",
|
" pass\n",
|
||||||
"\n",
|
"\n",
|
||||||
" review[\"user_id\"] = user[\"user_id\"]\n",
|
" review[\"user_id\"] = user[\"user_id\"]\n",
|
||||||
" review[\"user_url\"] = user[\"user_url\"]\n",
|
" review[\"user_url\"] = user[\"user_url\"]\n",
|
||||||
@ -77,7 +64,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 3,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -86,7 +73,7 @@
|
|||||||
"97248"
|
"97248"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 3,
|
"execution_count": 33,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"output_type": "execute_result"
|
"output_type": "execute_result"
|
||||||
}
|
}
|
||||||
@ -110,7 +97,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 4,
|
"execution_count": 34,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -124,7 +111,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 5,
|
"execution_count": 35,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -134,13 +121,15 @@
|
|||||||
" for w in r.split():\n",
|
" for w in r.split():\n",
|
||||||
" if w in words:\n",
|
" if w in words:\n",
|
||||||
" feat[wordId[w]] += 1\n",
|
" feat[wordId[w]] += 1\n",
|
||||||
" feat.append(1) # offset\n",
|
"\n",
|
||||||
|
" feat.append(len(datum[\"review\"]))\n",
|
||||||
|
" \n",
|
||||||
" return feat"
|
" return feat"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 12,
|
"execution_count": 36,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
@ -180,7 +169,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 13,
|
"execution_count": 37,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
@ -201,18 +190,18 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 14,
|
"execution_count": 38,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"0.01 0.24665170508912013 0.7702414041912456\n",
|
"0.01 0.21632227671715168 0.6847807364903296\n",
|
||||||
"0.1 0.24578924150085898 0.7681419094613451\n",
|
"0.1 0.2156867944836758 0.6829965387241808\n",
|
||||||
"1 0.24248804203997093 0.7584811772506682\n",
|
"1 0.21316700811628655 0.6747810400313006\n",
|
||||||
"10 0.24888382029075776 0.7518311372299598\n",
|
"10 0.2161776145305841 0.6681779252365153\n",
|
||||||
"100 0.23060394844562843 0.6419885405134674\n"
|
"100 0.20723445731519957 0.5973124724751776\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
@ -223,10 +212,10 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"for C in Cs:\n",
|
"for C in Cs:\n",
|
||||||
"\n",
|
"\n",
|
||||||
" model1 = linear_model.Ridge(C, fit_intercept=True)\n",
|
" model1 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
|
||||||
" model1.fit(X_train, Y_funny_train)\n",
|
" model1.fit(X_train, Y_funny_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" model2 = linear_model.Ridge(C, fit_intercept=True)\n",
|
" model2 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
|
||||||
" model2.fit(X_train, Y_helpful_train)\n",
|
" model2.fit(X_train, Y_helpful_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" pred_funny_test = model1.predict(X_test)\n",
|
" pred_funny_test = model1.predict(X_test)\n",
|
||||||
@ -237,17 +226,17 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 15,
|
"execution_count": 39,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [
|
"outputs": [
|
||||||
{
|
{
|
||||||
"name": "stdout",
|
"name": "stdout",
|
||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"0.01 0.17730058785614386 0.539258189636067\n",
|
"0.01 0.17702951629340366 0.538690243296189\n",
|
||||||
"0.1 0.17818192454918605 0.543156420319067\n",
|
"0.1 0.177432503566242 0.5387345171140366\n",
|
||||||
"1 0.17818192454918605 0.557911382661004\n",
|
"1 0.17743138596037397 0.538778156304091\n",
|
||||||
"10 0.17818192454918605 0.557911382661004\n",
|
"10 0.17786269625555318 0.5396020974919651\n",
|
||||||
"100 0.17818192454918605 0.557911382661004\n"
|
"100 0.17818192454918605 0.557911382661004\n"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@ -257,10 +246,10 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"for C in Cs:\n",
|
"for C in Cs:\n",
|
||||||
"\n",
|
"\n",
|
||||||
" model1 = linear_model.Lasso(C, fit_intercept=True)\n",
|
" model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
|
||||||
" model1.fit(X_train, Y_funny_train)\n",
|
" model1.fit(X_train, Y_funny_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" model2 = linear_model.Lasso(C, fit_intercept=True)\n",
|
" model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
|
||||||
" model2.fit(X_train, Y_helpful_train)\n",
|
" model2.fit(X_train, Y_helpful_train)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" pred_funny_test = model1.predict(X_test)\n",
|
" pred_funny_test = model1.predict(X_test)\n",
|
||||||
@ -268,13 +257,6 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))"
|
" print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))"
|
||||||
]
|
]
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"metadata": {
|
"metadata": {
|
||||||
|
Reference in New Issue
Block a user