2023-11-29 23:45:43 +00:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 1,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import matplotlib.pyplot as plt\n",
|
|
|
|
"from datetime import datetime\n",
|
|
|
|
"import re\n",
|
|
|
|
"import gzip"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 2,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def parseData(fname):
    """Yield one parsed record per line of a gzip-compressed text file.

    Every line is parsed as a single Python literal (dict, list, str,
    number, True/False/None).

    Args:
        fname: Path of the gzip file to read.

    Yields:
        The Python object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    import ast

    # Text mode with an explicit encoding: ast.literal_eval parses str, not bytes.
    # The context manager closes the file handle once iteration finishes or
    # the generator is discarded.
    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        for l in handle:
            # SECURITY: this used to be eval(l), which executes arbitrary code
            # found in the data file. literal_eval only builds literal values
            # and rejects anything else.
            yield ast.literal_eval(l)
|
|
|
|
"\n",
|
|
|
# Load every user record, then flatten the per-user review lists into
# `reviews`, normalising the "funny", "helpful" and "posted" fields in place.
data = list(parseData("australian_user_reviews.json.gz"))
users = set()
reviews = []

# Raw string: "\d" in a plain string literal is an invalid escape sequence.
_digits = re.compile(r"\d+")

for user in data:
    # Only the first record seen for a given user_id is used.
    if user["user_id"] in users:
        continue
    users.add(user["user_id"])

    for review in user["reviews"]:
        funny = review["funny"]
        if funny == "":
            review["funny"] = 0
        else:
            # Strip thousands separators first, exactly as the "helpful"
            # branch does; otherwise "1,234" would be read as 1.
            review["funny"] = int(_digits.findall(funny.replace(",", ""))[0])

        helpful = review["helpful"]
        if helpful == "No ratings yet":
            review["helpful_n"] = 0
            review["helpful_total"] = 0
            review["helpful"] = 0
        else:
            # The first two integers in the text are used as (votes, total).
            # NOTE(review): a total of 0 would raise ZeroDivisionError here;
            # confirm the data never contains one.
            nums = _digits.findall(helpful.replace(",", ""))
            helpful_n = float(nums[0])
            helpful_total = float(nums[1])
            review["helpful"] = helpful_n / helpful_total
            review["helpful_n"] = helpful_n
            review["helpful_total"] = helpful_total

        try:
            review["posted"] = datetime.strptime(review["posted"], 'Posted %B %d, %Y.')
        except (ValueError, TypeError):
            # Best effort: values that do not match the expected format keep
            # their original form instead of aborting the whole load.
            pass

        review["user_id"] = user["user_id"]
        review["user_url"] = user["user_url"]
        reviews.append(review)
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 3,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"97248"
|
|
|
|
]
|
|
|
|
},
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 3,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"from collections import defaultdict\n",
|
|
|
|
"import string\n",
|
|
|
|
"from nltk.stem.porter import *\n",
|
|
|
|
"\n",
|
|
|
# Shared text-processing state: the punctuation characters to drop and the
# stemmer applied to every token.
punctuation = set(string.punctuation)
stemmer = PorterStemmer()

# Number of occurrences of each stemmed token across all reviews.
wordCount = defaultdict(int)

for review in reviews:
    lowered = review['review'].lower()
    stripped = ''.join(ch for ch in lowered if ch not in punctuation)
    for token in stripped.split():
        wordCount[stemmer.stem(token)] += 1

# Cell output: number of distinct stems.
len(wordCount)
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 4,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
# Rank stems by (count, stem) in descending order; stems are unique dict keys,
# so the ordering has no ties.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)

# Vocabulary: the 1000 highest-ranked stems, plus lookup structures over it.
words = [stem for _, stem in counts[:1000]]
wordId = {stem: position for position, stem in enumerate(words)}
wordSet = set(words)
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 5,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
def feature(datum):
    """Build the feature vector for one review.

    The vector holds one occurrence count per vocabulary entry in ``words``
    (indexed through ``wordId``), followed by one extra element: the length
    in characters of the raw review text.

    Args:
        datum: Mapping whose 'review' entry is the review text.

    Returns:
        A list of ``len(words) + 1`` numbers.
    """
    feat = [0] * len(words)
    r = ''.join(c for c in datum['review'].lower() if c not in punctuation)
    for w in r.split():
        # The vocabulary was built from stemmed tokens, so tokens must be
        # stemmed before the lookup; without this only words that stemming
        # leaves unchanged could ever be counted.
        # The dict lookup also replaces a linear scan of the `words` list.
        idx = wordId.get(stemmer.stem(w))
        if idx is not None:
            feat[idx] += 1

    # Final feature: raw review length (punctuation and case untouched).
    feat.append(len(datum["review"]))

    return feat
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 6,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
2023-11-30 00:47:49 +00:00
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
2023-11-29 23:45:43 +00:00
|
|
|
"import numpy as np\n",
|
|
|
|
"\n",
|
2023-11-30 00:47:49 +00:00
|
|
def _log_count(count):
    """Return 0 for a zero count, otherwise the natural log of the count."""
    # NOTE(review): np.log(1) is also 0, so counts of 0 and 1 end up with the
    # same target value. np.log1p would keep them apart - confirm the intent
    # before changing, since it alters every reported error.
    return 0 if count == 0 else np.log(count)


def _build_split(split_reviews):
    """Return (features, funny targets, helpful targets) for the given reviews."""
    X, y_funny, y_helpful = [], [], []
    for review in split_reviews:
        X.append(feature(review))
        y_funny.append(_log_count(review["funny"]))
        y_helpful.append(_log_count(review["helpful_n"]))
    return X, y_funny, y_helpful


# Fixed random_state keeps the 75/25 split reproducible across runs.
train, test = train_test_split(reviews, test_size=0.25, random_state=0)

X_train, Y_funny_train, Y_helpful_train = _build_split(train)
X_test, Y_funny_test, Y_helpful_test = _build_split(test)
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 7,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2023-11-30 00:47:49 +00:00
|
|
|
"baseline 0.17818192454918605 0.557911382661004\n"
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
|
2023-11-30 00:47:49 +00:00
|
|
# Constant baseline: always predict the training-set mean of each target and
# report the mean squared error on the test targets.
guess_mean_funny = np.mean(Y_funny_train)
guess_mean_helpful = np.mean(Y_helpful_train)

baseline_funny_pred = [guess_mean_funny for _ in Y_funny_test]
baseline_helpful_pred = [guess_mean_helpful for _ in Y_helpful_test]

baseline_funny_mse = mean_squared_error(Y_funny_test, baseline_funny_pred)
baseline_helpful_mse = mean_squared_error(Y_helpful_test, baseline_helpful_pred)

print("baseline", baseline_funny_mse, baseline_helpful_mse)
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 8,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2023-12-04 18:22:45 +00:00
|
|
|
"0.01 0.21632227671897006 0.6847807364939246\n",
|
|
|
|
"0.1 0.21568679448554554 0.6829965387278908\n",
|
|
|
|
"1 0.21316700811782532 0.6747810400344023\n",
|
|
|
|
"10 0.21617761453133164 0.6681779252378663\n",
|
|
|
|
"100 0.20723445731522736 0.5973124724752429\n"
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"from sklearn import linear_model\n",
|
|
|
|
"\n",
|
2023-11-30 00:47:49 +00:00
|
|
# Ridge sweep: each value in Cs is passed as Ridge's `alpha`. For every value
# one model is fitted per target, and the mean squared error of each is
# measured on the test split.
Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:
    model1 = linear_model.Ridge(alpha=C, fit_intercept=True).fit(X_train, Y_funny_train)
    model2 = linear_model.Ridge(alpha=C, fit_intercept=True).fit(X_train, Y_helpful_train)

    pred_funny_test, pred_helpful_test = model1.predict(X_test), model2.predict(X_test)

    funny_mse = mean_squared_error(Y_funny_test, pred_funny_test)
    helpful_mse = mean_squared_error(Y_helpful_test, pred_helpful_test)
    print(C, funny_mse, helpful_mse)
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2023-12-04 18:22:45 +00:00
|
|
|
"execution_count": 9,
|
2023-11-29 23:45:43 +00:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
2023-11-30 18:30:48 +00:00
|
|
|
"0.01 0.17702951629340366 0.538690243296189\n",
|
|
|
|
"0.1 0.177432503566242 0.5387345171140366\n",
|
|
|
|
"1 0.17743138596037397 0.538778156304091\n",
|
2023-12-04 18:22:45 +00:00
|
|
|
"10 0.17786269625555318 0.539602097491965\n",
|
2023-11-30 00:47:49 +00:00
|
|
|
"100 0.17818192454918605 0.557911382661004\n"
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2023-11-30 00:47:49 +00:00
|
|
# Lasso sweep: each value in Cs is passed as Lasso's `alpha`. For every value
# one model is fitted per target, and the mean squared error of each is
# measured on the test split.
Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:
    model1 = linear_model.Lasso(alpha=C, fit_intercept=True).fit(X_train, Y_funny_train)
    model2 = linear_model.Lasso(alpha=C, fit_intercept=True).fit(X_train, Y_helpful_train)

    pred_funny_test, pred_helpful_test = model1.predict(X_test), model2.predict(X_test)

    funny_mse = mean_squared_error(Y_funny_test, pred_funny_test)
    helpful_mse = mean_squared_error(Y_helpful_test, pred_helpful_test)
    print(C, funny_mse, helpful_mse)
|
2023-11-29 23:45:43 +00:00
|
|
|
]
|
2023-12-04 18:22:45 +00:00
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 11,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"name": "stdout",
|
|
|
|
"output_type": "stream",
|
|
|
|
"text": [
|
|
|
|
"-0.008666338118757945: you\n",
|
|
|
|
"-0.0033743830081447994: shit\n",
|
|
|
|
"0.002818066343305709: i\n",
|
|
|
|
"-0.0009495127061038011: 3\n",
|
|
|
|
"-0.0007321653144316716: it\n",
|
|
|
|
"0.0006758513650775456: bad\n",
|
|
|
|
"-0.0005113655228402811: nyan\n",
|
|
|
|
"-0.00045892382707229636: of\n",
|
|
|
|
"0.0003103226245059089: *review length*\n",
|
|
|
|
"-0.00026518360080170943: ͡°\n"
|
|
|
|
]
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
# Fit a Lasso on the helpfulness target, then list the ten coefficients with
# the largest absolute value, biggest first.
model = linear_model.Lasso(alpha=0.01, fit_intercept=True)
model.fit(X_train, Y_helpful_train)

magnitude_order = np.argsort(np.abs(model.coef_))
idxs = magnitude_order[-10:][::-1]

for idx in idxs:
    if idx >= len(words):
        # Index past the vocabulary: the extra length feature appended by feature().
        print(f"{model.coef_[idx]}: *review length*")
    else:
        print(f"{model.coef_[idx]}: {words[idx]}")
|
|
|
|
]
|
2023-11-29 23:45:43 +00:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.11.6"
|
|
|
|
},
|
|
|
|
"orig_nbformat": 4
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 2
|
|
|
|
}
|