{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from datetime import datetime\n", "import re\n", "import gzip" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def parseData(fname):\n", " for l in gzip.open(fname):\n", " yield eval(l)\n", "\n", "data = list(parseData(\"australian_user_reviews.json.gz\"))\n", "users = set()\n", "reviews = []\n", "\n", "for user in data:\n", " if user[\"user_id\"] in users:\n", " pass\n", " else:\n", " users.add(user[\"user_id\"])\n", " for review in user[\"reviews\"]:\n", " funny = review[\"funny\"]\n", " if funny == \"\":\n", " review[\"funny\"] = 0\n", " else:\n", " review[\"funny\"] = int(re.findall(\"\\d+\", funny)[0])\n", " \n", " helpful = review[\"helpful\"]\n", " if helpful == \"No ratings yet\":\n", " review[\"helpful_n\"] = 0\n", " review[\"helpful_total\"] = 0\n", " review[\"helpful\"] = 0\n", " else:\n", " nums = re.findall(\"\\d+\", helpful.replace(\",\", \"\"))\n", " helpfulness = float(nums[0]) / float(nums[1])\n", " review[\"helpful\"] = float(nums[0]) / float(nums[1])\n", " review[\"helpful_n\"] = float(nums[0])\n", " review[\"helpful_total\"] = float(nums[1])\n", " \n", " try:\n", " post_datetime = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n", " review[\"posted\"] = post_datetime\n", " except:\n", " pass\n", "\n", " review[\"user_id\"] = user[\"user_id\"]\n", " review[\"user_url\"] = user[\"user_url\"]\n", " reviews.append(review)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "97248" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "import string\n", "from nltk.stem.porter import *\n", "\n", "wordCount = defaultdict(int)\n", "punctuation = set(string.punctuation)\n", "stemmer = PorterStemmer()\n", "for review in reviews:\n", "\tr = ''.join([c for c in review['review'].lower() if not c in punctuation])\n", "\tfor w in r.split():\n", "\t\tw = stemmer.stem(w)\n", "\t\twordCount[w] += 1\n", "\t\t\n", "len(wordCount)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "counts = [(wordCount[w], w) for w in wordCount]\n", "counts.sort()\n", "counts.reverse()\n", "words = [x[1] for x in counts[:1000]]\n", "wordId = dict(zip(words, range(len(words))))\n", "wordSet = set(words)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def feature(datum):\n", " feat = [0]*len(words)\n", " r = ''.join([c for c in datum['review'].lower() if not c in punctuation])\n", " for w in r.split():\n", " if w in words:\n", " feat[wordId[w]] += 1\n", "\n", " feat.append(len(datum[\"review\"]))\n", " \n", " return feat" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "import numpy as np\n", "\n", "X_train, X_test, Y_funny_train, Y_funny_test, Y_helpful_train, Y_helpful_test = [], [], [], [], [], []\n", "\n", "train, test = train_test_split(reviews, test_size=0.25, random_state=0)\n", "\n", "for review in train:\n", " X_train.append(feature(review))\n", " #Y1.append(review[\"funny\"])\n", " if review[\"funny\"] == 0:\n", " Y_funny_train.append(0)\n", " else:\n", " Y_funny_train.append(np.log(review[\"funny\"]))\n", " #Y2.append(review[\"helpful_n\"])\n", " if 
review[\"helpful_n\"] == 0:\n", " Y_helpful_train.append(0)\n", " else:\n", " Y_helpful_train.append(np.log(review[\"helpful_n\"]))\n", "\n", "for review in test:\n", " X_test.append(feature(review))\n", " #Y1.append(review[\"funny\"])\n", " if review[\"funny\"] == 0:\n", " Y_funny_test.append(0)\n", " else:\n", " Y_funny_test.append(np.log(review[\"funny\"]))\n", " #Y2.append(review[\"helpful_n\"])\n", " if review[\"helpful_n\"] == 0:\n", " Y_helpful_test.append(0)\n", " else:\n", " Y_helpful_test.append(np.log(review[\"helpful_n\"]))\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "baseline 0.17818192454918605 0.557911382661004\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", "guess_mean_funny = np.mean(Y_funny_train)\n", "guess_mean_helpful = np.mean(Y_helpful_train)\n", "\n", "print(\"baseline\", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.01 0.21632227671897006 0.6847807364939246\n", "0.1 0.21568679448554554 0.6829965387278908\n", "1 0.21316700811782532 0.6747810400344023\n", "10 0.21617761453133164 0.6681779252378663\n", "100 0.20723445731522736 0.5973124724752429\n" ] } ], "source": [ "from sklearn import linear_model\n", "\n", "Cs = [0.01, 0.1, 1, 10, 100]\n", "\n", "for C in Cs:\n", "\n", " model1 = linear_model.Ridge(alpha=C, fit_intercept=True)\n", " model1.fit(X_train, Y_funny_train)\n", "\n", " model2 = linear_model.Ridge(alpha=C, fit_intercept=True)\n", " model2.fit(X_train, Y_helpful_train)\n", "\n", " pred_funny_test = model1.predict(X_test)\n", " pred_helpful_test = model2.predict(X_test)\n", "\n", " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.01 0.17702951629340366 0.538690243296189\n", "0.1 0.177432503566242 0.5387345171140366\n", "1 0.17743138596037397 0.538778156304091\n", "10 0.17786269625555318 0.539602097491965\n", "100 0.17818192454918605 0.557911382661004\n" ] } ], "source": [ "Cs = [0.01, 0.1, 1, 10, 100]\n", "\n", "for C in Cs:\n", "\n", " model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model1.fit(X_train, Y_funny_train)\n", "\n", " model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model2.fit(X_train, Y_helpful_train)\n", "\n", " pred_funny_test = model1.predict(X_test)\n", " pred_helpful_test = model2.predict(X_test)\n", "\n", " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "-0.008666338118757945: you\n", "-0.0033743830081447994: shit\n", "0.002818066343305709: i\n", "-0.0009495127061038011: 3\n", "-0.0007321653144316716: it\n", "0.0006758513650775456: bad\n", "-0.0005113655228402811: nyan\n", "-0.00045892382707229636: of\n", "0.0003103226245059089: *review length*\n", "-0.00026518360080170943: ͡°\n" ] } ], "source": [ "model = linear_model.Lasso(alpha=0.01, fit_intercept=True)\n", "model.fit(X_train, Y_helpful_train)\n", "idxs = 
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-0.008666338118757945: you\n",
      "-0.0033743830081447994: shit\n",
      "0.002818066343305709: i\n",
      "-0.0009495127061038011: 3\n",
      "-0.0007321653144316716: it\n",
      "0.0006758513650775456: bad\n",
      "-0.0005113655228402811: nyan\n",
      "-0.00045892382707229636: of\n",
      "0.0003103226245059089: *review length*\n",
      "-0.00026518360080170943: ͡°\n"
     ]
    }
   ],
   "source": [
    "# Inspect the ten largest-magnitude Lasso coefficients for the helpfulness\n",
    "# model; the final feature index corresponds to review length.\n",
    "model = linear_model.Lasso(alpha=0.01, fit_intercept=True)\n",
    "model.fit(X_train, Y_helpful_train)\n",
    "idxs = np.argsort(np.abs(model.coef_))[::-1][:10]\n",
    "\n",
    "for idx in idxs:\n",
    "    if idx < len(words):\n",
    "        print(f\"{model.coef_[idx]}: {words[idx]}\")\n",
    "    else:\n",
    "        print(f\"{model.coef_[idx]}: *review length*\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}