{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Steam review funniness / helpfulness regression\n",
    "\n",
    "Builds a 1000-stem bag-of-words representation of Australian Steam user reviews,\n",
    "then compares Ridge and Lasso regression against a predict-the-mean baseline for\n",
    "the log-transformed *funny* and *helpful* vote counts."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "import gzip\n",
    "import re\n",
    "import string\n",
    "from collections import defaultdict\n",
    "from datetime import datetime\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "from sklearn import linear_model\n",
    "from sklearn.metrics import mean_squared_error\n",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and clean the review data\n",
    "\n",
    "Each line of the gzipped dump is a Python-literal dict for one user. Vote-count\n",
    "strings are parsed into numbers and the per-user review lists are flattened into\n",
    "a single `reviews` list, skipping duplicate user entries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def parseData(fname):\n",
    "    \"\"\"Yield one record per line of a gzipped file of Python literals.\"\"\"\n",
    "    with gzip.open(fname) as f:\n",
    "        for line in f:\n",
    "            # literal_eval only parses Python literals; unlike eval() it cannot\n",
    "            # execute arbitrary code hidden in the (untrusted) data file.\n",
    "            yield ast.literal_eval(line.decode('utf-8'))\n",
    "\n",
    "data = list(parseData('australian_user_reviews.json.gz'))\n",
    "\n",
    "dm = [[0, 0], [0, 0]]  # 2x2 counts: [has funny votes][has helpful votes]\n",
    "\n",
    "users = set()\n",
    "games = set()\n",
    "nodate = 0    # reviews whose 'posted' string cannot be parsed (no year)\n",
    "reviews = []  # flattened list of all reviews, one dict per review\n",
    "\n",
    "for user in data:\n",
    "    if user['user_id'] in users:\n",
    "        continue  # skip duplicate user entries entirely\n",
    "    users.add(user['user_id'])\n",
    "    for review in user['reviews']:\n",
    "        games.add(review['item_id'])\n",
    "\n",
    "        # 'funny' looks like '3 people found this review funny', or '' if none.\n",
    "        funny = review['funny']\n",
    "        hasfunny = int(funny != '')\n",
    "        if funny == '':\n",
    "            review['funny'] = 0\n",
    "        else:\n",
    "            # drop thousands separators so '1,024' parses as 1024, not 1\n",
    "            review['funny'] = int(re.findall(r'\\d+', funny.replace(',', ''))[0])\n",
    "\n",
    "        # 'helpful' looks like '2 of 3 people (67%) found this review helpful'.\n",
    "        helpful = review['helpful']\n",
    "        hashelpful = int(helpful != 'No ratings yet')\n",
    "        if helpful == 'No ratings yet':\n",
    "            review['helpful_n'] = 0\n",
    "            review['helpful_total'] = 0\n",
    "            review['helpful'] = 0\n",
    "        else:\n",
    "            nums = re.findall(r'\\d+', helpful.replace(',', ''))\n",
    "            review['helpful'] = float(nums[0]) / float(nums[1])\n",
    "            review['helpful_n'] = float(nums[0])\n",
    "            review['helpful_total'] = float(nums[1])\n",
    "\n",
    "        dm[hasfunny][hashelpful] += 1\n",
    "\n",
    "        try:\n",
    "            review['posted'] = datetime.strptime(review['posted'], 'Posted %B %d, %Y.')\n",
    "        except ValueError:  # some entries omit the year, e.g. 'Posted July 3.'\n",
    "            nodate += 1\n",
    "\n",
    "        review['user_id'] = user['user_id']\n",
    "        review['user_url'] = user['user_url']\n",
    "        reviews.append(review)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Build the bag-of-words vocabulary\n",
    "\n",
    "Reviews are lowercased, stripped of punctuation, and Porter-stemmed; the 1000\n",
    "most frequent stems become the feature vocabulary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "wordCount = defaultdict(int)\n",
    "punctuation = set(string.punctuation)\n",
    "stemmer = PorterStemmer()\n",
    "\n",
    "for review in reviews:\n",
    "    # lowercase, strip punctuation, stem: the vocabulary is built from stems\n",
    "    r = ''.join(c for c in review['review'].lower() if c not in punctuation)\n",
    "    for w in r.split():\n",
    "        wordCount[stemmer.stem(w)] += 1\n",
    "\n",
    "len(wordCount)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the 1000 most frequent stems as the bag-of-words vocabulary.\n",
    "counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)\n",
    "words = [w for _, w in counts[:1000]]\n",
    "wordId = dict(zip(words, range(len(words))))\n",
    "wordSet = set(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def feature(datum):\n",
    "    \"\"\"1001-dim bag-of-words vector: stem counts over `words` plus an offset.\"\"\"\n",
    "    feat = [0] * len(words)\n",
    "    r = ''.join(c for c in datum['review'].lower() if c not in punctuation)\n",
    "    for w in r.split():\n",
    "        # stem before lookup -- the vocabulary was built from stems, so an\n",
    "        # unstemmed membership test would miss most vocabulary words\n",
    "        w = stemmer.stem(w)\n",
    "        if w in wordSet:  # set lookup instead of an O(|words|) list scan\n",
    "            feat[wordId[w]] += 1\n",
    "    feat.append(1)  # constant offset term\n",
    "    return feat"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train / test split and targets\n",
    "\n",
    "Targets are the log-transformed funny and helpful vote counts (zero counts map\n",
    "to 0, since log(0) is undefined)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_xy(dataset):\n",
    "    \"\"\"Features plus log-transformed funny / helpful vote-count targets.\"\"\"\n",
    "    X, y_funny, y_helpful = [], [], []\n",
    "    for review in dataset:\n",
    "        X.append(feature(review))\n",
    "        # log-transform the counts; zero counts map to 0 (log(0) is undefined)\n",
    "        y_funny.append(np.log(review['funny']) if review['funny'] else 0)\n",
    "        y_helpful.append(np.log(review['helpful_n']) if review['helpful_n'] else 0)\n",
    "    return X, y_funny, y_helpful\n",
    "\n",
    "train, test = train_test_split(reviews, test_size=0.25, random_state=0)\n",
    "X_train, Y_funny_train, Y_helpful_train = build_xy(train)\n",
    "X_test, Y_funny_test, Y_helpful_test = build_xy(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Models\n",
    "\n",
    "Predict-the-mean baseline, then Ridge and Lasso across a range of\n",
    "regularization strengths; lower test MSE is better."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Baseline: always predict the training-set mean target.\n",
    "guess_mean_funny = np.mean(Y_funny_train)\n",
    "guess_mean_helpful = np.mean(Y_helpful_train)\n",
    "\n",
    "print('baseline',\n",
    "      mean_squared_error(Y_funny_test, [guess_mean_funny] * len(Y_funny_test)),\n",
    "      mean_squared_error(Y_helpful_test, [guess_mean_helpful] * len(Y_helpful_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def evaluate_regularization(model_cls, alphas):\n",
    "    \"\"\"Fit funny / helpful regressors at each alpha; print test-set MSEs.\"\"\"\n",
    "    for alpha in alphas:\n",
    "        model_funny = model_cls(alpha, fit_intercept=True)\n",
    "        model_funny.fit(X_train, Y_funny_train)\n",
    "\n",
    "        model_helpful = model_cls(alpha, fit_intercept=True)\n",
    "        model_helpful.fit(X_train, Y_helpful_train)\n",
    "\n",
    "        print(alpha,\n",
    "              mean_squared_error(Y_funny_test, model_funny.predict(X_test)),\n",
    "              mean_squared_error(Y_helpful_test, model_helpful.predict(X_test)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_regularization(linear_model.Ridge, [0.01, 0.1, 1, 10, 100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "evaluate_regularization(linear_model.Lasso, [0.01, 0.1, 1, 10, 100])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}