{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "from datetime import datetime\n", "import re\n", "import gzip" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def parseData(fname):\n", " for l in gzip.open(fname):\n", " yield eval(l)\n", "\n", "data = list(parseData(\"australian_user_reviews.json.gz\"))\n", "\n", "dm = [[0,0],[0,0]]\n", "\n", "users = set()\n", "games = set()\n", "\n", "nodate = 0\n", "\n", "reviews = []\n", "\n", "for user in data:\n", " if user[\"user_id\"] in users:\n", " #print(f\"ducplicate user skipped: {user['user_id']}\")\n", " pass\n", " else:\n", " users.add(user[\"user_id\"])\n", " for review in user[\"reviews\"]:\n", " games.add(review[\"item_id\"])\n", " funny = review[\"funny\"]\n", " hasfunny = int(funny != \"\")\n", " if funny == \"\":\n", " review[\"funny\"] = 0\n", " else:\n", " review[\"funny\"] = int(re.findall(\"\\d+\", funny)[0])\n", " \n", " helpful = review[\"helpful\"]\n", " hashelpful = int(helpful != \"No ratings yet\")\n", " if helpful == \"No ratings yet\":\n", " review[\"helpful_n\"] = 0\n", " review[\"helpful_total\"] = 0\n", " review[\"helpful\"] = 0\n", " else:\n", " nums = re.findall(\"\\d+\", helpful.replace(\",\", \"\"))\n", " helpfulness = float(nums[0]) / float(nums[1])\n", " review[\"helpful\"] = float(nums[0]) / float(nums[1])\n", " review[\"helpful_n\"] = float(nums[0])\n", " review[\"helpful_total\"] = float(nums[1])\n", " \n", " dm[hasfunny][hashelpful] += 1\n", "\n", " try:\n", " post_datetime = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n", " review[\"posted\"] = post_datetime\n", " except:\n", " nodate += 1\n", "\n", " review[\"user_id\"] = user[\"user_id\"]\n", " review[\"user_url\"] = user[\"user_url\"]\n", " reviews.append(review)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "97248" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import defaultdict\n", "import string\n", "from nltk.stem.porter import *\n", "\n", "wordCount = defaultdict(int)\n", "punctuation = set(string.punctuation)\n", "stemmer = PorterStemmer()\n", "for review in reviews:\n", "\tr = ''.join([c for c in review['review'].lower() if not c in punctuation])\n", "\tfor w in r.split():\n", "\t\tw = stemmer.stem(w)\n", "\t\twordCount[w] += 1\n", "\t\t\n", "len(wordCount)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "counts = [(wordCount[w], w) for w in wordCount]\n", "counts.sort()\n", "counts.reverse()\n", "words = [x[1] for x in counts[:1000]]\n", "wordId = dict(zip(words, range(len(words))))\n", "wordSet = set(words)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def feature(datum):\n", " feat = [0]*len(words)\n", " r = ''.join([c for c in datum['review'].lower() if not c in punctuation])\n", " for w in r.split():\n", " if w in words:\n", " feat[wordId[w]] += 1\n", " feat.append(1) # offset\n", " return feat" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "X = []\n", "Y1 = []\n", "Y2 = []\n", "for review in reviews:\n", " X.append(feature(review))\n", " Y1.append(review[\"funny\"])\n", " Y2.append(review[\"helpful_n\"])\n", "\n", "X = np.array(X)\n", "Y1 = np.array(Y1)\n", "Y2 = np.array(Y2)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "baseline 294.7309048565537 4.604634941766926\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error, mean_absolute_error\n", "guess_mean1 = np.mean(Y1)\n", "guess_mean2 = np.mean(Y2)\n", "\n", "print(\"baseline\", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_absolute_error(Y2, [guess_mean2]*len(Y2)))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.001 282.2541467007739 4.179655704428717\n", "0.01 282.25415246942805 4.179600740282743\n", "0.1 282.2546345232787 4.179072864682249\n", "1 282.2692492511399 4.175349141167781\n", "10 282.3721909589884 4.147935437500891\n", "100 283.13132181376034 3.9883973026815065\n", "1000 286.86570062121467 3.620101916467935\n" ] } ], "source": [ "from sklearn import linear_model\n", "\n", "for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n", "\n", " model1 = linear_model.Ridge(C, fit_intercept=True)\n", " model1.fit(X, Y1)\n", "\n", " model2 = linear_model.Ridge(C, fit_intercept=True)\n", " model2.fit(X, Y2)\n", "\n", " predictions1 = model1.predict(X)\n", " predictions2 = model1.predict(X)\n", "\n", " print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1 294.7309048565537 3.2338225122785453\n", "10 294.7309048565537 3.2338225122785453\n", "100 294.7309048565537 3.2338225122785453\n", "1000 294.7309048565537 3.2338225122785453\n" ] } ], "source": [ "for C in [1, 10, 100, 1000]:\n", "\n", " model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model1.fit(X, Y1)\n", "\n", " model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model2.fit(X, Y2)\n", "\n", " predictions1 = model1.predict(X)\n", " predictions2 = model1.predict(X)\n", "\n", " print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }