{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "494d6c25",
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "\n",
    "def parseData(fname):\n",
    "    \"\"\"Yield one parsed record per line of the gzip-compressed file `fname`.\n",
    "\n",
    "    Every line is evaluated as a Python expression.\n",
    "    \"\"\"\n",
    "    # The context manager closes the file when the generator finishes or is closed.\n",
    "    with gzip.open(fname) as f:\n",
    "        for l in f:\n",
    "            # SECURITY: eval() executes whatever code the file contains -- only\n",
    "            # use it on trusted data. If every record is a plain literal,\n",
    "            # ast.literal_eval is a safer replacement (verify before switching).\n",
    "            yield eval(l)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ca7ea536",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each record is indexed with user[\"reviews\"] by the cells below.\n",
    "data = list(parseData(\"australian_user_reviews.json.gz\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "eb772e3d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[29204, 21950], [964, 7187]]\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "\n",
    "# dm[hasfunny][hashelpful] counts reviews by whether the raw \"funny\" /\n",
    "# \"helpful\" fields carry a rating.\n",
    "dm = [[0,0],[0,0]]\n",
    "\n",
    "NUMBER = re.compile(r\"\\d+\")\n",
    "\n",
    "def parseCounts(text):\n",
    "    \"\"\"Return the integers found in `text`, ignoring thousands separators.\"\"\"\n",
    "    # Without stripping commas, \"1,234\" would be read as the two numbers 1 and 234.\n",
    "    return [int(n) for n in NUMBER.findall(text.replace(\",\", \"\"))]\n",
    "\n",
    "for user in data:\n",
    "    for review in user[\"reviews\"]:\n",
    "        # The \"funny\" / \"helpful\" keys are overwritten with numbers below, so the\n",
    "        # raw strings are kept under separate keys; this makes the cell safe to\n",
    "        # re-run without reloading the data.\n",
    "        funny = review.setdefault(\"funny_raw\", review[\"funny\"])\n",
    "        helpful = review.setdefault(\"helpful_raw\", review[\"helpful\"])\n",
    "\n",
    "        # An empty string is treated as a funny count of 0.\n",
    "        hasfunny = int(funny != \"\")\n",
    "        review[\"funny\"] = parseCounts(funny)[0] if hasfunny else 0\n",
    "\n",
    "        hashelpful = int(helpful != \"No ratings yet\")\n",
    "        if hashelpful:\n",
    "            # The first two numbers are used as numerator / denominator.\n",
    "            nums = parseCounts(helpful)\n",
    "            review[\"helpful\"] = float(nums[0]) / float(nums[1]) if nums[1] else 0.0\n",
    "        else:\n",
    "            review[\"helpful\"] = 0\n",
    "\n",
    "        dm[hasfunny][hashelpful] += 1\n",
    "\n",
    "print(dm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "72528b34",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "97248"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "from functools import lru_cache\n",
    "import string\n",
    "from nltk.stem.porter import PorterStemmer\n",
    "\n",
    "wordCount = defaultdict(int)\n",
    "punctuation = set(string.punctuation)\n",
    "stemmer = PorterStemmer()\n",
    "# Reviews are tokenized again when the features are built; caching avoids\n",
    "# stemming the same token more than once.\n",
    "stem = lru_cache(maxsize=None)(stemmer.stem)\n",
    "\n",
    "def tokenize(text):\n",
    "    \"\"\"Lower-case `text`, drop punctuation, split on whitespace and stem each token.\"\"\"\n",
    "    cleaned = ''.join([c for c in text.lower() if not c in punctuation])\n",
    "    return [stem(w) for w in cleaned.split()]\n",
    "\n",
    "# Count how often each stem occurs over all reviews.\n",
    "for user in data:\n",
    "    for review in user[\"reviews\"]:\n",
    "        for w in tokenize(review['review']):\n",
    "            wordCount[w] += 1\n",
    "\n",
    "len(wordCount)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "834dfe92",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Vocabulary: the 1000 first stems when ordered by descending (count, stem).\n",
    "counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)\n",
    "words = [x[1] for x in counts[:1000]]\n",
    "# Maps each vocabulary stem to its column index in the feature vector.\n",
    "wordId = dict(zip(words, range(len(words))))\n",
    "wordSet = set(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "b9b3142f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def feature(datum):\n",
    "    \"\"\"Return the bag-of-words vector for one review.\n",
    "\n",
    "    The result holds len(words) stem counts followed by a constant 1 (offset).\n",
    "    \"\"\"\n",
    "    feat = [0]*len(words)\n",
    "    # Tokens are stemmed exactly like the vocabulary was; unstemmed tokens\n",
    "    # would mostly fail to match the stemmed entries of `wordId`.\n",
    "    for w in tokenize(datum['review']):\n",
    "        idx = wordId.get(w)\n",
    "        if idx is not None:\n",
    "            feat[idx] += 1\n",
    "    feat.append(1) # offset\n",
    "    return feat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "c9b40320",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "# X: one feature row per review; Y1: funny counts; Y2: helpfulness ratios.\n",
    "X = []\n",
    "Y1 = []\n",
    "Y2 = []\n",
    "for user in data:\n",
    "    for review in user[\"reviews\"]:\n",
    "        X.append(feature(review))\n",
    "        Y1.append(review[\"funny\"])\n",
    "        Y2.append(review[\"helpful\"])\n",
    "\n",
    "X = np.array(X)\n",
    "Y1 = np.array(Y1)\n",
    "Y2 = np.array(Y2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "de506b44",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Baseline: always predict the mean of the target.\n",
    "guess_mean1 = np.mean(Y1)\n",
    "guess_mean2 = np.mean(Y2)\n",
    "\n",
    "print(\"baseline\", mean_squared_error(Y1, np.full(len(Y1), guess_mean1)), mean_squared_error(Y2, np.full(len(Y2), guess_mean2)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "442da10a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import linear_model\n",
    "\n",
    "# MSE on the same data the models are fitted on, for each regularization strength.\n",
    "for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n",
    "\n",
    "    model1 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
    "    model1.fit(X, Y1)\n",
    "\n",
    "    model2 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
    "    model2.fit(X, Y2)\n",
    "\n",
    "    predictions1 = model1.predict(X)\n",
    "    # Y2 has to be scored with model2, the model that was fitted on Y2.\n",
    "    predictions2 = model2.predict(X)\n",
    "\n",
    "    print(C, mean_squared_error(Y1, predictions1), mean_squared_error(Y2, predictions2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "90b2ad33",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn import linear_model\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "# Same evaluation as the Ridge cell, with L1-regularized models.\n",
    "for C in [1, 10, 100, 1000]:\n",
    "\n",
    "    model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
    "    model1.fit(X, Y1)\n",
    "\n",
    "    model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
    "    model2.fit(X, Y2)\n",
    "\n",
    "    predictions1 = model1.predict(X)\n",
    "    # Y2 has to be scored with model2, the model that was fitted on Y2.\n",
    "    predictions2 = model2.predict(X)\n",
    "\n",
    "    print(C, mean_squared_error(Y1, predictions1), mean_squared_error(Y2, predictions2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fafe9eec",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}