# Preliminary bag-of-words regressions on the "funny" and "helpful" scores of
# the reviews in australian_user_reviews.json.gz.
#
# Provenance: notebook preliminary.ipynb, added in commit
# 85861c572eda6f5e60c46831719dc76d1ba1003e (Arthur Lu, Tue Nov 28 15:58:51
# 2023 -0800, "add some preliminary results").  The same commit adds the
# binary dataset file that this script reads.
#
# NOTE(review): the outputs stored in the notebook were produced before the
# fixes below, so they will not be reproduced exactly:
#   * both regression cells scored the "helpful" target with the "funny"
#     model (predictions2 came from model1), which is why the stored Ridge
#     MSE for "helpful" (~111) is worse than the mean baseline (~104);
#   * feature() looked up unstemmed tokens in a stemmed vocabulary.
import ast
import functools
import gzip
import re
import string
from collections import defaultdict

import numpy as np
from nltk.stem.porter import PorterStemmer
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

DATA_PATH = "australian_user_reviews.json.gz"
VOCAB_SIZE = 1000
NO_HELPFUL_RATINGS = "No ratings yet"

# A run of digits, optionally grouped with thousands separators ("1,234").
_NUMBER = re.compile(r"\d+(?:,\d{3})*")


def parseData(fname):
    """Yield one parsed record per line of the gzip-compressed file *fname*.

    Each line is read as a Python literal.  The original code used eval(),
    which executes arbitrary expressions found in the file; literal_eval
    only accepts literals (dicts, lists, strings, numbers, booleans, None),
    so a tampered data file cannot run code.
    """
    with gzip.open(fname, "rt", encoding="utf-8") as handle:
        for line in handle:
            yield ast.literal_eval(line)


def _numbers(text):
    """Return every integer found in *text*, in order of appearance."""
    return [int(match.replace(",", "")) for match in _NUMBER.findall(text)]


def _parse_funny(raw):
    """Convert the raw "funny" string to a count; the empty string means 0."""
    if raw == "":
        return 0
    return _numbers(raw)[0]


def _parse_helpful(raw):
    """Convert the raw "helpful" string to (first number) / (second number).

    "No ratings yet" maps to 0.  Presumably the two numbers are helpful votes
    and total votes -- TODO confirm against the dataset.

    NOTE(review): the stored baseline MSE for this target (~104) is impossible
    for a ratio confined to [0, 1], so some strings were being mis-parsed.
    Thousands separators splitting one number into two (e.g. "234 of 1,500"
    read as 234 / 1) are the presumed cause, hence _numbers() keeps grouped
    digits together -- confirm against the data.
    """
    if raw == NO_HELPFUL_RATINGS:
        return 0
    found = _numbers(raw)
    numerator, denominator = found[0], found[1]
    if denominator == 0:
        # Avoid ZeroDivisionError on a degenerate "x of 0" string.
        return 0.0
    return float(numerator) / float(denominator)


data = list(parseData(DATA_PATH))

# dm[hasfunny][hashelpful] counts reviews by whether each score is present.
# NOTE: this loop overwrites the raw strings in `data` with numbers, so it
# must run exactly once per load of `data`.
dm = [[0, 0], [0, 0]]
for user in data:
    for review in user["reviews"]:
        funny = review["funny"]
        helpful = review["helpful"]
        hasfunny = int(funny != "")
        hashelpful = int(helpful != NO_HELPFUL_RATINGS)
        review["funny"] = _parse_funny(funny)
        review["helpful"] = _parse_helpful(helpful)
        dm[hasfunny][hashelpful] += 1

print(dm)

punctuation = set(string.punctuation)
stemmer = PorterStemmer()


@functools.lru_cache(maxsize=None)
def _stem(word):
    """Return the Porter stem of *word*.

    Cached because every review is tokenised twice (vocabulary count and
    feature extraction); the key space is bounded by the corpus vocabulary.
    """
    return stemmer.stem(word)


def _tokens(text):
    """Lower-case *text*, drop punctuation, split on whitespace, and stem."""
    cleaned = "".join(c for c in text.lower() if c not in punctuation)
    return [_stem(w) for w in cleaned.split()]


wordCount = defaultdict(int)
for user in data:
    for review in user["reviews"]:
        for w in _tokens(review["review"]):
            wordCount[w] += 1

# Number of distinct stems in the corpus (a bare expression in the notebook).
print(len(wordCount))

# Most frequent stems first; ties fall back to descending word order because
# the sort key is the (count, word) tuple.
counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)
words = [w for _, w in counts[:VOCAB_SIZE]]
wordId = {w: i for i, w in enumerate(words)}
wordSet = set(words)


def feature(datum):
    """Return the bag-of-words count vector for one review.

    The vector has one slot per entry of `words` (indexed through `wordId`)
    followed by a constant 1 as an offset term.  Tokens are stemmed exactly
    as they were when the vocabulary was built, so they can actually match.
    """
    feat = [0] * len(words)
    for w in _tokens(datum["review"]):
        idx = wordId.get(w)  # O(1) lookup instead of scanning the list
        if idx is not None:
            feat[idx] += 1
    feat.append(1)  # offset
    return feat


all_reviews = [review for user in data for review in user["reviews"]]
X = np.array([feature(review) for review in all_reviews])
Y1 = np.array([review["funny"] for review in all_reviews])    # funny counts
Y2 = np.array([review["helpful"] for review in all_reviews])  # helpful score

# Baseline: always predict the mean of the target.
guess_mean1 = np.mean(Y1)
guess_mean2 = np.mean(Y2)

print(
    "baseline",
    mean_squared_error(Y1, np.full(len(Y1), guess_mean1)),
    mean_squared_error(Y2, np.full(len(Y2), guess_mean2)),
)

# Ridge regression.  MSE is measured on the training data itself.
for alpha in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:
    model1 = linear_model.Ridge(alpha=alpha, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Ridge(alpha=alpha, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model2.predict(X)  # was model1: scored the wrong model

    print(
        alpha,
        mean_squared_error(Y1, predictions1),
        mean_squared_error(Y2, predictions2),
    )

# Lasso regression, same protocol.
for alpha in [1, 10, 100, 1000]:
    model1 = linear_model.Lasso(alpha=alpha, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Lasso(alpha=alpha, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model2.predict(X)  # was model1: scored the wrong model

    print(
        alpha,
        mean_squared_error(Y1, predictions1),
        mean_squared_error(Y2, predictions2),
    )