This repository has been archived on 2023-12-08. You can view files and clone it, but cannot push or open issues or pull requests.
CSE-158-Assignment-2/linear_bow.ipynb

280 lines
7.5 KiB
Plaintext
Raw Normal View History

2023-11-29 23:45:43 +00:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from datetime import datetime\n",
"import re\n",
"import gzip"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def parseData(fname):\n",
"    \"\"\"Yield one parsed record per line of the gzip-compressed file `fname`.\n",
"\n",
"    Every line is passed to eval(), so the file must come from a trusted\n",
"    source.\n",
"    \"\"\"\n",
"    # The context manager closes the handle once the generator is exhausted\n",
"    # or discarded; previously the file was never closed explicitly.\n",
"    with gzip.open(fname) as lines:\n",
"        for l in lines:\n",
"            # SECURITY: eval() runs arbitrary code found in the file. If the\n",
"            # records are plain literals, ast.literal_eval is a safer choice.\n",
"            yield eval(l)\n",
"\n",
"data = list(parseData(\"australian_user_reviews.json.gz\"))\n",
"\n",
"# dm[hasfunny][hashelpful]: 2x2 tally of reviews by whether they carry a\n",
"# funny count and whether they carry a helpfulness rating.\n",
"dm = [[0,0],[0,0]]\n",
"\n",
"users = set()  # user_ids already processed\n",
"games = set()  # item_ids seen in processed reviews\n",
"\n",
"nodate = 0  # reviews whose posted string did not match the date format\n",
"\n",
"reviews = []  # flattened review dicts, cleaned in place\n",
"\n",
"for user in data:\n",
"    # A user_id that was already processed is skipped entirely.\n",
"    if user[\"user_id\"] in users:\n",
"        continue\n",
"    users.add(user[\"user_id\"])\n",
"\n",
"    for review in user[\"reviews\"]:\n",
"        games.add(review[\"item_id\"])\n",
"\n",
"        # An empty funny field becomes 0; otherwise the first run of\n",
"        # digits in the string is taken as the count.\n",
"        funny = review[\"funny\"]\n",
"        hasfunny = int(funny != \"\")\n",
"        if hasfunny:\n",
"            review[\"funny\"] = int(re.findall(r\"\\d+\", funny)[0])\n",
"        else:\n",
"            review[\"funny\"] = 0\n",
"\n",
"        helpful = review[\"helpful\"]\n",
"        hashelpful = int(helpful != \"No ratings yet\")\n",
"        if hashelpful:\n",
"            # Commas are stripped first so thousands separators do not split\n",
"            # a number; the first two integers are (helpful votes, total votes).\n",
"            nums = re.findall(r\"\\d+\", helpful.replace(\",\", \"\"))\n",
"            review[\"helpful\"] = float(nums[0]) / float(nums[1])\n",
"            review[\"helpful_n\"] = float(nums[0])\n",
"            review[\"helpful_total\"] = float(nums[1])\n",
"        else:\n",
"            review[\"helpful_n\"] = 0\n",
"            review[\"helpful_total\"] = 0\n",
"            review[\"helpful\"] = 0\n",
"\n",
"        dm[hasfunny][hashelpful] += 1\n",
"\n",
"        # Reviews whose date cannot be parsed keep their raw posted value\n",
"        # and are only counted. Only parsing failures are caught, so\n",
"        # unrelated errors are no longer silently swallowed.\n",
"        try:\n",
"            review[\"posted\"] = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n",
"        except (TypeError, ValueError):\n",
"            nodate += 1\n",
"\n",
"        review[\"user_id\"] = user[\"user_id\"]\n",
"        review[\"user_url\"] = user[\"user_url\"]\n",
"        reviews.append(review)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"97248"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from collections import defaultdict\n",
"import string\n",
"# Import only the name that is used instead of a wildcard import.\n",
"from nltk.stem.porter import PorterStemmer\n",
"\n",
"# Corpus-wide frequency of every stemmed token. Text is lowercased and\n",
"# stripped of punctuation before being split on whitespace.\n",
"wordCount = defaultdict(int)\n",
"punctuation = set(string.punctuation)\n",
"stemmer = PorterStemmer()\n",
"for review in reviews:\n",
"    r = ''.join([c for c in review['review'].lower() if c not in punctuation])\n",
"    for w in r.split():\n",
"        wordCount[stemmer.stem(w)] += 1\n",
"\n",
"# Vocabulary size, shown as the cell output.\n",
"len(wordCount)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Rank tokens by (count, token) in descending order and keep the 1000\n",
"# highest-ranked ones as the model vocabulary.\n",
"counts = sorted(((wordCount[w], w) for w in wordCount), reverse=True)\n",
"words = [token for count, token in counts[:1000]]\n",
"# Column index of each vocabulary token in the feature vector.\n",
"wordId = {token: position for position, token in enumerate(words)}\n",
"wordSet = set(words)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def feature(datum):\n",
"    \"\"\"Return the bag-of-words count vector for one review dict.\n",
"\n",
"    The text in datum['review'] is lowercased, stripped of punctuation,\n",
"    split on whitespace and stemmed. Each token present in `wordId`\n",
"    increments its column. A constant 1 is appended as an offset term, so\n",
"    the result has len(words) + 1 entries.\n",
"    \"\"\"\n",
"    feat = [0]*len(words)\n",
"    r = ''.join([c for c in datum['review'].lower() if c not in punctuation])\n",
"    for w in r.split():\n",
"        # The vocabulary was built from stemmed tokens, so stem before the\n",
"        # lookup; unstemmed tokens would fail to match their entries.\n",
"        # The dict lookup also avoids scanning the `words` list per token.\n",
"        idx = wordId.get(stemmer.stem(w))\n",
"        if idx is not None:\n",
"            feat[idx] += 1\n",
"    feat.append(1) # offset\n",
"    return feat"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# One feature row per review, plus the two regression targets:\n",
"# Y1 holds the funny counts, Y2 the helpful_n values.\n",
"X = np.array([feature(entry) for entry in reviews])\n",
"Y1 = np.array([entry[\"funny\"] for entry in reviews])\n",
"Y2 = np.array([entry[\"helpful_n\"] for entry in reviews])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"baseline 294.7309048565537 4.604634941766926\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error, mean_absolute_error\n",
"\n",
"# Constant-prediction baseline: always guess the mean of each target.\n",
"guess_mean1 = np.mean(Y1)\n",
"guess_mean2 = np.mean(Y2)\n",
"\n",
"constant1 = np.full(len(Y1), guess_mean1)\n",
"constant2 = np.full(len(Y2), guess_mean2)\n",
"\n",
"# Reported as MSE for Y1 and MAE for Y2.\n",
"print(\"baseline\", mean_squared_error(Y1, constant1), mean_absolute_error(Y2, constant2))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.001 282.2541467007739 4.179655704428717\n",
"0.01 282.25415246942805 4.179600740282743\n",
"0.1 282.2546345232787 4.179072864682249\n",
"1 282.2692492511399 4.175349141167781\n",
"10 282.3721909589884 4.147935437500891\n",
"100 283.13132181376034 3.9883973026815065\n",
"1000 286.86570062121467 3.620101916467935\n"
]
}
],
"source": [
"from sklearn import linear_model\n",
"\n",
"# Sweep the ridge penalty. Both errors are measured on the same data the\n",
"# models were fitted on. Printed columns: C, MSE on Y1, MAE on Y2.\n",
"for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:\n",
"\n",
"    model1 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
"    model1.fit(X, Y1)\n",
"\n",
"    model2 = linear_model.Ridge(alpha=C, fit_intercept=True)\n",
"    model2.fit(X, Y2)\n",
"\n",
"    predictions1 = model1.predict(X)\n",
"    # Y2 must be scored with the model fitted on Y2; model1 was used here\n",
"    # before, which reported the Y1 model's error against Y2.\n",
"    predictions2 = model2.predict(X)\n",
"\n",
"    print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 294.7309048565537 3.2338225122785453\n",
"10 294.7309048565537 3.2338225122785453\n",
"100 294.7309048565537 3.2338225122785453\n",
"1000 294.7309048565537 3.2338225122785453\n"
]
}
],
"source": [
"# Sweep the lasso penalty. Both errors are measured on the same data the\n",
"# models were fitted on. Printed columns: C, MSE on Y1, MAE on Y2.\n",
"for C in [1, 10, 100, 1000]:\n",
"\n",
"    model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
"    model1.fit(X, Y1)\n",
"\n",
"    model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n",
"    model2.fit(X, Y2)\n",
"\n",
"    predictions1 = model1.predict(X)\n",
"    # Y2 must be scored with the model fitted on Y2; model1 was used here\n",
"    # before, which reported the Y1 model's error against Y2.\n",
"    predictions2 = model2.predict(X)\n",
"\n",
"    print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}