initial commit with hw3 methods

commit c760ea5e14

assignment1.ipynb  +358  (new file)
@@ -0,0 +1,358 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read Data"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# load data into dataset array\n",
"import gzip\n",
"from collections import defaultdict\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def readJSON(path):\n",
"    f = gzip.open(path, 'rt', encoding=\"utf-8\")\n",
"    f.readline()\n",
"    for l in f:\n",
"        d = eval(l)\n",
"        u = d['userID']\n",
"        g = d['gameID']\n",
"        yield u,g,d\n",
"\n",
"dataset = []\n",
"for l in readJSON(\"train.json.gz\"):\n",
"    dataset.append(l)\n",
"\n",
"for user,game,review in dataset:\n",
"    review[\"played\"] = 1"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# train test split\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"#train, valid = train_test_split(dataset, train_size=165000, random_state=0)\n",
"train = dataset[:165000]\n",
"valid = dataset[165000:]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Get negative labels in validation\n",
"import random\n",
"\n",
"def get_balanced_validation(dataset, valid):\n",
"\n",
"    all_games = set()\n",
"    user_played = defaultdict(set)\n",
"\n",
"    for user,game,review in dataset:\n",
"        all_games.add(review[\"gameID\"])\n",
"        user_played[review[\"userID\"]].add(review[\"gameID\"])\n",
"\n",
"    negative_valid = []\n",
"\n",
"    # pair every positive validation example with one game the user has not played\n",
"    for user,game,review in valid:\n",
"        not_played = all_games - user_played[user]\n",
"        new_game = random.choice(tuple(not_played))\n",
"        negative_valid.append((user, new_game, {\"played\": 0}))\n",
"\n",
"    return valid + negative_valid"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Utility Functions"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"def writePredictions(infile, outfile, model):\n",
"    # copy the header line, then write one prediction per (user, game) pair\n",
"    with open(outfile, 'w') as predictions:\n",
"        for l in open(infile):\n",
"            if l.startswith(\"userID\"):\n",
"                predictions.write(l)\n",
"                continue\n",
"            u,g = l.strip().split(',')\n",
"\n",
"            pred = model.predict(u,g)\n",
"\n",
"            _ = predictions.write(u + ',' + g + ',' + str(pred) + '\\n')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Play Predictor"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"class PlayPredictor:\n",
"    # Popularity baseline: predict \"played\" iff the game falls inside the set of\n",
"    # most popular games covering the top `threshold` fraction of all play events.\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, data, threshold=0.6): # data is an array of (user, game, review) tuples\n",
"        self.topGames = self.getTopGames(data, threshold)\n",
"\n",
"    def predict(self, user, game):\n",
"        return int(game in self.topGames)\n",
"\n",
"    def getTopGames(self, data, threshold):\n",
"        gameCount = defaultdict(int)\n",
"        totalPlayed = 0\n",
"\n",
"        for user,game,_ in data:\n",
"            gameCount[game] += 1\n",
"            totalPlayed += 1\n",
"\n",
"        mostPopular = [(gameCount[x], x) for x in gameCount]\n",
"        mostPopular.sort()\n",
"        mostPopular.reverse()\n",
"\n",
"        topGames = set()\n",
"        count = 0\n",
"        for ic, i in mostPopular:\n",
"            count += ic\n",
"            topGames.add(i)\n",
"            if count > totalPlayed * threshold: break\n",
"        return topGames"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"model = PlayPredictor()\n",
"model.fit(train, threshold=0.6)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"PlayPredictor accuracy: 0.6997699769976997\n"
]
}
],
"source": [
"error = 0\n",
"balanced_valid = get_balanced_validation(dataset, valid)\n",
"for user, game, review in balanced_valid:\n",
"    pred = model.predict(user, game)\n",
"    if pred != review[\"played\"]:\n",
"        error += 1\n",
"\n",
"print(f\"PlayPredictor accuracy: {1 - error / len(balanced_valid)}\")"
]
},
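{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, hypothetical sketch of how the popularity cutoff could be tuned: refit `PlayPredictor` for a few `threshold` values and keep the one with the best accuracy on the balanced validation set. It assumes only the names defined above (`train`, `dataset`, `valid`, `PlayPredictor`, `get_balanced_validation`); the `accuracy` helper and the candidate thresholds are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical tuning sweep (sketch); assumes the cells above have been run.\n",
"balanced_valid = get_balanced_validation(dataset, valid)\n",
"\n",
"def accuracy(m, pairs):\n",
"    return sum(m.predict(u, g) == r[\"played\"] for u, g, r in pairs) / len(pairs)\n",
"\n",
"best = None\n",
"for t in [0.4, 0.5, 0.6, 0.7, 0.8]:\n",
"    m = PlayPredictor()\n",
"    m.fit(train, threshold=t)\n",
"    acc = accuracy(m, balanced_valid)\n",
"    if best is None or acc > best[1]:\n",
"        best = (t, acc)\n",
"\n",
"print(best)"
]
},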
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"writePredictions(\"pairs_Played.csv\", \"predictions_Played.csv\", model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Time Predictor"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"class TimePredictor:\n",
"    # Bias-only model: predicted hours_transformed = alpha + betaU[user] + betaI[game],\n",
"    # fit by alternating closed-form updates with regularization strength l.\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, data, l=5.0, iters=200): # data is an array of (user, game, review) tuples\n",
"        reviewsPerUser = defaultdict(list)\n",
"        reviewsPerItem = defaultdict(list)\n",
"\n",
"        globalAverage = 0\n",
"\n",
"        for user, game, review in data:\n",
"            reviewsPerUser[user].append(review)\n",
"            reviewsPerItem[game].append(review)\n",
"\n",
"            globalAverage += review[\"hours_transformed\"]\n",
"\n",
"        globalAverage /= len(data)\n",
"\n",
"        # initialize the biases at the per-user / per-game means\n",
"        betaU = {}\n",
"        betaI = {}\n",
"        for u in reviewsPerUser:\n",
"            reviews = [r[\"hours_transformed\"] for r in reviewsPerUser[u]]\n",
"            betaU[u] = np.mean(reviews)\n",
"\n",
"        for g in reviewsPerItem:\n",
"            reviews = [r[\"hours_transformed\"] for r in reviewsPerItem[g]]\n",
"            betaI[g] = np.mean(reviews)\n",
"\n",
"        alpha = globalAverage # Could initialize anywhere, this is a guess\n",
"\n",
"        for i in range(iters):\n",
"\n",
"            newAlpha = 0\n",
"            for user,game,review in data:\n",
"                newAlpha += review[\"hours_transformed\"] - (betaU[user] + betaI[game])\n",
"            alpha = newAlpha / len(data)\n",
"\n",
"            for user in reviewsPerUser:\n",
"                bu = 0\n",
"                for review in reviewsPerUser[user]:\n",
"                    item = review[\"gameID\"]\n",
"                    bu += review[\"hours_transformed\"] - (alpha + betaI[item])\n",
"                betaU[user] = bu / (l + len(reviewsPerUser[user]))\n",
"\n",
"            for item in reviewsPerItem:\n",
"                bi = 0\n",
"                for review in reviewsPerItem[item]:\n",
"                    user = review[\"userID\"]\n",
"                    bi += review[\"hours_transformed\"] - (alpha + betaU[user])\n",
"                betaI[item] = bi / (l + len(reviewsPerItem[item]))\n",
"\n",
"        self.alpha = alpha\n",
"        self.betaU = betaU\n",
"        self.betaI = betaI\n",
"\n",
"    def predict(self, user, game):\n",
"        # fall back to a bias of 0 for unseen users / games\n",
"        bu = 0\n",
"        bi = 0\n",
"\n",
"        if user in self.betaU:\n",
"            bu = self.betaU[user]\n",
"\n",
"        if game in self.betaI:\n",
"            bi = self.betaI[game]\n",
"\n",
"        return self.alpha + bu + bi"
]
},
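{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference, the alternating updates in `fit` above correspond to the closed-form coordinate updates of the bias-only model $\\hat{t}_{u,g} = \\alpha + \\beta_u + \\beta_g$, fit to `hours_transformed` with an $\\ell_2$ penalty $\\lambda$ (the argument `l`) on the biases:\n",
"\n",
"$$\\alpha = \\frac{1}{|\\mathcal{T}|} \\sum_{(u,g) \\in \\mathcal{T}} \\big(t_{u,g} - \\beta_u - \\beta_g\\big), \\qquad \\beta_u = \\frac{\\sum_{g \\in I_u} \\big(t_{u,g} - \\alpha - \\beta_g\\big)}{\\lambda + |I_u|}, \\qquad \\beta_g = \\frac{\\sum_{u \\in U_g} \\big(t_{u,g} - \\alpha - \\beta_u\\big)}{\\lambda + |U_g|},$$\n",
"\n",
"where $\\mathcal{T}$ is the training set, $I_u$ is the set of games played by user $u$, and $U_g$ is the set of users who played game $g$."
]
},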
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"def MSE(y, ypred):\n",
"    return mean_squared_error(y, ypred)\n",
"\n",
"model = TimePredictor()\n",
"model.fit(train, l=5.0, iters=200)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TimePredictor MSE: 2.990628028380304\n"
]
}
],
"source": [
"y = []\n",
"y_pred = []\n",
"for user, game, review in valid:\n",
"    y_pred.append(model.predict(user, game))\n",
"    y.append(review[\"hours_transformed\"])\n",
"\n",
"print(f\"TimePredictor MSE: {MSE(y, y_pred)}\")"
]
},
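{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal, hypothetical sketch of how the regularization strength could be chosen: refit `TimePredictor` for a few values of `l` and keep the one with the lowest MSE on the validation set. It assumes only the names defined above (`train`, `valid`, `TimePredictor`, `MSE`); the candidate values and the reduced iteration count are illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical regularization sweep (sketch); assumes the cells above have been run.\n",
"best = None\n",
"for reg in [1.0, 2.0, 5.0, 10.0]:\n",
"    m = TimePredictor()\n",
"    m.fit(train, l=reg, iters=100)\n",
"    y_true = [r[\"hours_transformed\"] for _, _, r in valid]\n",
"    preds = [m.predict(u, g) for u, g, _ in valid]\n",
"    mse = MSE(y_true, preds)\n",
"    if best is None or mse < best[1]:\n",
"        best = (reg, mse)\n",
"\n",
"print(best)"
]
},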
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"writePredictions(\"pairs_Hours.csv\", \"predictions_Hours.csv\", model)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
assignment1.py  +15  (new file)
@@ -0,0 +1,15 @@
# %%
import gzip
from collections import defaultdict
import json

# %%
dataset = []

for l in gzip.open("train.json.gz", 'rt', encoding='utf-8'):
    d = eval(l)
    dataset.append(d)

print(dataset[0])
pairs_Hours.csv  +10001  (new file; diff suppressed because it is too large)
pairs_Played.csv  +20001  (new file; diff suppressed because it is too large)
predictions_Hours.csv  +10001  (new file; diff suppressed because it is too large)
predictions_Played.csv  +20001  (new file; diff suppressed because it is too large)
train.json.gz  (new binary file; not shown)
writeup.txt  +0  (new, empty file)