use BPR for predict play
This commit is contained in:
parent
c760ea5e14
commit
aa423db398
@ -9,19 +9,20 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"execution_count": 38,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load data into dataset array\n",
|
||||
"import gzip\n",
|
||||
"from collections import defaultdict\n",
|
||||
"import numpy as np"
|
||||
"import numpy as np\n",
|
||||
"import tensorflow as tf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"execution_count": 39,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -44,7 +45,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -59,15 +60,14 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 30,
|
||||
"execution_count": 41,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get negative labels in validation\n",
|
||||
"import random\n",
|
||||
"\n",
|
||||
"def get_balanced_validation (dataset, valid):\n",
|
||||
"\n",
|
||||
"def get_balanced_set(dataset, s):\n",
|
||||
" all_games = set()\n",
|
||||
" user_played = defaultdict(set)\n",
|
||||
"\n",
|
||||
@ -75,14 +75,15 @@
|
||||
" all_games.add(review[\"gameID\"])\n",
|
||||
" user_played[review[\"userID\"]].add(review[\"gameID\"])\n",
|
||||
"\n",
|
||||
" negative_valid = []\n",
|
||||
" negative = []\n",
|
||||
"\n",
|
||||
" for user,game,review in valid:\n",
|
||||
" for user,game,review in s:\n",
|
||||
" not_played = all_games - user_played[user]\n",
|
||||
" new_game = random.choice(tuple(not_played))\n",
|
||||
" negative_valid.append((user, new_game, {\"played\": 0}))\n",
|
||||
" negative.append((user, new_game, {\"played\": 0}))\n",
|
||||
"\n",
|
||||
" return valid + negative_valid"
|
||||
" return s + negative\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -94,7 +95,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 31,
|
||||
"execution_count": 42,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -113,6 +114,48 @@
|
||||
" predictions.close()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 43,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"class BPRbatch(tf.keras.Model):\n",
|
||||
" def __init__(self, K, lamb, itemIDs, userIDs):\n",
|
||||
" super(BPRbatch, self).__init__()\n",
|
||||
" # Initialize variables\n",
|
||||
" self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))\n",
|
||||
" self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))\n",
|
||||
" self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))\n",
|
||||
" # Regularization coefficient\n",
|
||||
" self.lamb = lamb\n",
|
||||
"\n",
|
||||
" # Prediction for a single instance\n",
|
||||
" def predict(self, u, i):\n",
|
||||
" p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)\n",
|
||||
" return p\n",
|
||||
"\n",
|
||||
" # Regularizer\n",
|
||||
" def reg(self):\n",
|
||||
" return self.lamb * (tf.nn.l2_loss(self.betaI) +\\\n",
|
||||
" tf.nn.l2_loss(self.gammaU) +\\\n",
|
||||
" tf.nn.l2_loss(self.gammaI))\n",
|
||||
" \n",
|
||||
" def score(self, sampleU, sampleI):\n",
|
||||
" u = tf.convert_to_tensor(sampleU, dtype=tf.int32)\n",
|
||||
" i = tf.convert_to_tensor(sampleI, dtype=tf.int32)\n",
|
||||
" beta_i = tf.nn.embedding_lookup(self.betaI, i)\n",
|
||||
" gamma_u = tf.nn.embedding_lookup(self.gammaU, u)\n",
|
||||
" gamma_i = tf.nn.embedding_lookup(self.gammaI, i)\n",
|
||||
" x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)\n",
|
||||
" return x_ui\n",
|
||||
"\n",
|
||||
" def call(self, sampleU, sampleI, sampleJ):\n",
|
||||
" x_ui = self.score(sampleU, sampleI)\n",
|
||||
" x_uj = self.score(sampleU, sampleJ)\n",
|
||||
" return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
@ -122,7 +165,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 32,
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -131,11 +174,60 @@
|
||||
" def __init__(self):\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" def fit(self, data, threshold=0.6): # data is an array of (user, game, review) tuples\n",
|
||||
" def fit(self, data, threshold=0.6, K=5, iters=100): # data is an array of (user, game, review) tuples\n",
|
||||
" self.topGames = self.getTopGames(threshold)\n",
|
||||
"\n",
|
||||
" def predict(self, user, game):\n",
|
||||
" return int(game in self.topGames)\n",
|
||||
" self.userIDs = {}\n",
|
||||
" self.itemIDs = {}\n",
|
||||
" interactions = []\n",
|
||||
"\n",
|
||||
" for u,i,r in data:\n",
|
||||
" if not u in self.userIDs: self.userIDs[u] = len(self.userIDs)\n",
|
||||
" if not i in self.itemIDs: self.itemIDs[i] = len(self.itemIDs)\n",
|
||||
" interactions.append((u,i,r[\"played\"]))\n",
|
||||
" \n",
|
||||
" items = list(self.itemIDs.keys())\n",
|
||||
" \n",
|
||||
" itemsPerUser = defaultdict(list)\n",
|
||||
" usersPerItem = defaultdict(list)\n",
|
||||
" for u,i,r in interactions:\n",
|
||||
" itemsPerUser[u].append(i)\n",
|
||||
" usersPerItem[i].append(u)\n",
|
||||
"\n",
|
||||
" def trainingStepBPR(model, interactions):\n",
|
||||
" Nsamples = 50000\n",
|
||||
" with tf.GradientTape() as tape:\n",
|
||||
" sampleU, sampleI, sampleJ = [], [], []\n",
|
||||
" for _ in range(Nsamples):\n",
|
||||
" u,i,_ = random.choice(interactions) # positive sample\n",
|
||||
" j = random.choice(items) # negative sample\n",
|
||||
" while j in itemsPerUser[u]:\n",
|
||||
" j = random.choice(items)\n",
|
||||
" sampleU.append(self.userIDs[u])\n",
|
||||
" sampleI.append(self.itemIDs[i])\n",
|
||||
" sampleJ.append(self.itemIDs[j])\n",
|
||||
"\n",
|
||||
" loss = model(sampleU,sampleI,sampleJ)\n",
|
||||
" loss += model.reg()\n",
|
||||
" gradients = tape.gradient(loss, model.trainable_variables)\n",
|
||||
" optimizer.apply_gradients((grad, var) for\n",
|
||||
" (grad, var) in zip(gradients, model.trainable_variables)\n",
|
||||
" if grad is not None)\n",
|
||||
" return loss.numpy()\n",
|
||||
" \n",
|
||||
" optimizer = tf.keras.optimizers.Adam(0.1)\n",
|
||||
" self.modelBPR = BPRbatch(K, 0.00001, self.itemIDs, self.userIDs)\n",
|
||||
"\n",
|
||||
" for i in range(iters):\n",
|
||||
" obj = trainingStepBPR(self.modelBPR, interactions)\n",
|
||||
" if (i % 10 == 9): print(\"iteration \" + str(i+1) + \", objective = \" + str(obj))\n",
|
||||
" \n",
|
||||
" def predict(self, user, game, threshold=0.5):\n",
|
||||
" if user in self.userIDs and game in self.itemIDs:\n",
|
||||
" pred = self.modelBPR.predict(self.userIDs[user], self.itemIDs[game]).numpy()\n",
|
||||
" return int(pred > threshold)\n",
|
||||
" else:\n",
|
||||
" return int(game in self.topGames)\n",
|
||||
"\n",
|
||||
" def getTopGames (self, threshold):\n",
|
||||
" gameCount = defaultdict(int)\n",
|
||||
@ -155,37 +247,50 @@
|
||||
" count += ic\n",
|
||||
" return1.add(i)\n",
|
||||
" if count > totalPlayed * threshold: break\n",
|
||||
" return return1"
|
||||
" return return1\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 33,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = PlayPredictor()\n",
|
||||
"model.fit(train, threshold=0.6)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 34,
|
||||
"execution_count": 45,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"PlayPredictor accuracy: 0.6997699769976997\n"
|
||||
"iteration 10, objective = 0.51180786\n",
|
||||
"iteration 20, objective = 0.48082852\n",
|
||||
"iteration 30, objective = 0.47100148\n",
|
||||
"iteration 40, objective = 0.45862892\n",
|
||||
"iteration 50, objective = 0.45290428\n",
|
||||
"iteration 60, objective = 0.44695023\n",
|
||||
"iteration 70, objective = 0.4453482\n",
|
||||
"iteration 80, objective = 0.444919\n",
|
||||
"iteration 90, objective = 0.4451945\n",
|
||||
"iteration 100, objective = 0.44311014\n",
|
||||
"iteration 110, objective = 0.44101325\n",
|
||||
"iteration 120, objective = 0.43727913\n",
|
||||
"iteration 130, objective = 0.43938398\n",
|
||||
"iteration 140, objective = 0.43788543\n",
|
||||
"iteration 150, objective = 0.43573555\n",
|
||||
"iteration 160, objective = 0.4379884\n",
|
||||
"iteration 170, objective = 0.43852594\n",
|
||||
"iteration 180, objective = 0.4391472\n",
|
||||
"iteration 190, objective = 0.4318109\n",
|
||||
"iteration 200, objective = 0.4389726\n",
|
||||
"PlayPredictor accuracy: 0.7234723472347235\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"model = PlayPredictor()\n",
|
||||
"model.fit(train, K=6, iters=200)\n",
|
||||
"\n",
|
||||
"error = 0\n",
|
||||
"balanced_valid = get_balanced_validation(dataset, valid)\n",
|
||||
"balanced_valid = get_balanced_set(dataset, valid)\n",
|
||||
"for user, game, review in balanced_valid:\n",
|
||||
" pred = model.predict(user, game)\n",
|
||||
" pred = model.predict(user, game, threshold=0.5)\n",
|
||||
" if pred != review[\"played\"]:\n",
|
||||
" error += 1\n",
|
||||
"\n",
|
||||
@ -194,7 +299,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 35,
|
||||
"execution_count": 46,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -210,7 +315,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 36,
|
||||
"execution_count": 47,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
@ -287,22 +392,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 40,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.metrics import mean_squared_error\n",
|
||||
"\n",
|
||||
"def MSE(y, ypred):\n",
|
||||
" return mean_squared_error(y, ypred)\n",
|
||||
"\n",
|
||||
"model = TimePredictor()\n",
|
||||
"model.fit(train, l=5.0, iters=200)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 41,
|
||||
"execution_count": 48,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -314,6 +404,14 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.metrics import mean_squared_error\n",
|
||||
"\n",
|
||||
"def MSE(y, ypred):\n",
|
||||
" return mean_squared_error(y, ypred)\n",
|
||||
"\n",
|
||||
"model = TimePredictor()\n",
|
||||
"model.fit(train)\n",
|
||||
"\n",
|
||||
"y = []\n",
|
||||
"y_pred = []\n",
|
||||
"for user, game, review in valid:\n",
|
||||
@ -325,7 +423,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 42,
|
||||
"execution_count": 49,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
|
303
assignment1.py
303
assignment1.py
@ -1,15 +1,308 @@
|
||||
# %% [markdown]
|
||||
# ### Read Data
|
||||
|
||||
# %%
|
||||
# load data into dataset array
|
||||
import gzip
|
||||
from collections import defaultdict
|
||||
import json
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
# %%
|
||||
def readJSON(path):
    """Yield ``(userID, gameID, record)`` tuples from a gzipped data file.

    Every line after the first is parsed as one Python literal mapping that
    must contain the keys ``'userID'`` and ``'gameID'``; the whole mapping is
    yielded as the third tuple element.

    NOTE(review): the first line of the file is read and discarded before
    parsing starts -- confirm that it really is a header and not a record.

    Raises:
        ValueError / SyntaxError: if a line is not a plain Python literal.
        KeyError: if a record lacks ``'userID'`` or ``'gameID'``.
    """
    import ast  # local import: nothing else in the file needs it

    # The context manager closes the file even when the consumer stops
    # iterating early or a line fails to parse.
    with gzip.open(path, 'rt', encoding="utf-8") as f:
        f.readline()
        for line in f:
            # literal_eval accepts dict/list/str/number literals but, unlike
            # eval(), cannot execute code embedded in the data file.
            d = ast.literal_eval(line)
            yield d['userID'], d['gameID'], d
|
||||
|
||||
# Load every record as a (userID, gameID, record) tuple.  All later cells
# unpack three values per entry, so only readJSON() output may go in here.
dataset = list(readJSON("train.json.gz"))

# Every record in the file is an observed interaction: label it positive.
for user, game, review in dataset:
    review["played"] = 1

print(dataset[0])
|
||||
# %%
|
||||
# train test split
|
||||
|
||||
from sklearn.model_selection import train_test_split
|
||||
|
||||
# Deterministic split: the first TRAIN_SIZE records train the models and the
# remainder is held out for validation.
# Shuffled alternative, kept for reference:
#   train, valid = train_test_split(dataset, train_size=165000, random_state=0)
TRAIN_SIZE = 165000
train = dataset[:TRAIN_SIZE]
valid = dataset[TRAIN_SIZE:]
|
||||
|
||||
# %%
|
||||
# Get negative labels in validation
|
||||
import random
|
||||
|
||||
def get_balanced_set(dataset, s):
    """Return ``s`` followed by one sampled negative example per entry of ``s``.

    Args:
        dataset: iterable of ``(user, game, review)`` tuples describing every
            known interaction; it defines the game catalogue and what each
            user has already played.
        s: iterable of ``(user, game, review)`` tuples to balance.

    Returns:
        A new list: the entries of ``s`` in order, then for each of them a
        tuple ``(user, new_game, {"played": 0})`` where ``new_game`` is drawn
        at random from the games that user has not played.  Users who have
        played every known game get no negative, because none exists.
    """
    all_games = set()
    user_played = defaultdict(set)

    for user, game, _ in dataset:
        all_games.add(game)
        user_played[user].add(game)

    # Materialise the catalogue once; random.choice needs a sequence.
    candidates = tuple(all_games)
    negative = []

    for user, _, _ in s:
        played = user_played.get(user, ())
        if len(played) >= len(candidates):
            # Nothing left to sample (also covers an empty catalogue).
            continue
        # Rejection sampling: redraw until the game is unplayed.  This avoids
        # building a fresh set difference and tuple for every single sample.
        new_game = random.choice(candidates)
        while new_game in played:
            new_game = random.choice(candidates)
        negative.append((user, new_game, {"played": 0}))

    return list(s) + negative
|
||||
|
||||
|
||||
# %% [markdown]
|
||||
# ### Utility Functions
|
||||
|
||||
# %%
|
||||
def writePredictions(infile, outfile, model):
    """Write ``model`` predictions for every pair listed in ``infile``.

    Args:
        infile: path of a text file with one ``user,game`` pair per line.  A
            line starting with ``"userID"`` is treated as the header and is
            copied to the output unchanged.
        outfile: path of the file to create; each pair is written back as
            ``user,game,prediction``.
        model: any object exposing ``predict(user, game)``.
    """
    # Both files are managed by the same ``with`` so neither handle leaks.
    # The input is opened first so a missing input does not truncate an
    # existing output file.
    with open(infile) as pairs, open(outfile, 'w') as predictions:
        for l in pairs:
            if l.startswith("userID"):
                predictions.write(l)
                continue

            l = l.strip()
            if not l:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue

            u, g = l.split(',')
            pred = model.predict(u, g)
            predictions.write(u + ',' + g + ',' + str(pred) + '\n')
|
||||
|
||||
# %%
|
||||
class BPRbatch(tf.keras.Model):
    """Latent-factor model trained with a Bayesian Personalized Ranking loss.

    Users and items are addressed by integer index.  The compatibility score
    of user ``u`` and item ``i`` is ``betaI[i] + gammaU[u] . gammaI[i]``.
    """

    def __init__(self, K, lamb, itemIDs, userIDs):
        """Create the parameters.

        Args:
            K: number of latent dimensions.
            lamb: regularization coefficient used by :meth:`reg`.
            itemIDs: sized collection of items; only ``len()`` is used.
            userIDs: sized collection of users; only ``len()`` is used.
        """
        super().__init__()
        # Small random initial values for every parameter.
        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))
        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))
        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))
        # Regularization coefficient
        self.lamb = lamb

    def predict(self, u, i):
        """Return the score tensor for one user index ``u`` and item index ``i``.

        NOTE(review): this name shadows ``tf.keras.Model.predict``; it is kept
        because callers rely on the ``predict(u, i)`` signature.
        """
        return self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)

    def reg(self):
        """Return ``lamb`` times the summed L2 loss of all parameters."""
        return self.lamb * (
            tf.nn.l2_loss(self.betaI)
            + tf.nn.l2_loss(self.gammaU)
            + tf.nn.l2_loss(self.gammaI)
        )

    def score(self, sampleU, sampleI):
        """Return the scores of index-aligned user/item batches.

        ``sampleU`` and ``sampleI`` are equally long sequences of integer
        indices; element ``k`` of the result scores the pair
        ``(sampleU[k], sampleI[k])``.
        """
        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
        beta_i = tf.nn.embedding_lookup(self.betaI, i)
        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
        # Row-wise dot product of the user and item factors.
        return beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)

    def call(self, sampleU, sampleI, sampleJ):
        """Return the mean BPR loss for (user, positive, negative) triples."""
        x_ui = self.score(sampleU, sampleI)
        x_uj = self.score(sampleU, sampleJ)
        # log_sigmoid is the numerically stable form of log(sigmoid(x)):
        # the naive composition underflows to -inf for large negative margins.
        return -tf.reduce_mean(tf.math.log_sigmoid(x_ui - x_uj))
|
||||
|
||||
# %% [markdown]
|
||||
# ### Play Predictor
|
||||
|
||||
# %%
|
||||
class PlayPredictor:
    """Predict whether a user played a game.

    Pairs whose user and game were both seen during :meth:`fit` are scored by
    a ``BPRbatch`` model; every other pair falls back to a popularity rule
    (``1`` when the game is in the set returned by :meth:`getTopGames`).
    """

    def __init__(self):
        # Populated by fit().  The empty defaults make predict() return the
        # popularity fallback (0) instead of raising when called before fit().
        self.topGames = set()
        self.userIDs = {}
        self.itemIDs = {}
        self.modelBPR = None

    def fit(self, data, threshold=0.6, K=5, iters=100,
            lamb=0.00001, learning_rate=0.1, n_samples=50000):
        """Fit the popularity fallback and the BPR model.

        Args:
            data: iterable of ``(user, game, review)`` tuples; every tuple is
                treated as a positive interaction.
            threshold: share of all plays the popular-game set must cover.
            K: number of latent dimensions of the BPR model.
            iters: number of training steps.
            lamb: regularization coefficient of the BPR model.
            learning_rate: Adam learning rate.
            n_samples: number of sampled triples per training step.
        """
        data = list(data)  # iterated more than once below

        # Popularity is computed from the data being fitted, so rows held out
        # by the caller cannot leak into the fallback rule.
        self.topGames = self.getTopGames(threshold, data)

        self.userIDs = {}
        self.itemIDs = {}
        self.modelBPR = None
        itemsPerUser = defaultdict(set)  # sets: O(1) membership when sampling

        for u, i, _ in data:
            if u not in self.userIDs:
                self.userIDs[u] = len(self.userIDs)
            if i not in self.itemIDs:
                self.itemIDs[i] = len(self.itemIDs)
            itemsPerUser[u].add(i)

        items = list(self.itemIDs)

        # A user who interacted with every item has no possible negative;
        # sampling such a user would never terminate, so leave them out.
        positives = [(u, i) for u, i, _ in data
                     if len(itemsPerUser[u]) < len(items)]
        if not positives:
            # Nothing to rank: predict() will use the popularity fallback.
            return

        optimizer = tf.keras.optimizers.Adam(learning_rate)
        self.modelBPR = BPRbatch(K, lamb, self.itemIDs, self.userIDs)

        for step in range(iters):
            obj = self._trainingStepBPR(optimizer, positives, items,
                                        itemsPerUser, n_samples)
            if step % 10 == 9:
                print("iteration " + str(step + 1) + ", objective = " + str(obj))

    def _trainingStepBPR(self, optimizer, positives, items, itemsPerUser,
                         n_samples):
        """Run one optimizer step on freshly sampled triples; return the loss."""
        sampleU, sampleI, sampleJ = [], [], []
        for _ in range(n_samples):
            u, i = random.choice(positives)  # positive sample
            j = random.choice(items)  # negative sample
            while j in itemsPerUser[u]:
                j = random.choice(items)
            sampleU.append(self.userIDs[u])
            sampleI.append(self.itemIDs[i])
            sampleJ.append(self.itemIDs[j])

        model = self.modelBPR
        with tf.GradientTape() as tape:
            loss = model(sampleU, sampleI, sampleJ)
            loss += model.reg()
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(
            (grad, var)
            for grad, var in zip(gradients, model.trainable_variables)
            if grad is not None
        )
        return loss.numpy()

    def predict(self, user, game, threshold=0.5):
        """Return ``1`` if ``user`` is predicted to have played ``game``, else ``0``.

        NOTE(review): ``threshold`` is compared with the raw model score, not
        with a probability -- confirm that 0.5 is the intended cut-off.
        """
        if (self.modelBPR is not None
                and user in self.userIDs and game in self.itemIDs):
            pred = self.modelBPR.predict(self.userIDs[user],
                                         self.itemIDs[game]).numpy()
            return int(pred > threshold)
        return int(game in self.topGames)

    def getTopGames(self, threshold, data=None):
        """Return the most-played games covering ``threshold`` of all plays.

        Games are taken in decreasing play count until their cumulative count
        exceeds ``threshold`` times the total number of interactions.

        Args:
            threshold: fraction of all interactions to cover.
            data: optional iterable of ``(user, game, review)`` tuples.  When
                omitted, the interactions are read from ``train.json.gz``.
        """
        if data is None:
            data = readJSON("train.json.gz")

        gameCount = defaultdict(int)
        totalPlayed = 0
        for _, game, _ in data:
            gameCount[game] += 1
            totalPlayed += 1

        mostPopular = sorted(((gameCount[g], g) for g in gameCount),
                             reverse=True)

        popular = set()
        count = 0
        for ic, g in mostPopular:
            count += ic
            popular.add(g)
            if count > totalPlayed * threshold:
                break
        return popular
|
||||
|
||||
|
||||
# %%
|
||||
model = PlayPredictor()
model.fit(train, K=6, iters=200)

# Validation positives plus one sampled negative per positive.
balanced_valid = get_balanced_set(dataset, valid)

# Count the pairs whose predicted label disagrees with the true one.
error = sum(
    model.predict(user, game, threshold=0.5) != review["played"]
    for user, game, review in balanced_valid
)

print(f"PlayPredictor accuracy: {1 - error / len(balanced_valid)}")

# %%
writePredictions("pairs_Played.csv", "predictions_Played.csv", model)
|
||||
|
||||
# %% [markdown]
|
||||
# ### Time Predictor
|
||||
|
||||
# %%
|
||||
from copy import copy
|
||||
|
||||
class TimePredictor:
    """Bias-only regressor for ``hours_transformed``.

    The prediction for a pair is ``alpha + betaU[user] + betaI[game]``; a
    user or game that was not seen during :meth:`fit` contributes ``0``.
    """

    def __init__(self):
        # Populated by fit(); the defaults let predict() run before fitting.
        self.alpha = 0.0
        self.betaU = {}
        self.betaI = {}

    def fit(self, data, l=5.0, iters=200):
        """Estimate the offset and biases by alternating closed-form updates.

        Args:
            data: iterable of ``(user, game, review)`` tuples where
                ``review["hours_transformed"]`` is the regression target.
            l: regularization strength added to each bias denominator.
            iters: number of alternating update sweeps.

        Raises:
            ValueError: if ``data`` is empty.
        """
        # Keep only what the updates need.  Ids come from the tuple, so the
        # review dict does not have to repeat them.
        samples = [(user, game, review["hours_transformed"])
                   for user, game, review in data]
        if not samples:
            raise ValueError("TimePredictor.fit() needs at least one sample")
        n = len(samples)

        hoursPerUser = defaultdict(list)  # user -> [(game, hours), ...]
        hoursPerItem = defaultdict(list)  # game -> [(user, hours), ...]
        for user, game, hours in samples:
            hoursPerUser[user].append((game, hours))
            hoursPerItem[game].append((user, hours))

        # Starting point: global mean and per-user / per-game mean hours.
        alpha = sum(hours for _, _, hours in samples) / n
        betaU = {user: np.mean([hours for _, hours in rows])
                 for user, rows in hoursPerUser.items()}
        betaI = {game: np.mean([hours for _, hours in rows])
                 for game, rows in hoursPerItem.items()}

        for _ in range(iters):
            # Each update uses the most recent values of the other terms.
            alpha = sum(hours - (betaU[user] + betaI[game])
                        for user, game, hours in samples) / n

            for user, rows in hoursPerUser.items():
                residual = sum(hours - (alpha + betaI[game])
                               for game, hours in rows)
                betaU[user] = residual / (l + len(rows))

            for game, rows in hoursPerItem.items():
                residual = sum(hours - (alpha + betaU[user])
                               for user, hours in rows)
                betaI[game] = residual / (l + len(rows))

        self.alpha = alpha
        self.betaU = betaU
        self.betaI = betaI

    def predict(self, user, game):
        """Return the predicted ``hours_transformed`` for ``(user, game)``."""
        return self.alpha + self.betaU.get(user, 0) + self.betaI.get(game, 0)
|
||||
|
||||
# %%
|
||||
from sklearn.metrics import mean_squared_error
|
||||
|
||||
def MSE(y, ypred):
    """Return the mean squared error between targets ``y`` and ``ypred``.

    Thin wrapper that forwards both arguments to ``mean_squared_error``.
    """
    return mean_squared_error(y, ypred)
|
||||
|
||||
model = TimePredictor()
model.fit(train)

# Targets and predictions for the held-out pairs, index-aligned.
y = [review["hours_transformed"] for _, _, review in valid]
y_pred = [model.predict(user, game) for user, game, _ in valid]

print(f"TimePredictor MSE: {MSE(y, y_pred)}")

# %%
writePredictions("pairs_Hours.csv", "predictions_Hours.csv", model)
|
||||
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user