{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Read Data" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "# load data into dataset array\n", "import gzip\n", "from collections import defaultdict\n", "import numpy as np\n", "import tensorflow as tf" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "def readJSON(path):\n", " f = gzip.open(path, 'rt', encoding=\"utf-8\")\n", " f.readline()\n", " for l in f:\n", " d = eval(l)\n", " u = d['userID']\n", " g = d['gameID']\n", " yield u,g,d\n", "\n", "dataset = []\n", "for l in readJSON(\"train.json.gz\"):\n", " dataset.append(l)\n", "\n", "for user,game,review in dataset:\n", " review[\"played\"] = 1" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# train test split\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "#train, valid = train_test_split(dataset, train_size=165000, random_state=0)\n", "train = dataset[:165000]\n", "valid = dataset[165000:]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# Get negative labels in vaidation\n", "import random\n", "\n", "def get_balanced_set(dataset, s):\n", " all_games = set()\n", " user_played = defaultdict(set)\n", "\n", " for user,game,review in dataset:\n", " all_games.add(review[\"gameID\"])\n", " user_played[review[\"userID\"]].add(review[\"gameID\"])\n", "\n", " negative = []\n", "\n", " for user,game,review in s:\n", " not_played = all_games - user_played[user]\n", " new_game = random.choice(tuple(not_played))\n", " negative.append((user, new_game, {\"played\": 0}))\n", "\n", " return s + negative\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Utility Functions" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def writePredictions(infile, outfile, model):\n", " with open(outfile, 'w') as predictions:\n", " for l in open(infile):\n", " if l.startswith(\"userID\"):\n", " predictions.write(l)\n", " continue\n", " u,g = l.strip().split(',')\n", " \n", " pred = model.predict(u,g)\n", " \n", " _ = predictions.write(u + ',' + g + ',' + str(pred) + '\\n')\n", "\n", " predictions.close()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "class BPRbatch(tf.keras.Model):\n", " def __init__(self, K, lamb, itemIDs, userIDs):\n", " super(BPRbatch, self).__init__()\n", " # Initialize variables\n", " self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))\n", " self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))\n", " self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))\n", " # Regularization coefficient\n", " self.lamb = lamb\n", "\n", " # Prediction for a single instance\n", " def predict(self, u, i):\n", " bi = self.bi\n", " gu = self.gu\n", " gi = self.gi\n", " if u != None:\n", " gu = self.gammaU[u]\n", " if i != None:\n", " bi = self.betaI[i]\n", " gi = self.gammaI[i]\n", " p = bi + tf.tensordot(gu, gi, 1)\n", " return p\n", "\n", " # Regularizer\n", " def reg(self):\n", " return self.lamb * (tf.nn.l2_loss(self.betaI) +\\\n", " tf.nn.l2_loss(self.gammaU) +\\\n", " tf.nn.l2_loss(self.gammaI))\n", " \n", " def score(self, sampleU, sampleI):\n", " u = tf.convert_to_tensor(sampleU, dtype=tf.int32)\n", " i = tf.convert_to_tensor(sampleI, dtype=tf.int32)\n", " beta_i = tf.nn.embedding_lookup(self.betaI, i)\n", " gamma_u = tf.nn.embedding_lookup(self.gammaU, u)\n", " gamma_i = tf.nn.embedding_lookup(self.gammaI, i)\n", " x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)\n", " return x_ui\n", "\n", " def call(self, sampleU, sampleI, sampleJ):\n", " x_ui = self.score(sampleU, sampleI)\n", " x_uj = self.score(sampleU, sampleJ)\n", " return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))\n", " \n", " def finalize(self):\n", " self.bi = np.average(self.betaI, axis=0)\n", " self.gu = np.average(self.gammaU, axis=0)\n", " self.gi = np.average(self.gammaI, axis=0)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Play Predictor" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "class PlayPredictor:\n", "\n", " def __init__(self):\n", " pass\n", "\n", " def fit(self, data, K=5, iters=100): # data is an array of (user, game, review) tuples\n", " self.userIDs = {}\n", " self.itemIDs = {}\n", " interactions = []\n", "\n", " for u,i,r in data:\n", " if not u in self.userIDs: self.userIDs[u] = len(self.userIDs)\n", " if not i in self.itemIDs: self.itemIDs[i] = len(self.itemIDs)\n", " interactions.append((u,i,r[\"played\"]))\n", " \n", " items = list(self.itemIDs.keys())\n", " \n", " itemsPerUser = defaultdict(list)\n", " usersPerItem = defaultdict(list)\n", " for u,i,r in interactions:\n", " itemsPerUser[u].append(i)\n", " usersPerItem[i].append(u)\n", "\n", " def trainingStepBPR(model, interactions):\n", " Nsamples = 50000\n", " with tf.GradientTape() as tape:\n", " sampleU, sampleI, sampleJ = [], [], []\n", " for _ in range(Nsamples):\n", " u,i,_ = random.choice(interactions) # positive sample\n", " j = random.choice(items) # negative sample\n", " while j in itemsPerUser[u]:\n", " j = random.choice(items)\n", " sampleU.append(self.userIDs[u])\n", " sampleI.append(self.itemIDs[i])\n", " sampleJ.append(self.itemIDs[j])\n", "\n", " loss = model(sampleU,sampleI,sampleJ)\n", " loss += model.reg()\n", " gradients = tape.gradient(loss, model.trainable_variables)\n", " optimizer.apply_gradients((grad, var) for\n", " (grad, var) in zip(gradients, model.trainable_variables)\n", " if grad is not None)\n", " return loss.numpy()\n", " \n", " optimizer = tf.keras.optimizers.Adam(0.1)\n", " self.modelBPR = BPRbatch(K, 0.00001, self.itemIDs, self.userIDs)\n", "\n", " for i in range(iters):\n", " obj = trainingStepBPR(self.modelBPR, interactions)\n", " if (i % 10 == 9): print(\"iteration \" + str(i+1) + \", objective = \" + str(obj))\n", "\n", " self.modelBPR.finalize()\n", " \n", " def predict(self, user, game, threshold=0.5):\n", " uid = None\n", " gid = None\n", " if user in self.userIDs:\n", " uid = self.userIDs[user]\n", " if game in self.itemIDs:\n", " gid = self.itemIDs[game]\n", " pred = self.modelBPR.predict(uid, gid).numpy()\n", " return int(pred > threshold)\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "iteration 10, objective = 0.5121787\n", "iteration 20, objective = 0.4860348\n", "iteration 30, objective = 0.4675451\n", "iteration 40, objective = 0.46167055\n", "iteration 50, objective = 0.45701832\n", "iteration 60, objective = 0.44749424\n", "iteration 70, objective = 0.44757926\n", "iteration 80, objective = 0.4452785\n", "iteration 90, objective = 0.4446122\n", "iteration 100, objective = 0.44039646\n", "iteration 110, objective = 0.44507992\n", "iteration 120, objective = 0.44116876\n", "iteration 130, objective = 0.4395796\n", "iteration 140, objective = 0.4408364\n", "iteration 150, objective = 0.44295114\n", "iteration 160, objective = 0.43921968\n", "iteration 170, objective = 0.44189137\n", "iteration 180, objective = 0.43661243\n", "iteration 190, objective = 0.43899748\n", "iteration 200, objective = 0.4371943\n" ] } ], "source": [ "model = PlayPredictor()\n", "model.fit(train, K=6, iters=200)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[7044 2955]\n", " [2598 7401]]\n", "PlayPredictor accuracy: 0.7223222322232223\n" ] } ], "source": [ "CM = np.array([[0,0], [0,0]])\n", "balanced_valid = get_balanced_set(dataset, valid)\n", "for user, game, review in balanced_valid:\n", " pred = model.predict(user, game, threshold=0.5)\n", " CM[review[\"played\"]][pred] += 1\n", "\n", "print(CM)\n", "print(f\"PlayPredictor accuracy: \", 1 - (CM[1][0] + CM[0][1]) / len(balanced_valid))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "writePredictions(\"pairs_Played.csv\", \"predictions_Played.csv\", model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time Predictor" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from copy import copy\n", "\n", "class TimePredictor:\n", " \n", " def __init__(self):\n", " pass\n", "\n", " def fit(self, data, l=5.0, iters=200): # data is an array of (user, game, review) tuples\n", " reviewsPerUser = defaultdict(list)\n", " reviewsPerItem = defaultdict(list)\n", "\n", " globalAverage = 0\n", "\n", " for user, game, review in data:\n", " reviewsPerUser[user].append(review)\n", " reviewsPerItem[game].append(review)\n", "\n", " globalAverage += review[\"hours_transformed\"]\n", "\n", " globalAverage /= len(data)\n", "\n", " betaU = {}\n", " betaI = {}\n", " for u in reviewsPerUser:\n", " reviews = [r[\"hours_transformed\"] for r in reviewsPerUser[u]]\n", " betaU[u] = np.mean(reviews)\n", "\n", " for g in reviewsPerItem:\n", " reviews = [r[\"hours_transformed\"] for r in reviewsPerItem[g]]\n", " betaI[g] = np.mean(reviews)\n", "\n", " alpha = globalAverage # Could initialize anywhere, this is a guess\n", "\n", " for i in range(iters):\n", "\n", " newAlpha = 0\n", " for user,game,review in data:\n", " newAlpha += review[\"hours_transformed\"] - (betaU[user] + betaI[game])\n", " alpha = newAlpha / len(data)\n", "\n", " for user in reviewsPerUser:\n", " bu = 0\n", " for review in reviewsPerUser[user]:\n", " item = review[\"gameID\"]\n", " bu += review[\"hours_transformed\"] - (alpha + betaI[item])\n", " betaU[user] = bu / (l + len(reviewsPerUser[user]))\n", " \n", " for item in reviewsPerItem:\n", " bi = 0\n", " for review in reviewsPerItem[item]:\n", " user = review[\"userID\"]\n", " bi += review[\"hours_transformed\"] - (alpha + betaU[user])\n", " betaI[item] = bi / (l + len(reviewsPerItem[item]))\n", " \n", " self.alpha = alpha\n", " self.betaU = betaU\n", " self.betaI = betaI\n", "\n", " def predict(self, user, game):\n", " bu = 0\n", " bi = 0\n", "\n", " if user in self.betaU:\n", " bu = self.betaU[user]\n", " \n", " if game in self.betaI:\n", " bi = self.betaI[game]\n", "\n", " return self.alpha + bu + bi" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TimePredictor MSE: 2.990628028380304\n" ] } ], "source": [ "from sklearn.metrics import mean_squared_error\n", "\n", "def MSE(y, ypred):\n", " return mean_squared_error(y, ypred)\n", "\n", "model = TimePredictor()\n", "model.fit(train)\n", "\n", "y = []\n", "y_pred = []\n", "for user, game, review in valid:\n", " y_pred.append(model.predict(user, game))\n", " y.append(review[\"hours_transformed\"])\n", "\n", "print(f\"TimePredictor MSE: {MSE(y, y_pred)}\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "writePredictions(\"pairs_Hours.csv\", \"predictions_Hours.csv\", model)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }