Use BPR to predict play

2023-11-04 01:10:01 -07:00
parent c760ea5e14
commit aa423db398
3 changed files with 3971 additions and 3580 deletions
--- a/assignment1.ipynb
+++ b/assignment1.ipynb
@@ -9,19 +9,20 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load data into dataset array\n",
    "import gzip\n",
    "from collections import defaultdict\n",
-    "import numpy as np"
+    "import numpy as np\n",
+    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -44,7 +45,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -59,15 +60,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get negative labels in vaidation\n",
    "import random\n",
    "\n",
-    "def get_balanced_validation (dataset, valid):\n",
-    "\n",
+    "def get_balanced_set(dataset, s):\n",
    "    all_games = set()\n",
    "    user_played = defaultdict(set)\n",
    "\n",
@@ -75,14 +75,15 @@
    "        all_games.add(review[\"gameID\"])\n",
    "        user_played[review[\"userID\"]].add(review[\"gameID\"])\n",
    "\n",
-    "    negative_valid = []\n",
+    "    negative = []\n",
    "\n",
-    "    for user,game,review in valid:\n",
+    "    for user,game,review in s:\n",
    "        not_played = all_games - user_played[user]\n",
    "        new_game = random.choice(tuple(not_played))\n",
-    "        negative_valid.append((user, new_game, {\"played\": 0}))\n",
+    "        negative.append((user, new_game, {\"played\": 0}))\n",
    "\n",
-    "    return valid + negative_valid"
+    "    return s + negative\n",
+    "    "
   ]
  },
  {
@@ -94,7 +95,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -113,6 +114,48 @@
    "        predictions.close()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class BPRbatch(tf.keras.Model):\n",
+    "    def __init__(self, K, lamb, itemIDs, userIDs):\n",
+    "        super(BPRbatch, self).__init__()\n",
+    "        # Initialize variables\n",
+    "        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))\n",
+    "        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))\n",
+    "        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))\n",
+    "        # Regularization coefficient\n",
+    "        self.lamb = lamb\n",
+    "\n",
+    "    # Prediction for a single instance\n",
+    "    def predict(self, u, i):\n",
+    "        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)\n",
+    "        return p\n",
+    "\n",
+    "    # Regularizer\n",
+    "    def reg(self):\n",
+    "        return self.lamb * (tf.nn.l2_loss(self.betaI) +\\\n",
+    "                            tf.nn.l2_loss(self.gammaU) +\\\n",
+    "                            tf.nn.l2_loss(self.gammaI))\n",
+    "    \n",
+    "    def score(self, sampleU, sampleI):\n",
+    "        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)\n",
+    "        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)\n",
+    "        beta_i = tf.nn.embedding_lookup(self.betaI, i)\n",
+    "        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)\n",
+    "        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)\n",
+    "        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)\n",
+    "        return x_ui\n",
+    "\n",
+    "    def call(self, sampleU, sampleI, sampleJ):\n",
+    "        x_ui = self.score(sampleU, sampleI)\n",
+    "        x_uj = self.score(sampleU, sampleJ)\n",
+    "        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -122,7 +165,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -131,11 +174,60 @@
    "    def __init__(self):\n",
    "        pass\n",
    "\n",
-    "    def fit(self, data, threshold=0.6): # data is an array of (user, game, review) tuples\n",
+    "    def fit(self, data, threshold=0.6, K=5, iters=100): # data is an array of (user, game, review) tuples\n",
    "        self.topGames = self.getTopGames(threshold)\n",
    "\n",
-    "    def predict(self, user, game):\n",
-    "        return int(game in self.topGames)\n",
+    "        self.userIDs = {}\n",
+    "        self.itemIDs = {}\n",
+    "        interactions = []\n",
+    "\n",
+    "        for u,i,r in data:\n",
+    "            if not u in self.userIDs: self.userIDs[u] = len(self.userIDs)\n",
+    "            if not i in self.itemIDs: self.itemIDs[i] = len(self.itemIDs)\n",
+    "            interactions.append((u,i,r[\"played\"]))\n",
+    "        \n",
+    "        items = list(self.itemIDs.keys())\n",
+    "        \n",
+    "        itemsPerUser = defaultdict(list)\n",
+    "        usersPerItem = defaultdict(list)\n",
+    "        for u,i,r in interactions:\n",
+    "            itemsPerUser[u].append(i)\n",
+    "            usersPerItem[i].append(u)\n",
+    "\n",
+    "        def trainingStepBPR(model, interactions):\n",
+    "            Nsamples = 50000\n",
+    "            with tf.GradientTape() as tape:\n",
+    "                sampleU, sampleI, sampleJ = [], [], []\n",
+    "                for _ in range(Nsamples):\n",
+    "                    u,i,_ = random.choice(interactions) # positive sample\n",
+    "                    j = random.choice(items) # negative sample\n",
+    "                    while j in itemsPerUser[u]:\n",
+    "                        j = random.choice(items)\n",
+    "                    sampleU.append(self.userIDs[u])\n",
+    "                    sampleI.append(self.itemIDs[i])\n",
+    "                    sampleJ.append(self.itemIDs[j])\n",
+    "\n",
+    "                loss = model(sampleU,sampleI,sampleJ)\n",
+    "                loss += model.reg()\n",
+    "            gradients = tape.gradient(loss, model.trainable_variables)\n",
+    "            optimizer.apply_gradients((grad, var) for\n",
+    "                                    (grad, var) in zip(gradients, model.trainable_variables)\n",
+    "                                    if grad is not None)\n",
+    "            return loss.numpy()\n",
+    "        \n",
+    "        optimizer = tf.keras.optimizers.Adam(0.1)\n",
+    "        self.modelBPR = BPRbatch(K, 0.00001, self.itemIDs, self.userIDs)\n",
+    "\n",
+    "        for i in range(iters):\n",
+    "            obj = trainingStepBPR(self.modelBPR, interactions)\n",
+    "            if (i % 10 == 9): print(\"iteration \" + str(i+1) + \", objective = \" + str(obj))\n",
+    "            \n",
+    "    def predict(self, user, game, threshold=0.5):\n",
+    "        if user in self.userIDs and game in self.itemIDs:\n",
+    "            pred = self.modelBPR.predict(self.userIDs[user], self.itemIDs[game]).numpy()\n",
+    "            return int(pred > threshold)\n",
+    "        else:\n",
+    "            return int(game in self.topGames)\n",
    "\n",
    "    def getTopGames (self, threshold):\n",
    "        gameCount = defaultdict(int)\n",
@@ -155,37 +247,50 @@
    "            count += ic\n",
    "            return1.add(i)\n",
    "            if count > totalPlayed * threshold: break\n",
-    "        return return1"
+    "        return return1\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 33,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "model = PlayPredictor()\n",
-    "model.fit(train, threshold=0.6)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "PlayPredictor accuracy:  0.6997699769976997\n"
+      "iteration 10, objective = 0.51180786\n",
+      "iteration 20, objective = 0.48082852\n",
+      "iteration 30, objective = 0.47100148\n",
+      "iteration 40, objective = 0.45862892\n",
+      "iteration 50, objective = 0.45290428\n",
+      "iteration 60, objective = 0.44695023\n",
+      "iteration 70, objective = 0.4453482\n",
+      "iteration 80, objective = 0.444919\n",
+      "iteration 90, objective = 0.4451945\n",
+      "iteration 100, objective = 0.44311014\n",
+      "iteration 110, objective = 0.44101325\n",
+      "iteration 120, objective = 0.43727913\n",
+      "iteration 130, objective = 0.43938398\n",
+      "iteration 140, objective = 0.43788543\n",
+      "iteration 150, objective = 0.43573555\n",
+      "iteration 160, objective = 0.4379884\n",
+      "iteration 170, objective = 0.43852594\n",
+      "iteration 180, objective = 0.4391472\n",
+      "iteration 190, objective = 0.4318109\n",
+      "iteration 200, objective = 0.4389726\n",
+      "PlayPredictor accuracy:  0.7234723472347235\n"
     ]
    }
   ],
   "source": [
+    "model = PlayPredictor()\n",
+    "model.fit(train, K=6, iters=200)\n",
+    "\n",
    "error = 0\n",
-    "balanced_valid = get_balanced_validation(dataset, valid)\n",
+    "balanced_valid = get_balanced_set(dataset, valid)\n",
    "for user, game, review in balanced_valid:\n",
-    "    pred = model.predict(user, game)\n",
+    "    pred = model.predict(user, game, threshold=0.5)\n",
    "    if pred != review[\"played\"]:\n",
    "        error += 1\n",
    "\n",
@@ -194,7 +299,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -210,7 +315,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -287,22 +392,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from sklearn.metrics import mean_squared_error\n",
-    "\n",
-    "def MSE(y, ypred):\n",
-    "    return mean_squared_error(y, ypred)\n",
-    "\n",
-    "model = TimePredictor()\n",
-    "model.fit(train, l=5.0, iters=200)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
@@ -314,6 +404,14 @@
    }
   ],
   "source": [
+    "from sklearn.metrics import mean_squared_error\n",
+    "\n",
+    "def MSE(y, ypred):\n",
+    "    return mean_squared_error(y, ypred)\n",
+    "\n",
+    "model = TimePredictor()\n",
+    "model.fit(train)\n",
+    "\n",
    "y = []\n",
    "y_pred = []\n",
    "for user, game, review in valid:\n",
@@ -325,7 +423,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 42,
+   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
--- a/assignment1.py
+++ b/assignment1.py
@@ -1,15 +1,308 @@
+# %% [markdown]
+# ### Read Data
+
 # %%
+# load data into dataset array
 import gzip
 from collections import defaultdict
-import json
+import numpy as np
+import tensorflow as tf

 # %%
+def readJSON(path):
+    f = gzip.open(path, 'rt', encoding="utf-8")
+    f.readline()
+    for l in f:
+        d = eval(l)
+        u = d['userID']
+        g = d['gameID']
+        yield u,g,d
+
 dataset = []
+for l in readJSON("train.json.gz"):
+    dataset.append(l)

-for l in gzip.open("train.json.gz", 'rt', encoding='utf-8'):
-    d = eval(l)
-    dataset.append(d)
+for user,game,review in dataset:
+    review["played"] = 1

-print(dataset[0])
+# %%
+# train test split
+
+from sklearn.model_selection import train_test_split
+
+#train, valid = train_test_split(dataset, train_size=165000, random_state=0)
+train = dataset[:165000]
+valid = dataset[165000:]
+
+# %%
+# Get negative labels in validation
+import random
+
+def get_balanced_set(dataset, s):
+    all_games = set()
+    user_played = defaultdict(set)
+
+    for user,game,review in dataset:
+        all_games.add(review["gameID"])
+        user_played[review["userID"]].add(review["gameID"])
+
+    negative = []
+
+    for user,game,review in s:
+        not_played = all_games - user_played[user]
+        new_game = random.choice(tuple(not_played))
+        negative.append((user, new_game, {"played": 0}))
+
+    return s + negative
+    
+
+# %% [markdown]
+# ### Utility Functions
+
+# %%
+def writePredictions(infile, outfile, model):
+    with open(outfile, 'w') as predictions:
+        for l in open(infile):
+            if l.startswith("userID"):
+                predictions.write(l)
+                continue
+            u,g = l.strip().split(',')
+            
+            pred = model.predict(u,g)
+            
+            _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')
+
+        predictions.close()
+
+# %%
+class BPRbatch(tf.keras.Model):
+    def __init__(self, K, lamb, itemIDs, userIDs):
+        super(BPRbatch, self).__init__()
+        # Initialize variables
+        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)],stddev=0.001))
+        self.gammaU = tf.Variable(tf.random.normal([len(userIDs),K],stddev=0.001))
+        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs),K],stddev=0.001))
+        # Regularization coefficient
+        self.lamb = lamb
+
+    # Prediction for a single instance
+    def predict(self, u, i):
+        p = self.betaI[i] + tf.tensordot(self.gammaU[u], self.gammaI[i], 1)
+        return p
+
+    # Regularizer
+    def reg(self):
+        return self.lamb * (tf.nn.l2_loss(self.betaI) +\
+                            tf.nn.l2_loss(self.gammaU) +\
+                            tf.nn.l2_loss(self.gammaI))
+    
+    def score(self, sampleU, sampleI):
+        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)
+        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)
+        beta_i = tf.nn.embedding_lookup(self.betaI, i)
+        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)
+        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)
+        x_ui = beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)
+        return x_ui
+
+    def call(self, sampleU, sampleI, sampleJ):
+        x_ui = self.score(sampleU, sampleI)
+        x_uj = self.score(sampleU, sampleJ)
+        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))
+
+# %% [markdown]
+# ### Play Predictor
+
+# %%
+class PlayPredictor:
+
+    def __init__(self):
+        pass
+
+    def fit(self, data, threshold=0.6, K=5, iters=100): # data is an array of (user, game, review) tuples
+        self.topGames = self.getTopGames(threshold)
+
+        self.userIDs = {}
+        self.itemIDs = {}
+        interactions = []
+
+        for u,i,r in data:
+            if not u in self.userIDs: self.userIDs[u] = len(self.userIDs)
+            if not i in self.itemIDs: self.itemIDs[i] = len(self.itemIDs)
+            interactions.append((u,i,r["played"]))
+        
+        items = list(self.itemIDs.keys())
+        
+        itemsPerUser = defaultdict(list)
+        usersPerItem = defaultdict(list)
+        for u,i,r in interactions:
+            itemsPerUser[u].append(i)
+            usersPerItem[i].append(u)
+
+        def trainingStepBPR(model, interactions):
+            Nsamples = 50000
+            with tf.GradientTape() as tape:
+                sampleU, sampleI, sampleJ = [], [], []
+                for _ in range(Nsamples):
+                    u,i,_ = random.choice(interactions) # positive sample
+                    j = random.choice(items) # negative sample
+                    while j in itemsPerUser[u]:
+                        j = random.choice(items)
+                    sampleU.append(self.userIDs[u])
+                    sampleI.append(self.itemIDs[i])
+                    sampleJ.append(self.itemIDs[j])
+
+                loss = model(sampleU,sampleI,sampleJ)
+                loss += model.reg()
+            gradients = tape.gradient(loss, model.trainable_variables)
+            optimizer.apply_gradients((grad, var) for
+                                    (grad, var) in zip(gradients, model.trainable_variables)
+                                    if grad is not None)
+            return loss.numpy()
+        
+        optimizer = tf.keras.optimizers.Adam(0.1)
+        self.modelBPR = BPRbatch(K, 0.00001, self.itemIDs, self.userIDs)
+
+        for i in range(iters):
+            obj = trainingStepBPR(self.modelBPR, interactions)
+            if (i % 10 == 9): print("iteration " + str(i+1) + ", objective = " + str(obj))
+            
+    def predict(self, user, game, threshold=0.5):
+        if user in self.userIDs and game in self.itemIDs:
+            pred = self.modelBPR.predict(self.userIDs[user], self.itemIDs[game]).numpy()
+            return int(pred > threshold)
+        else:
+            return int(game in self.topGames)
+
+    def getTopGames (self, threshold):
+        gameCount = defaultdict(int)
+        totalPlayed = 0
+
+        for user,game,_ in readJSON("train.json.gz"):
+            gameCount[game] += 1
+            totalPlayed += 1
+
+        mostPopular = [(gameCount[x], x) for x in gameCount]
+        mostPopular.sort()
+        mostPopular.reverse()
+
+        return1 = set()
+        count = 0
+        for ic, i in mostPopular:
+            count += ic
+            return1.add(i)
+            if count > totalPlayed * threshold: break
+        return return1
+
+
+# %%
+model = PlayPredictor()
+model.fit(train, K=6, iters=200)
+
+error = 0
+balanced_valid = get_balanced_set(dataset, valid)
+for user, game, review in balanced_valid:
+    pred = model.predict(user, game, threshold=0.5)
+    if pred != review["played"]:
+        error += 1
+
+print(f"PlayPredictor accuracy: ", 1 - error / len(balanced_valid))
+
+# %%
+writePredictions("pairs_Played.csv", "predictions_Played.csv", model)
+
+# %% [markdown]
+# ### Time Predictor
+
+# %%
+from copy import copy
+
+class TimePredictor:
+    
+    def __init__(self):
+        pass
+
+    def fit(self, data, l=5.0, iters=200): # data is an array of (user, game, review) tuples
+        reviewsPerUser = defaultdict(list)
+        reviewsPerItem = defaultdict(list)
+
+        globalAverage = 0
+
+        for user, game, review in data:
+            reviewsPerUser[user].append(review)
+            reviewsPerItem[game].append(review)
+
+            globalAverage += review["hours_transformed"]
+
+        globalAverage /= len(data)
+
+        betaU = {}
+        betaI = {}
+        for u in reviewsPerUser:
+            reviews = [r["hours_transformed"] for r in reviewsPerUser[u]]
+            betaU[u] = np.mean(reviews)
+
+        for g in reviewsPerItem:
+            reviews = [r["hours_transformed"] for r in reviewsPerItem[g]]
+            betaI[g] = np.mean(reviews)
+
+        alpha = globalAverage # Could initialize anywhere, this is a guess
+
+        for i in range(iters):
+
+            newAlpha = 0
+            for user,game,review in data:
+                newAlpha += review["hours_transformed"] - (betaU[user] + betaI[game])
+            alpha = newAlpha / len(data)
+
+            for user in reviewsPerUser:
+                bu = 0
+                for review in reviewsPerUser[user]:
+                    item = review["gameID"]
+                    bu += review["hours_transformed"] - (alpha + betaI[item])
+                betaU[user] = bu / (l + len(reviewsPerUser[user]))
+            
+            for item in reviewsPerItem:
+                bi = 0
+                for review in reviewsPerItem[item]:
+                    user = review["userID"]
+                    bi += review["hours_transformed"] - (alpha + betaU[user])
+                betaI[item] = bi / (l + len(reviewsPerItem[item]))
+        
+        self.alpha = alpha
+        self.betaU = betaU
+        self.betaI = betaI
+
+    def predict(self, user, game):
+        bu = 0
+        bi = 0
+
+        if user in self.betaU:
+            bu = self.betaU[user]
+        
+        if game in self.betaI:
+            bi = self.betaI[game]
+
+        return self.alpha + bu + bi
+
+# %%
+from sklearn.metrics import mean_squared_error
+
+def MSE(y, ypred):
+    return mean_squared_error(y, ypred)
+
+model = TimePredictor()
+model.fit(train)
+
+y = []
+y_pred = []
+for user, game, review in valid:
+    y_pred.append(model.predict(user, game))
+    y.append(review["hours_transformed"])
+
+print(f"TimePredictor MSE: {MSE(y, y_pred)}")
+
+# %%
+writePredictions("pairs_Hours.csv", "predictions_Hours.csv", model)


--- a/predictions_Played.csv
+++ b/predictions_Played.csv