This repository has been archived on 2023-11-19. You can view files and clone it, but cannot push or open issues or pull requests.
CSE-158-Assignment-1/assignment1.ipynb
2023-11-04 17:12:13 -07:00

469 lines
15 KiB
Plaintext

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Read Data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# load data into dataset array\n",
"import gzip\n",
"from collections import defaultdict\n",
"import numpy as np\n",
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"\n",
"def readJSON(path):\n",
"    \"\"\"Yield (userID, gameID, record) tuples from a gzipped file of\n",
"    Python dict literals, one per line. The first (header) line is skipped.\"\"\"\n",
"    with gzip.open(path, 'rt', encoding=\"utf-8\") as f:\n",
"        f.readline()  # skip header line\n",
"        for l in f:\n",
"            # Lines are Python dict literals, not JSON. literal_eval parses\n",
"            # them safely; eval() would execute arbitrary code from the file.\n",
"            d = ast.literal_eval(l)\n",
"            yield d['userID'], d['gameID'], d\n",
"\n",
"dataset = list(readJSON(\"train.json.gz\"))\n",
"\n",
"# Every observed interaction is a positive 'played' example.\n",
"for user, game, review in dataset:\n",
"    review[\"played\"] = 1"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# Train/validation split: first 165k interactions train, remaining ~10k validate.\n",
"# Deterministic slicing keeps results reproducible across kernel restarts.\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Randomized alternative (unused):\n",
"# train, valid = train_test_split(dataset, train_size=165000, random_state=0)\n",
"train = dataset[:165000]\n",
"valid = dataset[165000:]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Build a balanced evaluation set by pairing each positive with a negative.\n",
"import random\n",
"\n",
"def get_balanced_set(dataset, s):\n",
"    \"\"\"Return s extended with one negative example per entry of s: for each\n",
"    (user, game, review) in s, sample a game the user never played anywhere\n",
"    in `dataset` and label it {\"played\": 0}.\"\"\"\n",
"    all_games = set()\n",
"    user_played = defaultdict(set)\n",
"    for user, game, review in dataset:\n",
"        all_games.add(game)\n",
"        user_played[user].add(game)\n",
"\n",
"    negatives = []\n",
"    for user, game, review in s:\n",
"        # Uniformly sample from the games this user has not played.\n",
"        candidates = tuple(all_games - user_played[user])\n",
"        negatives.append((user, random.choice(candidates), {\"played\": 0}))\n",
"\n",
"    return s + negatives"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Utility Functions"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"def writePredictions(infile, outfile, model):\n",
"    \"\"\"Read (userID, gameID) pairs from infile and write one prediction per\n",
"    pair to outfile, copying the header line through unchanged.\"\"\"\n",
"    # Context-manage BOTH files: the original leaked the infile handle and\n",
"    # called close() on outfile after the with-block had already closed it.\n",
"    with open(infile) as pairs, open(outfile, 'w') as predictions:\n",
"        for l in pairs:\n",
"            if l.startswith(\"userID\"):\n",
"                predictions.write(l)  # header row\n",
"                continue\n",
"            u, g = l.strip().split(',')\n",
"            pred = model.predict(u, g)\n",
"            predictions.write(u + ',' + g + ',' + str(pred) + '\\n')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"class BPRbatch(tf.keras.Model):\n",
"    \"\"\"Bayesian Personalized Ranking model.\n",
"\n",
"    Learns an item bias (betaI) plus K-dimensional user/item latent factors\n",
"    (gammaU, gammaI) by minimizing the pairwise BPR objective.\n",
"    \"\"\"\n",
"\n",
"    def __init__(self, K, lamb, itemIDs, userIDs):\n",
"        super(BPRbatch, self).__init__()\n",
"        # Small random init keeps initial scores near zero.\n",
"        self.betaI = tf.Variable(tf.random.normal([len(itemIDs)], stddev=0.001))\n",
"        self.gammaU = tf.Variable(tf.random.normal([len(userIDs), K], stddev=0.001))\n",
"        self.gammaI = tf.Variable(tf.random.normal([len(itemIDs), K], stddev=0.001))\n",
"        self.lamb = lamb  # L2 regularization coefficient\n",
"\n",
"    def predict(self, u, i):\n",
"        \"\"\"Score a single (user index, item index) pair. Pass None for an\n",
"        unseen user/item to fall back on the averaged parameters.\n",
"\n",
"        NOTE(review): self.bi/gu/gi exist only after finalize() has run.\n",
"        \"\"\"\n",
"        bi = self.bi\n",
"        gu = self.gu\n",
"        gi = self.gi\n",
"        if u is not None:\n",
"            gu = self.gammaU[u]\n",
"        if i is not None:\n",
"            bi = self.betaI[i]\n",
"            gi = self.gammaI[i]\n",
"        return bi + tf.tensordot(gu, gi, 1)\n",
"\n",
"    def reg(self):\n",
"        \"\"\"L2 penalty over all trainable parameters.\"\"\"\n",
"        return self.lamb * (tf.nn.l2_loss(self.betaI) +\n",
"                            tf.nn.l2_loss(self.gammaU) +\n",
"                            tf.nn.l2_loss(self.gammaI))\n",
"\n",
"    def score(self, sampleU, sampleI):\n",
"        \"\"\"Vectorized scores for parallel lists of user/item indices.\"\"\"\n",
"        u = tf.convert_to_tensor(sampleU, dtype=tf.int32)\n",
"        i = tf.convert_to_tensor(sampleI, dtype=tf.int32)\n",
"        beta_i = tf.nn.embedding_lookup(self.betaI, i)\n",
"        gamma_u = tf.nn.embedding_lookup(self.gammaU, u)\n",
"        gamma_i = tf.nn.embedding_lookup(self.gammaI, i)\n",
"        return beta_i + tf.reduce_sum(tf.multiply(gamma_u, gamma_i), 1)\n",
"\n",
"    def call(self, sampleU, sampleI, sampleJ):\n",
"        \"\"\"BPR loss: -mean log sigmoid(score(u, i) - score(u, j)).\"\"\"\n",
"        x_ui = self.score(sampleU, sampleI)\n",
"        x_uj = self.score(sampleU, sampleJ)\n",
"        return -tf.reduce_mean(tf.math.log(tf.math.sigmoid(x_ui - x_uj)))\n",
"\n",
"    def finalize(self):\n",
"        \"\"\"Cache parameter averages as cold-start fallbacks for predict().\"\"\"\n",
"        self.bi = np.average(self.betaI, axis=0)\n",
"        self.gu = np.average(self.gammaU, axis=0)\n",
"        self.gi = np.average(self.gammaI, axis=0)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Play Predictor"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"class PlayPredictor:\n",
"    \"\"\"Binary 'will this user play this game' classifier built on BPR.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, data, K=5, iters=100):\n",
"        \"\"\"Train a BPR model on (user, game, review) tuples.\n",
"\n",
"        K: latent dimensionality. iters: number of 50k-triple batches.\n",
"        \"\"\"\n",
"        self.userIDs = {}\n",
"        self.itemIDs = {}\n",
"        interactions = []\n",
"\n",
"        for u, i, r in data:\n",
"            # Dense integer ids assigned in order of first appearance.\n",
"            if u not in self.userIDs: self.userIDs[u] = len(self.userIDs)\n",
"            if i not in self.itemIDs: self.itemIDs[i] = len(self.itemIDs)\n",
"            interactions.append((u, i, r[\"played\"]))\n",
"\n",
"        items = list(self.itemIDs.keys())\n",
"\n",
"        itemsPerUser = defaultdict(list)\n",
"        usersPerItem = defaultdict(list)\n",
"        for u, i, r in interactions:\n",
"            itemsPerUser[u].append(i)\n",
"            usersPerItem[i].append(u)\n",
"\n",
"        def trainingStepBPR(model, interactions):\n",
"            \"\"\"One optimizer step over 50k sampled (u, i, j) triples.\"\"\"\n",
"            Nsamples = 50000\n",
"            with tf.GradientTape() as tape:\n",
"                sampleU, sampleI, sampleJ = [], [], []\n",
"                for _ in range(Nsamples):\n",
"                    u, i, _ = random.choice(interactions)  # positive sample\n",
"                    # Rejection-sample a game this user has not played.\n",
"                    j = random.choice(items)\n",
"                    while j in itemsPerUser[u]:\n",
"                        j = random.choice(items)\n",
"                    sampleU.append(self.userIDs[u])\n",
"                    sampleI.append(self.itemIDs[i])\n",
"                    sampleJ.append(self.itemIDs[j])\n",
"\n",
"                loss = model(sampleU, sampleI, sampleJ)\n",
"                loss += model.reg()\n",
"            gradients = tape.gradient(loss, model.trainable_variables)\n",
"            optimizer.apply_gradients((grad, var) for\n",
"                                      (grad, var) in zip(gradients, model.trainable_variables)\n",
"                                      if grad is not None)\n",
"            return loss.numpy()\n",
"\n",
"        optimizer = tf.keras.optimizers.Adam(0.1)\n",
"        self.modelBPR = BPRbatch(K, 0.00001, self.itemIDs, self.userIDs)\n",
"\n",
"        for i in range(iters):\n",
"            obj = trainingStepBPR(self.modelBPR, interactions)\n",
"            if (i % 10 == 9): print(\"iteration \" + str(i+1) + \", objective = \" + str(obj))\n",
"\n",
"        # Cache averaged parameters for cold-start users/games.\n",
"        self.modelBPR.finalize()\n",
"\n",
"    def predict(self, user, game, threshold=0.5):\n",
"        \"\"\"Return 1 if the BPR score exceeds threshold, else 0.\n",
"\n",
"        Unseen users/games map to None, which predict() resolves to the\n",
"        averaged fallback parameters.\n",
"        \"\"\"\n",
"        uid = self.userIDs.get(user)  # None when unseen, as before\n",
"        gid = self.itemIDs.get(game)\n",
"        pred = self.modelBPR.predict(uid, gid).numpy()\n",
"        return int(pred > threshold)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"iteration 10, objective = 0.5121787\n",
"iteration 20, objective = 0.4860348\n",
"iteration 30, objective = 0.4675451\n",
"iteration 40, objective = 0.46167055\n",
"iteration 50, objective = 0.45701832\n",
"iteration 60, objective = 0.44749424\n",
"iteration 70, objective = 0.44757926\n",
"iteration 80, objective = 0.4452785\n",
"iteration 90, objective = 0.4446122\n",
"iteration 100, objective = 0.44039646\n",
"iteration 110, objective = 0.44507992\n",
"iteration 120, objective = 0.44116876\n",
"iteration 130, objective = 0.4395796\n",
"iteration 140, objective = 0.4408364\n",
"iteration 150, objective = 0.44295114\n",
"iteration 160, objective = 0.43921968\n",
"iteration 170, objective = 0.44189137\n",
"iteration 180, objective = 0.43661243\n",
"iteration 190, objective = 0.43899748\n",
"iteration 200, objective = 0.4371943\n"
]
}
],
"source": [
"# Train the play predictor on the 165k training interactions.\n",
"model = PlayPredictor()\n",
"model.fit(train, K=6, iters=200)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7044 2955]\n",
" [2598 7401]]\n",
"PlayPredictor accuracy: 0.7223222322232223\n"
]
}
],
"source": [
"# Confusion matrix over the balanced validation set:\n",
"# rows = true label, cols = predicted label.\n",
"CM = np.array([[0, 0], [0, 0]])\n",
"balanced_valid = get_balanced_set(dataset, valid)\n",
"for user, game, review in balanced_valid:\n",
"    pred = model.predict(user, game, threshold=0.5)\n",
"    CM[review[\"played\"]][pred] += 1\n",
"\n",
"print(CM)\n",
"# Fixed: the original passed the value as a second print() argument after a\n",
"# placeholder-less f-string; interpolate it instead.\n",
"accuracy = 1 - (CM[1][0] + CM[0][1]) / len(balanced_valid)\n",
"print(f\"PlayPredictor accuracy: {accuracy}\")"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"writePredictions(\"pairs_Played.csv\", \"predictions_Played.csv\", model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Time Predictor"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from copy import copy  # NOTE(review): unused -- candidate for removal\n",
"\n",
"class TimePredictor:\n",
"    \"\"\"Predicts hours_transformed as alpha + betaU[user] + betaI[game],\n",
"    fit by coordinate descent with L2 regularization strength l.\"\"\"\n",
"\n",
"    def __init__(self):\n",
"        pass\n",
"\n",
"    def fit(self, data, l=5.0, iters=200):\n",
"        \"\"\"Fit the global offset and user/item biases.\n",
"\n",
"        data: iterable of (user, game, review) tuples.\n",
"        l: regularization strength in the bias updates.\n",
"        iters: number of coordinate-descent sweeps.\n",
"        \"\"\"\n",
"        reviewsPerUser = defaultdict(list)\n",
"        reviewsPerItem = defaultdict(list)\n",
"        globalAverage = 0\n",
"\n",
"        for user, game, review in data:\n",
"            reviewsPerUser[user].append(review)\n",
"            reviewsPerItem[game].append(review)\n",
"            globalAverage += review[\"hours_transformed\"]\n",
"        globalAverage /= len(data)\n",
"\n",
"        # Initialize biases at per-user / per-item mean hours; the sweeps\n",
"        # below refine them.\n",
"        betaU = {}\n",
"        betaI = {}\n",
"        for u in reviewsPerUser:\n",
"            betaU[u] = np.mean([r[\"hours_transformed\"] for r in reviewsPerUser[u]])\n",
"        for g in reviewsPerItem:\n",
"            betaI[g] = np.mean([r[\"hours_transformed\"] for r in reviewsPerItem[g]])\n",
"\n",
"        alpha = globalAverage  # starting guess for the global offset\n",
"\n",
"        for i in range(iters):\n",
"            # Closed-form alpha update given the current biases.\n",
"            newAlpha = 0\n",
"            for user, game, review in data:\n",
"                newAlpha += review[\"hours_transformed\"] - (betaU[user] + betaI[game])\n",
"            alpha = newAlpha / len(data)\n",
"\n",
"            # Regularized update for each user bias.\n",
"            for user in reviewsPerUser:\n",
"                bu = 0\n",
"                for review in reviewsPerUser[user]:\n",
"                    bu += review[\"hours_transformed\"] - (alpha + betaI[review[\"gameID\"]])\n",
"                betaU[user] = bu / (l + len(reviewsPerUser[user]))\n",
"\n",
"            # Regularized update for each item bias.\n",
"            for item in reviewsPerItem:\n",
"                bi = 0\n",
"                for review in reviewsPerItem[item]:\n",
"                    bi += review[\"hours_transformed\"] - (alpha + betaU[review[\"userID\"]])\n",
"                betaI[item] = bi / (l + len(reviewsPerItem[item]))\n",
"\n",
"        self.alpha = alpha\n",
"        self.betaU = betaU\n",
"        self.betaI = betaI\n",
"\n",
"    def predict(self, user, game):\n",
"        \"\"\"Predicted hours_transformed; unseen users/games contribute zero bias.\"\"\"\n",
"        return self.alpha + self.betaU.get(user, 0) + self.betaI.get(game, 0)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"TimePredictor MSE: 2.990628028380304\n"
]
}
],
"source": [
"from sklearn.metrics import mean_squared_error\n",
"\n",
"def MSE(y, ypred):\n",
"    \"\"\"Mean squared error (thin wrapper for naming consistency).\"\"\"\n",
"    return mean_squared_error(y, ypred)\n",
"\n",
"model = TimePredictor()\n",
"model.fit(train)\n",
"\n",
"# Collect targets and predictions over the validation split.\n",
"y, y_pred = [], []\n",
"for user, game, review in valid:\n",
"    y_pred.append(model.predict(user, game))\n",
"    y.append(review[\"hours_transformed\"])\n",
"\n",
"print(f\"TimePredictor MSE: {MSE(y, y_pred)}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"writePredictions(\"pairs_Hours.csv\", \"predictions_Hours.csv\", model)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}