{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Read Data" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# load data into dataset array\n", "import gzip\n", "from collections import defaultdict\n", "import numpy as np" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "def readJSON(path):\n", " f = gzip.open(path, 'rt', encoding=\"utf-8\")\n", " f.readline()\n", " for l in f:\n", " d = eval(l)\n", " u = d['userID']\n", " g = d['gameID']\n", " yield u,g,d\n", "\n", "dataset = []\n", "for l in readJSON(\"train.json.gz\"):\n", " dataset.append(l)\n", "\n", "for user,game,review in dataset:\n", " review[\"played\"] = 1" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "# train test split\n", "\n", "from sklearn.model_selection import train_test_split\n", "\n", "#train, valid = train_test_split(dataset, train_size=165000, random_state=0)\n", "train = dataset[:165000]\n", "valid = dataset[165000:]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "# Get negative labels in vaidation\n", "import random\n", "\n", "def get_balanced_validation (dataset, valid):\n", "\n", " all_games = set()\n", " user_played = defaultdict(set)\n", "\n", " for user,game,review in dataset:\n", " all_games.add(review[\"gameID\"])\n", " user_played[review[\"userID\"]].add(review[\"gameID\"])\n", "\n", " negative_valid = []\n", "\n", " for user,game,review in valid:\n", " not_played = all_games - user_played[user]\n", " new_game = random.choice(tuple(not_played))\n", " negative_valid.append((user, new_game, {\"played\": 0}))\n", "\n", " return valid + negative_valid" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Utility Functions" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "def writePredictions(infile, outfile, model):\n", " with open(outfile, 'w') as predictions:\n", " for l in open(infile):\n", " if l.startswith(\"userID\"):\n", " predictions.write(l)\n", " continue\n", " u,g = l.strip().split(',')\n", " \n", " pred = model.predict(u,g)\n", " \n", " _ = predictions.write(u + ',' + g + ',' + str(pred) + '\\n')\n", "\n", " predictions.close()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Play Predictor" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "class PlayPredictor:\n", "\n", " def __init__(self):\n", " pass\n", "\n", " def fit(self, data, threshold=0.6): # data is an array of (user, game, review) tuples\n", " self.topGames = self.getTopGames(threshold)\n", "\n", " def predict(self, user, game):\n", " return int(game in self.topGames)\n", "\n", " def getTopGames (self, threshold):\n", " gameCount = defaultdict(int)\n", " totalPlayed = 0\n", "\n", " for user,game,_ in readJSON(\"train.json.gz\"):\n", " gameCount[game] += 1\n", " totalPlayed += 1\n", "\n", " mostPopular = [(gameCount[x], x) for x in gameCount]\n", " mostPopular.sort()\n", " mostPopular.reverse()\n", "\n", " return1 = set()\n", " count = 0\n", " for ic, i in mostPopular:\n", " count += ic\n", " return1.add(i)\n", " if count > totalPlayed * threshold: break\n", " return return1" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "model = PlayPredictor()\n", "model.fit(train, threshold=0.6)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "PlayPredictor accuracy: 0.6997699769976997\n" ] } ], "source": [ "error = 0\n", "balanced_valid = get_balanced_validation(dataset, valid)\n", "for user, game, review in balanced_valid:\n", " pred = model.predict(user, game)\n", " if pred != review[\"played\"]:\n", " error += 1\n", "\n", "print(f\"PlayPredictor accuracy: \", 1 - error / len(balanced_valid))" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "writePredictions(\"pairs_Played.csv\", \"predictions_Played.csv\", model)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Time Predictor" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "from copy import copy\n", "\n", "class TimePredictor:\n", " \n", " def __init__(self):\n", " pass\n", "\n", " def fit(self, data, l=5.0, iters=200): # data is an array of (user, game, review) tuples\n", " reviewsPerUser = defaultdict(list)\n", " reviewsPerItem = defaultdict(list)\n", "\n", " globalAverage = 0\n", "\n", " for user, game, review in data:\n", " reviewsPerUser[user].append(review)\n", " reviewsPerItem[game].append(review)\n", "\n", " globalAverage += review[\"hours_transformed\"]\n", "\n", " globalAverage /= len(data)\n", "\n", " betaU = {}\n", " betaI = {}\n", " for u in reviewsPerUser:\n", " reviews = [r[\"hours_transformed\"] for r in reviewsPerUser[u]]\n", " betaU[u] = np.mean(reviews)\n", "\n", " for g in reviewsPerItem:\n", " reviews = [r[\"hours_transformed\"] for r in reviewsPerItem[g]]\n", " betaI[g] = np.mean(reviews)\n", "\n", " alpha = globalAverage # Could initialize anywhere, this is a guess\n", "\n", " for i in range(iters):\n", "\n", " newAlpha = 0\n", " for user,game,review in data:\n", " newAlpha += review[\"hours_transformed\"] - (betaU[user] + betaI[game])\n", " alpha = newAlpha / len(data)\n", "\n", " for user in reviewsPerUser:\n", " bu = 0\n", " for review in reviewsPerUser[user]:\n", " item = review[\"gameID\"]\n", " bu += review[\"hours_transformed\"] - (alpha + betaI[item])\n", " betaU[user] = bu / (l + len(reviewsPerUser[user]))\n", " \n", " for item in reviewsPerItem:\n", " bi = 0\n", " for review in reviewsPerItem[item]:\n", " user = review[\"userID\"]\n", " bi += review[\"hours_transformed\"] - (alpha + betaU[user])\n", " betaI[item] = bi / (l + len(reviewsPerItem[item]))\n", " \n", " self.alpha = alpha\n", " self.betaU = betaU\n", " self.betaI = betaI\n", "\n", " def predict(self, user, game):\n", " bu = 0\n", " bi = 0\n", "\n", " if user in self.betaU:\n", " bu = self.betaU[user]\n", " \n", " if game in self.betaI:\n", " bi = self.betaI[game]\n", "\n", " return self.alpha + bu + bi" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "\n", "def MSE(y, ypred):\n", " return mean_squared_error(y, ypred)\n", "\n", "model = TimePredictor()\n", "model.fit(train, l=5.0, iters=200)" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "TimePredictor MSE: 2.990628028380304\n" ] } ], "source": [ "y = []\n", "y_pred = []\n", "for user, game, review in valid:\n", " y_pred.append(model.predict(user, game))\n", " y.append(review[\"hours_transformed\"])\n", "\n", "print(f\"TimePredictor MSE: {MSE(y, y_pred)}\")" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "writePredictions(\"pairs_Hours.csv\", \"predictions_Hours.csv\", model)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }