### Read Data

In [27]:
# load data into dataset array
import gzip
from collections import defaultdict
import numpy as np

In [28]:
def readJSON(path):
    f = gzip.open(path, 'rt', encoding="utf-8")
    f.readline()
    for l in f:
        d = eval(l)
        u = d['userID']
        g = d['gameID']
        yield u,g,d

dataset = []
for l in readJSON("train.json.gz"):
    dataset.append(l)

for user,game,review in dataset:
    review["played"] = 1

In [29]:
# train test split

from sklearn.model_selection import train_test_split

#train, valid = train_test_split(dataset, train_size=165000, random_state=0)
train = dataset[:165000]
valid = dataset[165000:]

In [30]:
# Get negative labels in vaidation
import random

def get_balanced_validation (dataset, valid):

    all_games = set()
    user_played = defaultdict(set)

    for user,game,review in dataset:
        all_games.add(review["gameID"])
        user_played[review["userID"]].add(review["gameID"])

    negative_valid = []

    for user,game,review in valid:
        not_played = all_games - user_played[user]
        new_game = random.choice(tuple(not_played))
        negative_valid.append((user, new_game, {"played": 0}))

    return valid + negative_valid

### Utility Functions

In [31]:
def writePredictions(infile, outfile, model):
    with open(outfile, 'w') as predictions:
        for l in open(infile):
            if l.startswith("userID"):
                predictions.write(l)
                continue
            u,g = l.strip().split(',')
            
            pred = model.predict(u,g)
            
            _ = predictions.write(u + ',' + g + ',' + str(pred) + '\n')

        predictions.close()

### Play Predictor

In [32]:
class PlayPredictor:

    def __init__(self):
        pass

    def fit(self, data, threshold=0.6): # data is an array of (user, game, review) tuples
        self.topGames = self.getTopGames(threshold)

    def predict(self, user, game):
        return int(game in self.topGames)

    def getTopGames (self, threshold):
        gameCount = defaultdict(int)
        totalPlayed = 0

        for user,game,_ in readJSON("train.json.gz"):
            gameCount[game] += 1
            totalPlayed += 1

        mostPopular = [(gameCount[x], x) for x in gameCount]
        mostPopular.sort()
        mostPopular.reverse()

        return1 = set()
        count = 0
        for ic, i in mostPopular:
            count += ic
            return1.add(i)
            if count > totalPlayed * threshold: break
        return return1

In [33]:
model = PlayPredictor()
model.fit(train, threshold=0.6)

In [34]:
error = 0
balanced_valid = get_balanced_validation(dataset, valid)
for user, game, review in balanced_valid:
    pred = model.predict(user, game)
    if pred != review["played"]:
        error += 1

print(f"PlayPredictor accuracy: ", 1 - error / len(balanced_valid))

PlayPredictor accuracy:  0.6997699769976997


In [35]:
writePredictions("pairs_Played.csv", "predictions_Played.csv", model)

### Time Predictor

In [36]:
from copy import copy

class TimePredictor:
    
    def __init__(self):
        pass

    def fit(self, data, l=5.0, iters=200): # data is an array of (user, game, review) tuples
        reviewsPerUser = defaultdict(list)
        reviewsPerItem = defaultdict(list)

        globalAverage = 0

        for user, game, review in data:
            reviewsPerUser[user].append(review)
            reviewsPerItem[game].append(review)

            globalAverage += review["hours_transformed"]

        globalAverage /= len(data)

        betaU = {}
        betaI = {}
        for u in reviewsPerUser:
            reviews = [r["hours_transformed"] for r in reviewsPerUser[u]]
            betaU[u] = np.mean(reviews)

        for g in reviewsPerItem:
            reviews = [r["hours_transformed"] for r in reviewsPerItem[g]]
            betaI[g] = np.mean(reviews)

        alpha = globalAverage # Could initialize anywhere, this is a guess

        for i in range(iters):

            newAlpha = 0
            for user,game,review in data:
                newAlpha += review["hours_transformed"] - (betaU[user] + betaI[game])
            alpha = newAlpha / len(data)

            for user in reviewsPerUser:
                bu = 0
                for review in reviewsPerUser[user]:
                    item = review["gameID"]
                    bu += review["hours_transformed"] - (alpha + betaI[item])
                betaU[user] = bu / (l + len(reviewsPerUser[user]))
            
            for item in reviewsPerItem:
                bi = 0
                for review in reviewsPerItem[item]:
                    user = review["userID"]
                    bi += review["hours_transformed"] - (alpha + betaU[user])
                betaI[item] = bi / (l + len(reviewsPerItem[item]))
        
        self.alpha = alpha
        self.betaU = betaU
        self.betaI = betaI

    def predict(self, user, game):
        bu = 0
        bi = 0

        if user in self.betaU:
            bu = self.betaU[user]
        
        if game in self.betaI:
            bi = self.betaI[game]

        return self.alpha + bu + bi

In [40]:
from sklearn.metrics import mean_squared_error

def MSE(y, ypred):
    return mean_squared_error(y, ypred)

model = TimePredictor()
model.fit(train, l=5.0, iters=200)

In [41]:
y = []
y_pred = []
for user, game, review in valid:
    y_pred.append(model.predict(user, game))
    y.append(review["hours_transformed"])

print(f"TimePredictor MSE: {MSE(y, y_pred)}")

TimePredictor MSE: 2.990628028380304


In [42]:
writePredictions("pairs_Hours.csv", "predictions_Hours.csv", model)