In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gzip

In [2]:
def parseData(fname):
    for l in gzip.open(fname):
        yield eval(l)

data = list(parseData("australian_user_reviews.json.gz"))

dm = [[0,0],[0,0]]

users = set()
games = set()

nodate = 0

reviews = []

for user in data:
    if user["user_id"] in users:
        #print(f"ducplicate user skipped: {user['user_id']}")
        pass
    else:
        users.add(user["user_id"])
        for review in user["reviews"]:
            games.add(review["item_id"])
            funny = review["funny"]
            hasfunny = int(funny != "")
            if funny == "":
                review["funny"] = 0
            else:
                review["funny"] = int(re.findall("\d+", funny)[0])
                
            helpful = review["helpful"]
            hashelpful = int(helpful != "No ratings yet")
            if helpful == "No ratings yet":
                review["helpful_n"] = 0
                review["helpful_total"] = 0
                review["helpful"] = 0
            else:
                nums = re.findall("\d+", helpful.replace(",", ""))
                helpfulness = float(nums[0]) / float(nums[1])
                review["helpful"] = float(nums[0]) / float(nums[1])
                review["helpful_n"] = float(nums[0])
                review["helpful_total"] = float(nums[1])
            
            dm[hasfunny][hashelpful] += 1

            try:
                post_datetime = datetime.strptime(review["posted"],'Posted %B %d, %Y.')
                review["posted"] = post_datetime
            except:
                nodate += 1

            review["user_id"] = user["user_id"]
            review["user_url"] = user["user_url"]
            reviews.append(review)

In [3]:
from collections import defaultdict
import string
from nltk.stem.porter import *

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for review in reviews:
	r = ''.join([c for c in review['review'].lower() if not c in punctuation])
	for w in r.split():
		w = stemmer.stem(w)
		wordCount[w] += 1
		
len(wordCount)

97248

In [4]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [5]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [10]:
import numpy as np
X = []
Y1 = []
Y2 = []
for review in reviews:
    X.append(feature(review))
    Y1.append(review["funny"])
    Y2.append(review["helpful_n"])

X = np.array(X)
Y1 = np.array(Y1)
Y2 = np.array(Y2)

In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
guess_mean1 = np.mean(Y1)
guess_mean2 = np.mean(Y2)

print("baseline", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_absolute_error(Y2, [guess_mean2]*len(Y2)))

baseline 294.7309048565537 4.604634941766926


In [12]:
from sklearn import linear_model

for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:

    model1 = linear_model.Ridge(C, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Ridge(C, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model1.predict(X)

    print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))

0.001 282.2541467007739 4.179655704428717
0.01 282.25415246942805 4.179600740282743
0.1 282.2546345232787 4.179072864682249
1 282.2692492511399 4.175349141167781
10 282.3721909589884 4.147935437500891
100 283.13132181376034 3.9883973026815065
1000 286.86570062121467 3.620101916467935


In [14]:
for C in [1, 10, 100, 1000]:

    model1 = linear_model.Lasso(alpha=C, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Lasso(alpha=C, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model1.predict(X)

    print(C, mean_squared_error(Y1, predictions1), mean_absolute_error(Y2, predictions2))

1 294.7309048565537 3.2338225122785453
10 294.7309048565537 3.2338225122785453
100 294.7309048565537 3.2338225122785453
1000 294.7309048565537 3.2338225122785453
