In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gzip

In [2]:
def parseData(fname):
    for l in gzip.open(fname):
        yield eval(l)

data = list(parseData("australian_user_reviews.json.gz"))

dm = [[0,0],[0,0]]

users = set()
games = set()

nodate = 0

reviews = []

for user in data:
    if user["user_id"] in users:
        #print(f"ducplicate user skipped: {user['user_id']}")
        pass
    else:
        users.add(user["user_id"])
        for review in user["reviews"]:
            games.add(review["item_id"])
            funny = review["funny"]
            hasfunny = int(funny != "")
            if funny == "":
                review["funny"] = 0
            else:
                review["funny"] = int(re.findall("\d+", funny)[0])
                
            helpful = review["helpful"]
            hashelpful = int(helpful != "No ratings yet")
            if helpful == "No ratings yet":
                review["helpful_n"] = 0
                review["helpful_total"] = 0
                review["helpful"] = 0
            else:
                nums = re.findall("\d+", helpful.replace(",", ""))
                helpfulness = float(nums[0]) / float(nums[1])
                review["helpful"] = float(nums[0]) / float(nums[1])
                review["helpful_n"] = float(nums[0])
                review["helpful_total"] = float(nums[1])
            
            dm[hasfunny][hashelpful] += 1

            try:
                post_datetime = datetime.strptime(review["posted"],'Posted %B %d, %Y.')
                review["posted"] = post_datetime
            except:
                nodate += 1

            review["user_id"] = user["user_id"]
            review["user_url"] = user["user_url"]
            reviews.append(review)

In [3]:
from collections import defaultdict
import string
from nltk.stem.porter import *

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for review in reviews:
	r = ''.join([c for c in review['review'].lower() if not c in punctuation])
	for w in r.split():
		w = stemmer.stem(w)
		wordCount[w] += 1
		
len(wordCount)

97248

In [4]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [5]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, Y_funny_train, Y_funny_test, Y_helpful_train, Y_helpful_test = [], [], [], [], [], []

train, test = train_test_split(reviews, test_size=0.25, random_state=0)

for review in train:
    X_train.append(feature(review))
    #Y1.append(review["funny"])
    if review["funny"] == 0:
        Y_funny_train.append(0)
    else:
        Y_funny_train.append(np.log(review["funny"]))
    #Y2.append(review["helpful_n"])
    if review["helpful_n"] == 0:
        Y_helpful_train.append(0)
    else:
        Y_helpful_train.append(np.log(review["helpful_n"]))

for review in test:
    X_test.append(feature(review))
    #Y1.append(review["funny"])
    if review["funny"] == 0:
        Y_funny_test.append(0)
    else:
        Y_funny_test.append(np.log(review["funny"]))
    #Y2.append(review["helpful_n"])
    if review["helpful_n"] == 0:
        Y_helpful_test.append(0)
    else:
        Y_helpful_test.append(np.log(review["helpful_n"]))


In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
guess_mean_funny = np.mean(Y_funny_train)
guess_mean_helpful = np.mean(Y_helpful_train)

print("baseline", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))

baseline 0.17818192454918605 0.557911382661004


In [14]:
from sklearn import linear_model

Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:

    model1 = linear_model.Ridge(C, fit_intercept=True)
    model1.fit(X_train, Y_funny_train)

    model2 = linear_model.Ridge(C, fit_intercept=True)
    model2.fit(X_train, Y_helpful_train)

    pred_funny_test = model1.predict(X_test)
    pred_helpful_test = model2.predict(X_test)

    print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))

0.01 0.24665170508912013 0.7702414041912456
0.1 0.24578924150085898 0.7681419094613451
1 0.24248804203997093 0.7584811772506682
10 0.24888382029075776 0.7518311372299598
100 0.23060394844562843 0.6419885405134674


In [15]:
Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:

    model1 = linear_model.Lasso(C, fit_intercept=True)
    model1.fit(X_train, Y_funny_train)

    model2 = linear_model.Lasso(C, fit_intercept=True)
    model2.fit(X_train, Y_helpful_train)

    pred_funny_test = model1.predict(X_test)
    pred_helpful_test = model2.predict(X_test)

    print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))

0.01 0.17730058785614386 0.539258189636067
0.1 0.17818192454918605 0.543156420319067
1 0.17818192454918605 0.557911382661004
10 0.17818192454918605 0.557911382661004
100 0.17818192454918605 0.557911382661004
