In [1]:
import gzip
def parseData(fname):
    for l in gzip.open(fname):
        yield eval(l)

In [2]:
data = list(parseData("australian_user_reviews.json.gz"))

In [3]:
import re

dm = [[0,0],[0,0]]

for user in data:
    for review in user["reviews"]:
        funny = review["funny"]
        hasfunny = int(funny != "")
        if funny == "":
            review["funny"] = 0
        else:
            review["funny"] = int(re.findall("\d+", funny)[0])
            
        helpful = review["helpful"]
        hashelpful = int(helpful != "No ratings yet")
        if helpful == "No ratings yet":
            review["helpful"] = 0
        else:
            nums = re.findall("\d+", helpful)
            review["helpful"] = float(nums[0]) / float(nums[1])
        
        dm[hasfunny][hashelpful] += 1
            
print(dm)

[[29204, 21950], [964, 7187]]


In [4]:
from collections import defaultdict
import string
from nltk.stem.porter import *

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for user in data:
    for review in user["reviews"]:
      r = ''.join([c for c in review['review'].lower() if not c in punctuation])
      for w in r.split():
            w = stemmer.stem(w)
            wordCount[w] += 1
    
len(wordCount)

97248

In [5]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [6]:
def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
    for w in r.split():
        if w in words:
            feat[wordId[w]] += 1
    feat.append(1) # offset
    return feat

In [7]:
import numpy as np
X = []
Y1 = []
Y2 = []
for user in data:
    for review in user["reviews"]:
        X.append(feature(review))
        Y1.append(review["funny"])
        Y2.append(review["helpful"])

X = np.array(X)
Y1 = np.array(Y1)
Y2 = np.array(Y2)

In [8]:
from sklearn.metrics import mean_squared_error
guess_mean1 = np.mean(Y1)
guess_mean2 = np.mean(Y2)

print("baseline", mean_squared_error(Y1, [guess_mean1]*len(Y1)), mean_squared_error(Y2, [guess_mean2]*len(Y2)))

baseline 291.58597082421744 104.0410362862406


In [9]:
from sklearn import linear_model

for C in [0.001, 0.01, 0.1, 1, 10, 100, 1000]:

    model1 = linear_model.Ridge(C, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Ridge(C, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model1.predict(X)

    print(C, mean_squared_error(Y1, predictions1), mean_squared_error(Y2, predictions2))

0.001 279.441162284691 111.14407506578739
0.01 279.44116808131514 111.14220804619674
0.1 279.4416527267009 111.12490360179714
1 279.4563820408731 111.0088419162745
10 279.55878360690946 110.3977031070603
100 280.29261897219476 108.18116566648386
1000 283.89486211897093 104.93301065452346


In [10]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

for C in [1, 10, 100, 1000]:

    model1 = linear_model.Lasso(alpha=C, fit_intercept=True)
    model1.fit(X, Y1)

    model2 = linear_model.Lasso(alpha=C, fit_intercept=True)
    model2.fit(X, Y2)

    predictions1 = model1.predict(X)
    predictions2 = model1.predict(X)

    print(C, mean_squared_error(Y1, predictions1), mean_squared_error(Y2, predictions2))

1 291.58597082421744 104.40268312757502
10 291.58597082421744 104.40268312757502
100 291.58597082421744 104.40268312757502
1000 291.58597082421744 104.40268312757502
