In [1]:
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import re
import gzip

In [2]:
def parseData(fname):
 for l in gzip.open(fname):
 yield eval(l)

data = list(parseData("australian_user_reviews.json.gz"))
users = set()
reviews = []

for user in data:
 if user["user_id"] in users:
 pass
 else:
 users.add(user["user_id"])
 for review in user["reviews"]:
 funny = review["funny"]
 if funny == "":
 review["funny"] = 0
 else:
 review["funny"] = int(re.findall("\d+", funny)[0])
 
 helpful = review["helpful"]
 if helpful == "No ratings yet":
 review["helpful_n"] = 0
 review["helpful_total"] = 0
 review["helpful"] = 0
 else:
 nums = re.findall("\d+", helpful.replace(",", ""))
 helpfulness = float(nums[0]) / float(nums[1])
 review["helpful"] = float(nums[0]) / float(nums[1])
 review["helpful_n"] = float(nums[0])
 review["helpful_total"] = float(nums[1])
 
 try:
 post_datetime = datetime.strptime(review["posted"],'Posted %B %d, %Y.')
 review["posted"] = post_datetime
 except:
 pass

 review["user_id"] = user["user_id"]
 review["user_url"] = user["user_url"]
 reviews.append(review)

In [3]:
from collections import defaultdict
import string
from nltk.stem.porter import *

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for review in reviews:
	r = ''.join([c for c in review['review'].lower() if not c in punctuation])
	for w in r.split():
		w = stemmer.stem(w)
		wordCount[w] += 1
		
len(wordCount)

97248

In [4]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
words = [x[1] for x in counts[:1000]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [5]:
def feature(datum):
 feat = [0]*len(words)
 r = ''.join([c for c in datum['review'].lower() if not c in punctuation])
 for w in r.split():
 if w in words:
 feat[wordId[w]] += 1

 feat.append(len(datum["review"]))
 
 return feat

In [6]:
from sklearn.model_selection import train_test_split
import numpy as np

X_train, X_test, Y_funny_train, Y_funny_test, Y_helpful_train, Y_helpful_test = [], [], [], [], [], []

train, test = train_test_split(reviews, test_size=0.25, random_state=0)

for review in train:
 X_train.append(feature(review))
 #Y1.append(review["funny"])
 if review["funny"] == 0:
 Y_funny_train.append(0)
 else:
 Y_funny_train.append(np.log(review["funny"]))
 #Y2.append(review["helpful_n"])
 if review["helpful_n"] == 0:
 Y_helpful_train.append(0)
 else:
 Y_helpful_train.append(np.log(review["helpful_n"]))

for review in test:
 X_test.append(feature(review))
 #Y1.append(review["funny"])
 if review["funny"] == 0:
 Y_funny_test.append(0)
 else:
 Y_funny_test.append(np.log(review["funny"]))
 #Y2.append(review["helpful_n"])
 if review["helpful_n"] == 0:
 Y_helpful_test.append(0)
 else:
 Y_helpful_test.append(np.log(review["helpful_n"]))


In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
guess_mean_funny = np.mean(Y_funny_train)
guess_mean_helpful = np.mean(Y_helpful_train)

print("baseline", mean_squared_error(Y_funny_test, [guess_mean_funny]*len(Y_funny_test)), mean_squared_error(Y_helpful_test, [guess_mean_helpful]*len(Y_helpful_test)))

baseline 0.17818192454918605 0.557911382661004


In [8]:
from sklearn import linear_model

Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:

 model1 = linear_model.Ridge(alpha=C, fit_intercept=True)
 model1.fit(X_train, Y_funny_train)

 model2 = linear_model.Ridge(alpha=C, fit_intercept=True)
 model2.fit(X_train, Y_helpful_train)

 pred_funny_test = model1.predict(X_test)
 pred_helpful_test = model2.predict(X_test)

 print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))

0.01 0.21632227671897006 0.6847807364939246
0.1 0.21568679448554554 0.6829965387278908
1 0.21316700811782532 0.6747810400344023
10 0.21617761453133164 0.6681779252378663
100 0.20723445731522736 0.5973124724752429


In [9]:
Cs = [0.01, 0.1, 1, 10, 100]

for C in Cs:

 model1 = linear_model.Lasso(alpha=C, fit_intercept=True)
 model1.fit(X_train, Y_funny_train)

 model2 = linear_model.Lasso(alpha=C, fit_intercept=True)
 model2.fit(X_train, Y_helpful_train)

 pred_funny_test = model1.predict(X_test)
 pred_helpful_test = model2.predict(X_test)

 print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))

0.01 0.17702951629340366 0.538690243296189
0.1 0.177432503566242 0.5387345171140366
1 0.17743138596037397 0.538778156304091
10 0.17786269625555318 0.539602097491965
100 0.17818192454918605 0.557911382661004


In [11]:
model = linear_model.Lasso(alpha=0.01, fit_intercept=True)
model.fit(X_train, Y_helpful_train)
idxs = np.argsort(np.abs(model.coef_))[::-1][:10]

for idx in idxs:
 if (idx < len(words)):
 print(f"{model.coef_[idx]}: {words[idx]}")
 else:
 print(f"{model.coef_[idx]}: *review length*")

-0.008666338118757945: you
-0.0033743830081447994: shit
0.002818066343305709: i
-0.0009495127061038011: 3
-0.0007321653144316716: it
0.0006758513650775456: bad
-0.0005113655228402811: nyan
-0.00045892382707229636: of
0.0003103226245059089: *review length*
-0.00026518360080170943: ͡°
