From 4cf85a15dd5ac17000a73014534675dac9abd644 Mon Sep 17 00:00:00 2001 From: Arthur Lu Date: Thu, 30 Nov 2023 10:30:48 -0800 Subject: [PATCH] small improvement by adding review length as feature --- linear_bow.ipynb | 74 ++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/linear_bow.ipynb b/linear_bow.ipynb index b8db19b..b917b2a 100644 --- a/linear_bow.ipynb +++ b/linear_bow.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -24,33 +24,22 @@ " yield eval(l)\n", "\n", "data = list(parseData(\"australian_user_reviews.json.gz\"))\n", - "\n", - "dm = [[0,0],[0,0]]\n", - "\n", "users = set()\n", - "games = set()\n", - "\n", - "nodate = 0\n", - "\n", "reviews = []\n", "\n", "for user in data:\n", " if user[\"user_id\"] in users:\n", - " #print(f\"ducplicate user skipped: {user['user_id']}\")\n", " pass\n", " else:\n", " users.add(user[\"user_id\"])\n", " for review in user[\"reviews\"]:\n", - " games.add(review[\"item_id\"])\n", " funny = review[\"funny\"]\n", - " hasfunny = int(funny != \"\")\n", " if funny == \"\":\n", " review[\"funny\"] = 0\n", " else:\n", " review[\"funny\"] = int(re.findall(\"\\d+\", funny)[0])\n", " \n", " helpful = review[\"helpful\"]\n", - " hashelpful = int(helpful != \"No ratings yet\")\n", " if helpful == \"No ratings yet\":\n", " review[\"helpful_n\"] = 0\n", " review[\"helpful_total\"] = 0\n", @@ -61,14 +50,12 @@ " review[\"helpful\"] = float(nums[0]) / float(nums[1])\n", " review[\"helpful_n\"] = float(nums[0])\n", " review[\"helpful_total\"] = float(nums[1])\n", - " \n", - " dm[hasfunny][hashelpful] += 1\n", - "\n", + " \n", " try:\n", " post_datetime = datetime.strptime(review[\"posted\"],'Posted %B %d, %Y.')\n", " review[\"posted\"] = post_datetime\n", " except:\n", - " nodate += 1\n", + " pass\n", "\n", " review[\"user_id\"] = user[\"user_id\"]\n", " review[\"user_url\"] = user[\"user_url\"]\n", @@ -77,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -86,7 +73,7 @@ "97248" ] }, - "execution_count": 3, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -110,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 34, "metadata": {}, "outputs": [], "source": [ @@ -124,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ @@ -134,13 +121,15 @@ " for w in r.split():\n", " if w in words:\n", " feat[wordId[w]] += 1\n", - " feat.append(1) # offset\n", + "\n", + " feat.append(len(datum[\"review\"]))\n", + " \n", " return feat" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -201,18 +190,18 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.01 0.24665170508912013 0.7702414041912456\n", - "0.1 0.24578924150085898 0.7681419094613451\n", - "1 0.24248804203997093 0.7584811772506682\n", - "10 0.24888382029075776 0.7518311372299598\n", - "100 0.23060394844562843 0.6419885405134674\n" + "0.01 0.21632227671715168 0.6847807364903296\n", + "0.1 0.2156867944836758 0.6829965387241808\n", + "1 0.21316700811628655 0.6747810400313006\n", + "10 0.2161776145305841 0.6681779252365153\n", + "100 0.20723445731519957 0.5973124724751776\n" ] } ], @@ -223,10 +212,10 @@ "\n", "for C in Cs:\n", "\n", - " model1 = linear_model.Ridge(C, fit_intercept=True)\n", + " model1 = linear_model.Ridge(alpha=C, fit_intercept=True)\n", " model1.fit(X_train, Y_funny_train)\n", "\n", - " model2 = linear_model.Ridge(C, fit_intercept=True)\n", + " model2 = linear_model.Ridge(alpha=C, fit_intercept=True)\n", " model2.fit(X_train, Y_helpful_train)\n", "\n", " pred_funny_test = model1.predict(X_test)\n", @@ -237,17 +226,17 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.01 0.17730058785614386 0.539258189636067\n", - "0.1 0.17818192454918605 0.543156420319067\n", - "1 0.17818192454918605 0.557911382661004\n", - "10 0.17818192454918605 0.557911382661004\n", + "0.01 0.17702951629340366 0.538690243296189\n", + "0.1 0.177432503566242 0.5387345171140366\n", + "1 0.17743138596037397 0.538778156304091\n", + "10 0.17786269625555318 0.5396020974919651\n", "100 0.17818192454918605 0.557911382661004\n" ] } @@ -257,10 +246,10 @@ "\n", "for C in Cs:\n", "\n", - " model1 = linear_model.Lasso(C, fit_intercept=True)\n", + " model1 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model1.fit(X_train, Y_funny_train)\n", "\n", - " model2 = linear_model.Lasso(C, fit_intercept=True)\n", + " model2 = linear_model.Lasso(alpha=C, fit_intercept=True)\n", " model2.fit(X_train, Y_helpful_train)\n", "\n", " pred_funny_test = model1.predict(X_test)\n", @@ -268,13 +257,6 @@ "\n", " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {