From e5e37ebea95a606e39b887fb4e5b776d91ae0eda Mon Sep 17 00:00:00 2001 From: ltcptgeneral Date: Mon, 4 Dec 2023 10:22:45 -0800 Subject: [PATCH] extract lasso features --- linear_bow.ipynb | 66 +++++++++++++++++++++++++++++++++++------------ preliminary.ipynb | 4 +-- 2 files changed, 52 insertions(+), 18 deletions(-) diff --git a/linear_bow.ipynb b/linear_bow.ipynb index b917b2a..cfa1c17 100644 --- a/linear_bow.ipynb +++ b/linear_bow.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 31, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -64,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -73,7 +73,7 @@ "97248" ] }, - "execution_count": 33, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -97,7 +97,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -111,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -129,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -169,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -190,18 +190,18 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "0.01 0.21632227671715168 0.6847807364903296\n", - "0.1 0.2156867944836758 0.6829965387241808\n", - "1 0.21316700811628655 0.6747810400313006\n", - "10 0.2161776145305841 0.6681779252365153\n", - "100 0.20723445731519957 0.5973124724751776\n" + "0.01 0.21632227671897006 0.6847807364939246\n", + "0.1 0.21568679448554554 0.6829965387278908\n", + "1 0.21316700811782532 0.6747810400344023\n", + "10 0.21617761453133164 0.6681779252378663\n", + "100 0.20723445731522736 0.5973124724752429\n" ] } ], @@ -226,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -236,7 +236,7 @@ "0.01 0.17702951629340366 0.538690243296189\n", "0.1 0.177432503566242 0.5387345171140366\n", "1 0.17743138596037397 0.538778156304091\n", - "10 0.17786269625555318 0.5396020974919651\n", + "10 0.17786269625555318 0.539602097491965\n", "100 0.17818192454918605 0.557911382661004\n" ] } @@ -257,6 +257,40 @@ "\n", " print(C, mean_squared_error(Y_funny_test, pred_funny_test), mean_squared_error(Y_helpful_test, pred_helpful_test))" ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-0.008666338118757945: you\n", + "-0.0033743830081447994: shit\n", + "0.002818066343305709: i\n", + "-0.0009495127061038011: 3\n", + "-0.0007321653144316716: it\n", + "0.0006758513650775456: bad\n", + "-0.0005113655228402811: nyan\n", + "-0.00045892382707229636: of\n", + "0.0003103226245059089: *review length*\n", + "-0.00026518360080170943: ͡°\n" + ] + } + ], + "source": [ + "model = linear_model.Lasso(alpha=0.01, fit_intercept=True)\n", + "model.fit(X_train, Y_helpful_train)\n", + "idxs = np.argsort(np.abs(model.coef_))[::-1][:10]\n", + "\n", + "for idx in idxs:\n", + " if (idx < len(words)):\n", + " print(f\"{model.coef_[idx]}: {words[idx]}\")\n", + " else:\n", + " print(f\"{model.coef_[idx]}: *review length*\")" + ] } ], "metadata": { diff --git a/preliminary.ipynb b/preliminary.ipynb index 55037ff..eebf99f 100644 --- a/preliminary.ipynb +++ b/preliminary.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "9808cacf", "metadata": {}, "outputs": [], @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "494d6c25", "metadata": {}, "outputs": [],