From 7c3cc130140cefcbe11fa34942eec07166b0e003 Mon Sep 17 00:00:00 2001 From: ltcptgeneral <35508619+ltcptgeneral@users.noreply.github.com> Date: Tue, 27 Nov 2018 22:45:28 -0600 Subject: [PATCH] analysis.py - v 1.0.6.000 changelog: - added calc_overfit, which calculates two measures of overfit, error and performance - added calculating overfit to optimize_regression --- .../__pycache__/analysis.cpython-37.pyc | Bin 23960 -> 25085 bytes data analysis/analysis.py | 79 +++++++++++++++--- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/data analysis/__pycache__/analysis.cpython-37.pyc b/data analysis/__pycache__/analysis.cpython-37.pyc index 34f8d5f6d39d12ad2eefd109c136ceeb11bb3e4b..a3a86a9df77a4f467c0709838b3ffb68d9b5a366 100644 GIT binary patch delta 3457 zcmaJDZE#dq^_};z`}QMC5<)g13A`W_7D$>9z7moU!dD;=Abf;;k#2KeLS#3a+e!0VE7R6-DuuDMItuhnf2h?~+tP8w(Qz!D?pTL$Mjd9Vqoe+CXshSEU6LT3 zzT}>J?m6e4bMN`O``nM=leZw$oRj01@H6Lq+TAmLK2!vC*O#7xhE=Dfu1Z$J@^OBc zt!8U*u3~H1I-IN7dRB|`s_hc1V;fFOY=b#}O6jU$-K?H9oCfh{s7dW)tthjB^|0Nn z?KE`NvpsAtQX1Gk){b){+s_W*ypi>?gX|DWG_fPBgB?a}6YFE0tP8PbHgCTad1S&@ zU0GGRrn0K4D#|^Q)PFQOJZQ!egLVFCMxEclufm8jW-=p{7_o-pLlb5nHwU?yOb#W6 zje)V@ehZP6#)+dt{YQT!Q8 zWtlu_B*rWwnTVsJ<0c;%vd9V@AZaYScb24aV#qpb@FS~?2BSKXRX-Ao4iDos@G?WR zp>A3OGcD7-gfTQcbH)cQ>dD;H8J9PEA@02HO96~J^?|pX7W1 z&pPji4g!4JsmKigye4Y%o`au=-{zf$Q%*8GO#kni?4@o$s%C8}&GR-TMGBv9tCmI3sS= z-cCI&OHwkI(p^^Fze2f44$6MHUn3%ZR0}GO%{Y=>2nMA|ZM@2o7+7E{Q^4dYWsB5X zI_a?$+fyt}dM%kLwiK3_8kVL2q|w3-$x>|1R;T1i-BPod&xVxkE0S#4){CS88M9hz z<9TR;Hz1e@9*@S7mC=zAbC@mTPa!Fy@r}f8B5;;Kh`<~K5r0}YN5`U+8EGGnC;KcO z9U4ySd{q|njQ3e)(n@^UhwIlrqO2v6 zH!V1(@B;K8Ou;FzrAf(#NkE^gfQkno!+snA9KrD} zGH%Nh-mJ-Ole+D{=(7Sg1y`R!ekyAwa35LCQiJzVwG4IiM;+4mS5UdiFqaB4`6FX) z@k#yU5!XPF3|Kj~ZYw2tZ!$E?&TQ_I412c0+3XBg6aT0$i+{e!(C3{bX9< ztCDGNw$Mdw6lUZRsXV%_(Yii%;l+ha<5|c3Y+uJ5|78RY_8sHu0*=?^v;TmILyGF%4ncI#HSApEKZwsm?Yh_ zUr3mnyQG#E5O)D$5jm~ln>S|iyl3{6!iS=4Ul}YG8}|JZ{wf}BFM$=JwY?k4#g+Do zxp$}UhZicsjN}#KFYU)e<)ldED{)HWQ-8I+Jj^X~RKa&A`x=G7K!@-uae4nEIypF^ z@VO#zVENp@jF2BiP^d<<9JsuHDGKpDbj&6Kw1;0J%=OJ;QlaraBDk^0BYpva5P|>m z4*iD6S4HE&Uxu>AvpYpzIPB?0ZkZ}pANozz3fkO6T3tCgVr+3qf&Pi&6xOKQ?k$V#|7SS_yT*=ut5lx+Cxh~cH zF`)D%lWSBF?<|7#;!Njvsz1v~>&e_K|D@INN25IJCDGGW0-MBm*NOZ>lA|7op>VxX zE12xTiBk z4blDKk*UU9THYi;U4~Z?mNa%WR-QlbEm-g$j+Vf zo$q|#?|Yte?!9;t4*v;U4Rhx@8U4PPx!5~7cF>gv6~{{UKwZ`(>nRl#qLSw8#Zs}1 z<}$Hdtf0AkH4|0hhDj!F2~xUy1}_FpCNugR}%Uu%^VslX++(Gjj^~k0WsL3Nq3A3n1DU7neEbP*l5_d z_rx^8<6LODOlT9@O|0+cU5>aut;cL}7KgZFg7hBt1OSWMrpHso+N{|A3$UJ)j#YHB z?R&|&CTwxdP0n-K1Sj{m?{e(uw*uHzfS4ohh-(FG0)7!lkNs9l@y4w@RKVY=X9IV# zu_<8c=_am=1|@RO?<68GCu02HL{ha^C-Y3LJXM|4>ezB);dnqbHZ~`&Prz(r7gJ;H zafgR`v4u~8oEG-D9(U9-nr*W)>1){(j7?M0RXd8QPflT`pTO?T$VaCCHdYiwvV;v3l-jrlf(eM6C$oJWU5)+y3ORx2PZCWDB z1~-5nE{|j8s<#bFBoq;&Blzj6_NrG2<+=iQ@+_|N9PZ_AkV-e8yr>_&UzmCsS{GwV zGY4d|M?y&+zlM|B~m=)>VNG zbK8s3l%2ew$Q-P1&-0`KvA-(fds5$zvb}_V&+w7=-cZOLilYO?WVhu*ac;%U zCC<+iwr!!|tzpTqzrzbp;m(di$i{~{&Vh*!tS^9kd~SVjTBgEk2?)spSntWbD&Y*? zHWU%T;aGU=#N|r46Y@X@=_T}lcBsgW{hfW8maQ%eNSJ`(q=TYM4kS5v=ZXUt_^4U3 z2%&4Ep_=d4B*&qy>+()t0Xfyd0a>h~079V~-{{&`okL66tO^S#qKxH}ide17CavEl zgGyj+7`;iD)D*dhC%oDA9@6|(!z11A&7F;##8SC=_Fhu{UBg2?AC%=QlYnw#!FbYT zo|67*Pc{(ftY#kSbwJUKGB`lQ4+$7ZIeq6y_OphsZP?&T!N^6I{aI&Jp#Ycm-BN57 z6HtzfWMm7OQVhe#`U?G)RGB8UU^M!p>Rl$S=<`CI^Kvh^tw)yNKl+|7|Gn0gs0>oN zme99v3dz~t;UD`8=-c{H{~cL2g&R=`X~~4H#bD42^DrJ&O-#!m*|>q!?BPT?+dL?a~T06-$*wyj2iqrwBUb1 C%1+e) diff --git a/data analysis/analysis.py b/data analysis/analysis.py index b038ccc3..44611d65 100644 --- a/data analysis/analysis.py +++ b/data analysis/analysis.py @@ -8,9 +8,12 @@ #setup: -__version__ = "1.0.5.000" +__version__ = "1.0.6.000" __changelog__ = """changelog: +1.0.6.000: + - added calc_overfit, which calculates two measures of overfit, error and performance + - added calculating overfit to optimize_regression 1.0.5.000: - added optimize_regression function, which is a sample function to find the optimal regressions - optimize_regression function filters out some overfit funtions (functions with r^2 = 1) @@ -667,6 +670,24 @@ def rms(predictions, targets): # assumes equal size inputs return float(out) +def calc_overfit(equation, rms_train, r2_train, x_test, y_test): + + #overfit = performance(train) - performance(test) where performance is r^2 + #overfir = error(train) - error(test) where error is rms + + vals = [] + + for i in range(0, len(x_test), 1): + + z = x_test[i] + + exec("vals.append(" + equation + ")") + + r2_test = r_squared(vals, y_test) + rms_test = rms(vals, y_test) + + return rms_train - rms_test, r2_train - r2_test + def strip_data(data, mode): if mode == "adam": #x is the row number, y are the data @@ -686,7 +707,31 @@ def optimize_regression(x, y, _range, resolution):#_range in poly regression is if type(resolution) != int: raise error("resolution must be int") + x = x + y = y + x_train = [] + y_train = [] + + x_test = [] + y_test = [] + + for i in range (0, math.floor(len(x) * 0.4), 1): + + index = random.randint(0, len(x) - 1) + + x_test.append(x[index]) + y_test.append(y[index]) + + x.pop(index) + y.pop(index) + + x_train = x + y_train = y + + #print(x_train, x_test) + #print(y_train, y_test) + eqs = [] rmss = [] @@ -695,17 +740,17 @@ def optimize_regression(x, y, _range, resolution):#_range in poly regression is for i in range (0, _range + 1, 1): - eqs.append(poly_regression(x, y, i)[0]) - rmss.append(poly_regression(x, y, i)[1]) - r2s.append(poly_regression(x, y, i)[2]) + eqs.append(poly_regression(x_train, y_train, i)[0]) + rmss.append(poly_regression(x_train, y_train, i)[1]) + r2s.append(poly_regression(x_train, y_train, i)[2]) for i in range (1, 100 * resolution + 1): try: - eqs.append(exp_regression(x, y, float(i / resolution))[0]) - rmss.append(exp_regression(x, y, float(i / resolution))[1]) - r2s.append(exp_regression(x, y, float(i / resolution))[2]) + eqs.append(exp_regression(x_train, y_train, float(i / resolution))[0]) + rmss.append(exp_regression(x_train, y_train, float(i / resolution))[1]) + r2s.append(exp_regression(x_train, y_train, float(i / resolution))[2]) except: @@ -715,9 +760,9 @@ def optimize_regression(x, y, _range, resolution):#_range in poly regression is try: - eqs.append(log_regression(x, y, float(i / resolution))[0]) - rmss.append(log_regression(x, y, float(i / resolution))[1]) - r2s.append(log_regression(x, y, float(i / resolution))[2]) + eqs.append(log_regression(x_train, y_train, float(i / resolution))[0]) + rmss.append(log_regression(x_train, y_train, float(i / resolution))[1]) + r2s.append(log_regression(x_train, y_train, float(i / resolution))[2]) except: @@ -742,8 +787,14 @@ def optimize_regression(x, y, _range, resolution):#_range in poly regression is except: break + + overfit = [] + + for i in range (0, len(eqs), 1): + + overfit.append(calc_overfit(eqs[i], rmss[i], r2s[i], x_test, y_test)) - return eqs, rmss, r2s + return eqs, rmss, r2s, overfit def basic_analysis(filepath): #assumes that rows are the independent variable and columns are the dependant. also assumes that time flows from lowest column to highest column. @@ -855,6 +906,12 @@ def debug(): print(exp_regression([1, 2, 3, 4], [2, 4, 8, 16], 2.717)) + x, y, z = optimize_regression([0, 1, 2, 3, 4], [1, 2, 4, 7, 19], 10, 100) + + for i in range(0, len(x), 1): + + print(str(x[i]) + " | " + str(y[i]) + " | " + str(z[i])) + #statistics def below------------------------------------------------------------------------------------------------------------------------------------------------------ class StatisticsError(ValueError):