From 8545a0669124baaceb4fd7380c3a2bf0067749f3 Mon Sep 17 00:00:00 2001
From: Jacob Levine
Date: Sat, 16 Mar 2019 15:47:56 -0500
Subject: [PATCH] changed to signups. not complete yet

---
 .../.ipynb_checkpoints/analysis-checkpoint.py | 1107 +++++++++++++++++
 .../__pycache__/analysis.cpython-36.pyc       |  Bin 0 -> 25273 bytes
 2 files changed, 1107 insertions(+)
 create mode 100644 data analysis/.ipynb_checkpoints/analysis-checkpoint.py
 create mode 100644 data analysis/__pycache__/analysis.cpython-36.pyc

diff --git a/data analysis/.ipynb_checkpoints/analysis-checkpoint.py b/data analysis/.ipynb_checkpoints/analysis-checkpoint.py
new file mode 100644
index 00000000..479f79de
--- /dev/null
+++ b/data analysis/.ipynb_checkpoints/analysis-checkpoint.py
@@ -0,0 +1,1107 @@
+#Titan Robotics Team 2022: Data Analysis Module
+#Written by Arthur Lu & Jacob Levine
+#Notes:
+#   this should be imported as a python module using 'import analysis'
+#   this should be included in the local directory or environment variable
+#   this module has not been optimized for multithreaded computing
+#number of easter eggs: 2
+#setup:
+
+__version__ = "1.0.8.005"
+
+#changelog should be viewed using print(analysis.__changelog__)
+__changelog__ = """changelog:
+1.0.8.005:
+    - minor fixes
+1.0.8.004:
+    - removed a few unused dependencies
+1.0.8.003:
+    - added p_value function
+1.0.8.002:
+    - updated __all__ correctly to contain changes made in v 1.0.8.000 and v 1.0.8.001
+1.0.8.001:
+    - refactors
+    - bugfixes
+1.0.8.000:
+    - deprecated histo_analysis_old
+    - deprecated debug
+    - altered basic_analysis to take array data instead of filepath
+    - refactor
+    - optimization
+1.0.7.002:
+    - bug fixes
+1.0.7.001:
+    - bug fixes
+1.0.7.000:
+    - added tanh_regression (logistic regression)
+    - bug fixes
+1.0.6.005:
+    - added z_normalize function to normalize dataset
+    - bug fixes
+1.0.6.004:
+    - bug fixes
+1.0.6.003:
+    - bug fixes
+1.0.6.002:
+    - bug fixes
+1.0.6.001:
+    - corrected __all__ to contain all of the functions
+1.0.6.000:
+    - added calc_overfit, which calculates two measures of overfit, error and performance
+    - added calculating overfit to optimize_regression
+1.0.5.000:
+    - added optimize_regression function, which is a sample function to find the optimal regressions
+    - optimize_regression function filters out some overfit functions (functions with r^2 = 1)
+    - planned addition: overfit detection in the optimize_regression function
+1.0.4.002:
+    - added __changelog__
+    - updated debug function with log and exponential regressions
+1.0.4.001:
+    - added log regressions
+    - added exponential regressions
+    - added log_regression and exp_regression to __all__
+1.0.3.008:
+    - added debug function to further consolidate functions
+1.0.3.007:
+    - added builtin benchmark function
+    - added builtin random (linear) data generation function
+    - added device initialization (_init_device)
+1.0.3.006:
+    - reorganized the imports list to be in alphabetical order
+    - added search and regurgitate functions to c_entities, nc_entities, obstacles, objectives
+1.0.3.005:
+    - major bug fixes
+    - updated historical analysis
+    - deprecated old historical analysis
+1.0.3.004:
+    - added __version__, __author__, __all__
+    - added polynomial regression
+    - added root mean squared function
+    - added r squared function
+1.0.3.003:
+    - bug fixes
+    - added c_entities
+1.0.3.002:
+    - bug fixes
+    - added nc_entities, obstacles, objectives
+    - consolidated statistics.py to analysis.py
+1.0.3.001:
+    - compiled 1d, column, and row basic stats into basic_stats function
+1.0.3.000:
+    - added historical analysis function
+1.0.2.xxx:
+    - added z score test
+1.0.1.xxx:
+    - major bug fixes
+1.0.0.xxx:
+    - added loading csv
+    - added 1d, column, row basic stats
+"""
+
+__author__ = (
+    "Arthur Lu , "
+    "Jacob Levine ,"
+    )
+
+__all__ = [
+    '_init_device',
+    'c_entities',
+    'nc_entities',
+    'obstacles',
+    'objectives',
+    'load_csv',
+    'basic_stats',
+    'z_score',
+    'z_normalize',
+    'stdev_z_split',
+    'histo_analysis',
+    'poly_regression',
+    'log_regression',
+    'exp_regression',
+    'r_squared',
+    'rms',
+    'calc_overfit',
+    'strip_data',
+    'optimize_regression',
+    'select_best_regression',
+    'basic_analysis',
+    #all statistics functions left out due to integration in other functions
+    ]
+
+#now back to your regularly scheduled programming:
+
+#imports (now in alphabetical order! v 1.0.3.006):
+
+from bisect import bisect_left, bisect_right
+import collections
+import csv
+from decimal import Decimal
+import functools
+from fractions import Fraction
+from itertools import groupby
+import math
+import matplotlib
+import numbers
+import numpy as np
+import pandas
+import random
+import scipy
+from scipy.optimize import curve_fit
+from scipy import stats
+from sklearn import metrics
+#import statistics <-- statistics.py functions have been integrated into analysis.py as of v 1.0.3.002
+import time
+import torch
+
+class error(ValueError):
+    pass
+
+def _init_device(setting, arg): #initiates computation device for ANNs
+    if setting == "cuda":
+        try:
+            return torch.device(setting + ":" + str(arg) if torch.cuda.is_available() else "cpu")
+        except:
+            raise error("could not assign cuda or cpu")
+    elif setting == "cpu":
+        try:
+            return torch.device("cpu")
+        except:
+            raise error("could not assign cpu")
+    else:
+        raise error("specified device does not exist")
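+
+# usage sketch (illustrative only, not part of the original module flow; the
+# "cuda" branch assumes a CUDA-capable torch build and falls back to cpu):
+#
+#   device = _init_device("cuda", 0)    # torch.device("cuda:0") when available
+#   device = _init_device("cpu", None)  # always torch.device("cpu")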
+
+class c_entities:
+
+    c_names = []
+    c_ids = []
+    c_pos = []
+    c_properties = []
+    c_logic = []
+
+    def debug(self):
+        print("c_entities has attributes names, ids, positions, properties, and logic. __init__ takes self, 1d array of names, 1d array of ids, 2d array of positions, nd array of properties, and nd array of logic")
+        return [self.c_names, self.c_ids, self.c_pos, self.c_properties, self.c_logic]
+
+    def __init__(self, names, ids, pos, properties, logic):
+        self.c_names = names
+        self.c_ids = ids
+        self.c_pos = pos
+        self.c_properties = properties
+        self.c_logic = logic
+        return None
+
+    def append(self, n_name, n_id, n_pos, n_property, n_logic):
+        self.c_names.append(n_name)
+        self.c_ids.append(n_id)
+        self.c_pos.append(n_pos)
+        self.c_properties.append(n_property)
+        self.c_logic.append(n_logic)
+        return None
+
+    def edit(self, search, n_name, n_id, n_pos, n_property, n_logic):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+        if n_name != "null":
+            self.c_names[position] = n_name
+
+        if n_id != "null":
+            self.c_ids[position] = n_id
+
+        if n_pos != "null":
+            self.c_pos[position] = n_pos
+
+        if n_property != "null":
+            self.c_properties[position] = n_property
+
+        if n_logic != "null":
+            self.c_logic[position] = n_logic
+
+        return None
+
+    def search(self, search):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        return [self.c_names[position], self.c_ids[position], self.c_pos[position], self.c_properties[position], self.c_logic[position]]
+
+    def regurgitate(self):
+        return [self.c_names, self.c_ids, self.c_pos, self.c_properties, self.c_logic]
+
+class nc_entities:
+
+    c_names = []
+    c_ids = []
+    c_pos = []
+    c_properties = []
+    c_effects = []
+
+    def debug(self):
+        print("nc_entities (non-controllable entities) has attributes names, ids, positions, properties, and effects. __init__ takes self, 1d array of names, 1d array of ids, 2d array of positions, 2d array of properties, and 2d array of effects.")
+        return [self.c_names, self.c_ids, self.c_pos, self.c_properties, self.c_effects]
+
+    def __init__(self, names, ids, pos, properties, effects):
+        self.c_names = names
+        self.c_ids = ids
+        self.c_pos = pos
+        self.c_properties = properties
+        self.c_effects = effects
+        return None
+
+    def append(self, n_name, n_id, n_pos, n_property, n_effect):
+        self.c_names.append(n_name)
+        self.c_ids.append(n_id)
+        self.c_pos.append(n_pos)
+        self.c_properties.append(n_property)
+        self.c_effects.append(n_effect)
+
+        return None
+
+    def edit(self, search, n_name, n_id, n_pos, n_property, n_effect):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+        if n_name != "null":
+            self.c_names[position] = n_name
+
+        if n_id != "null":
+            self.c_ids[position] = n_id
+
+        if n_pos != "null":
+            self.c_pos[position] = n_pos
+
+        if n_property != "null":
+            self.c_properties[position] = n_property
+
+        if n_effect != "null":
+            self.c_effects[position] = n_effect
+
+        return None
+
+    def search(self, search):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        return [self.c_names[position], self.c_ids[position], self.c_pos[position], self.c_properties[position], self.c_effects[position]]
+
+    def regurgitate(self):
+
+        return [self.c_names, self.c_ids, self.c_pos, self.c_properties, self.c_effects]
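+
+# usage sketch for the entity containers (hypothetical data, for illustration;
+# the string "null" is the sentinel meaning "leave this field unchanged"):
+#
+#   robots = c_entities(["r1"], [0], [[0, 0]], [{"speed": 1}], [None])
+#   robots.append("r2", 1, [3, 4], {"speed": 2}, None)
+#   robots.edit(1, "null", "null", [5, 5], "null", "null")
+#   robots.search(1)  # -> ["r2", 1, [5, 5], {"speed": 2}, None]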
+
+class obstacles:
+
+    c_names = []
+    c_ids = []
+    c_perim = []
+    c_effects = []
+
+    def debug(self):
+        print("obstacles has attributes names, ids, perimeters, and effects. __init__ takes self, 1d array of names, 1d array of ids, 3d array of perimeters, 2d array of effects.")
+        return [self.c_names, self.c_ids, self.c_perim, self.c_effects]
+
+    def __init__(self, names, ids, perims, effects):
+        self.c_names = names
+        self.c_ids = ids
+        self.c_perim = perims
+        self.c_effects = effects
+        return None
+
+    def append(self, n_name, n_id, n_perim, n_effect):
+        self.c_names.append(n_name)
+        self.c_ids.append(n_id)
+        self.c_perim.append(n_perim)
+        self.c_effects.append(n_effect)
+        return None
+
+    def edit(self, search, n_name, n_id, n_perim, n_effect):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        if n_name != "null":
+            self.c_names[position] = n_name
+
+        if n_id != "null":
+            self.c_ids[position] = n_id
+
+        if n_perim != "null":
+            self.c_perim[position] = n_perim
+
+        if n_effect != "null":
+            self.c_effects[position] = n_effect
+
+        return None
+
+    def search(self, search):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        return [self.c_names[position], self.c_ids[position], self.c_perim[position], self.c_effects[position]]
+
+    def regurgitate(self):
+        return [self.c_names, self.c_ids, self.c_perim, self.c_effects]
+
+class objectives:
+
+    c_names = []
+    c_ids = []
+    c_pos = []
+    c_effects = []
+
+    def debug(self):
+        print("objectives has attributes names, ids, positions, and effects. __init__ takes self, 1d array of names, 1d array of ids, 2d array of position, 1d array of effects.")
+        return [self.c_names, self.c_ids, self.c_pos, self.c_effects]
+
+    def __init__(self, names, ids, pos, effects):
+        self.c_names = names
+        self.c_ids = ids
+        self.c_pos = pos
+        self.c_effects = effects
+        return None
+
+    def append(self, n_name, n_id, n_pos, n_effect):
+        self.c_names.append(n_name)
+        self.c_ids.append(n_id)
+        self.c_pos.append(n_pos)
+        self.c_effects.append(n_effect)
+        return None
+
+    def edit(self, search, n_name, n_id, n_pos, n_effect):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        if n_name != "null":
+            self.c_names[position] = n_name
+
+        if n_id != "null":
+            self.c_ids[position] = n_id
+
+        if n_pos != "null":
+            self.c_pos[position] = n_pos
+
+        if n_effect != "null":
+            self.c_effects[position] = n_effect
+
+        return None
+
+    def search(self, search):
+        position = 0
+        for i in range(0, len(self.c_ids), 1):
+            if self.c_ids[i] == search:
+                position = i
+
+        return [self.c_names[position], self.c_ids[position], self.c_pos[position], self.c_effects[position]]
+
+    def regurgitate(self):
+        return [self.c_names, self.c_ids, self.c_pos, self.c_effects]
+
+def load_csv(filepath):
+    with open(filepath, newline = '') as csvfile:
+        file_array = list(csv.reader(csvfile))
+    return file_array
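+
+# usage sketch (assumes a plain comma-separated file with no header row):
+#
+#   data = load_csv("data/data.csv")  # 2d list of strings, one row per line
+#   basic_stats(data, "column", 0)    # (mean, median, mode, stdev, variance) of column 0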
+
+def basic_stats(data, method, arg): # data=array, method = ['1d':1d_basic_stats, 'column':c_basic_stats, 'row':r_basic_stats], arg is the row/column number for the 'column' and 'row' methods
+
+    if method == 'debug':
+        return "basic_stats requires 3 args: data, method, arg; where data is the data to be analyzed, method is '1d'/0, 'column'/1, or 'row'/2 depending on the type of analysis ('column' and 'row' apply only to 2d arrays; for 1d arrays use '1d'), and arg is the row/column number for the 'column' and 'row' methods; function returns: [mean, median, mode, stdev, variance]"
+
+    if method == "1d" or method == 0:
+
+        data_t = []
+
+        for i in range(0, len(data), 1):
+            data_t.append(float(data[i]))
+
+        _mean = mean(data_t)
+        _median = median(data_t)
+        try:
+            _mode = mode(data_t)
+        except:
+            _mode = None
+        try:
+            _stdev = stdev(data_t)
+        except:
+            _stdev = None
+        try:
+            _variance = variance(data_t)
+        except:
+            _variance = None
+
+        return _mean, _median, _mode, _stdev, _variance
+
+    elif method == "column" or method == 1:
+
+        c_data = []
+
+        for i in data:
+            try:
+                c_data.append(float(i[arg]))
+            except:
+                pass
+
+        _mean = mean(c_data)
+        _median = median(c_data)
+        try:
+            _mode = mode(c_data)
+        except:
+            _mode = None
+        try:
+            _stdev = stdev(c_data)
+        except:
+            _stdev = None
+        try:
+            _variance = variance(c_data)
+        except:
+            _variance = None
+
+        return _mean, _median, _mode, _stdev, _variance
+
+    elif method == "row" or method == 2:
+
+        r_data = []
+
+        for i in range(len(data[arg])):
+            r_data.append(float(data[arg][i]))
+
+        _mean = mean(r_data)
+        _median = median(r_data)
+        try:
+            _mode = mode(r_data)
+        except:
+            _mode = None
+        try:
+            _stdev = stdev(r_data)
+        except:
+            _stdev = None
+        try:
+            _variance = variance(r_data)
+        except:
+            _variance = None
+
+        return _mean, _median, _mode, _stdev, _variance
+
+    else:
+        raise error("method error")
+
+def z_score(point, mean, stdev): #returns z score with inputs of point, mean and standard deviation of spread
+    score = (point - mean)/stdev
+    return score
+
+def z_normalize(x, y, mode): #mode is either 'x' or 'y' or 'both' depending on the variable(s) to be normalized
+
+    x_norm = []
+    y_norm = []
+
+    if mode == 'x':
+        _mean, _median, _mode, _stdev, _variance = basic_stats(x, "1d", 0)
+
+        for i in range(0, len(x), 1):
+            x_norm.append(z_score(x[i], _mean, _stdev))
+
+        return x_norm, y
+
+    elif mode == 'y':
+        _mean, _median, _mode, _stdev, _variance = basic_stats(y, "1d", 0)
+
+        for i in range(0, len(y), 1):
+            y_norm.append(z_score(y[i], _mean, _stdev))
+
+        return x, y_norm
+
+    elif mode == 'both':
+        _mean, _median, _mode, _stdev, _variance = basic_stats(x, "1d", 0)
+
+        for i in range(0, len(x), 1):
+            x_norm.append(z_score(x[i], _mean, _stdev))
+
+        _mean, _median, _mode, _stdev, _variance = basic_stats(y, "1d", 0)
+
+        for i in range(0, len(y), 1):
+            y_norm.append(z_score(y[i], _mean, _stdev))
+
+        return x_norm, y_norm
+
+    else:
+
+        raise error("mode error")
+
+def stdev_z_split(mean, stdev, delta, low_bound, high_bound): #returns normal distribution densities evaluated from low_bound to high_bound in steps of delta, given mean and standard deviation
+
+    z_split = []
+    i = low_bound
+
+    while True:
+        z_split.append(float((1 / (stdev * math.sqrt(2 * math.pi))) * math.e ** (-0.5 * (((i - mean) / stdev) ** 2))))
+        i = i + delta
+        if i > high_bound:
+            break
+
+    return z_split
+
+def histo_analysis(hist_data, delta, low_bound, high_bound):
+
+    if hist_data == 'debug':
+        return ('returns list of predicted values based on historical data; input delta for delta step in z-score and lower and higher bounds in number of standard deviations')
+
+    derivative = []
+
+    for i in range(1, len(hist_data), 1):
+        try:
+            derivative.append(float(hist_data[i]) - float(hist_data[i - 1]))
+        except:
+            pass
+
+    derivative_sorted = sorted(derivative, key=int)
+    mean_derivative = basic_stats(derivative_sorted, "1d", 0)[0]
+    stdev_derivative = basic_stats(derivative_sorted, "1d", 0)[3]
+
+    predictions = []
+    pred_change = 0
+
+    i = low_bound
+
+    while True:
+        if i > high_bound:
+            break
+
+        try:
+            pred_change = mean_derivative + i * stdev_derivative
+        except:
+            pred_change = mean_derivative
+
+        predictions.append(float(hist_data[-1]) + pred_change)
+
+        i = i + delta
+
+    return predictions
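+
+# usage sketch: predict the next value of a series from the distribution of
+# its first differences, scanning z-scores from -1 to +1 in steps of 0.5
+# (illustrative numbers only):
+#
+#   scores = [10.0, 12.0, 11.0, 14.0]
+#   histo_analysis(scores, 0.5, -1, 1)  # list of predicted next values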
+
+def poly_regression(x, y, power):
+
+    if x == "null": #if x is 'null', then x will be filled with integer points between 1 and the size of y
+        x = []
+
+        for i in range(len(y)):
+            x.append(i + 1)
+
+    reg_eq = np.polyfit(x, y, deg = power)
+    eq_str = ""
+
+    for i in range(0, len(reg_eq), 1):
+        if i < len(reg_eq) - 1:
+            eq_str = eq_str + str(reg_eq[i]) + "*(z**" + str(len(reg_eq) - i - 1) + ")+"
+        else:
+            eq_str = eq_str + str(reg_eq[i]) + "*(z**" + str(len(reg_eq) - i - 1) + ")"
+
+    vals = []
+
+    for i in range(0, len(x), 1):
+        z = x[i]
+
+        try:
+            exec("vals.append(" + eq_str + ")")
+        except:
+            pass
+
+    _rms = rms(vals, y)
+    r2_d2 = r_squared(vals, y)
+
+    return [eq_str, _rms, r2_d2]
+
+def log_regression(x, y, base):
+
+    x_fit = []
+
+    for i in range(len(x)):
+        try:
+            x_fit.append(np.log(x[i]) / np.log(base)) #change of base for logs
+        except:
+            pass
+
+    reg_eq = np.polyfit(x_fit, y, 1) # y = reg_eq[0] * log(x, base) + reg_eq[1]
+    eq_str = str(reg_eq[0]) + "* (np.log(z) / np.log(" + str(base) + "))+" + str(reg_eq[1])
+    vals = []
+
+    for i in range(len(x)):
+        z = x[i]
+
+        try:
+            exec("vals.append(" + eq_str + ")")
+        except:
+            pass
+
+    _rms = rms(vals, y)
+    r2_d2 = r_squared(vals, y)
+
+    return eq_str, _rms, r2_d2
+
+def exp_regression(x, y, base):
+
+    y_fit = []
+
+    for i in range(len(y)):
+        try:
+            y_fit.append(np.log(y[i]) / np.log(base)) #change of base for logs
+        except:
+            pass
+
+    reg_eq = np.polyfit(x, y_fit, 1, w=np.sqrt(y_fit)) # y = base ^ (reg_eq[0] * x) * base ^ (reg_eq[1])
+    eq_str = "(" + str(base) + "**(" + str(reg_eq[0]) + "*z))*(" + str(base) + "**(" + str(reg_eq[1]) + "))"
+    vals = []
+
+    for i in range(len(x)):
+        z = x[i]
+
+        try:
+            exec("vals.append(" + eq_str + ")")
+        except:
+            pass
+
+    _rms = rms(vals, y)
+    r2_d2 = r_squared(vals, y)
+
+    return eq_str, _rms, r2_d2
+
+def tanh_regression(x, y):
+
+    def tanh(x, a, b, c, d):
+
+        return a * np.tanh(b * (x - c)) + d
+
+    reg_eq = np.float64(curve_fit(tanh, np.array(x), np.array(y))[0]).tolist()
+    eq_str = str(reg_eq[0]) + " * np.tanh(" + str(reg_eq[1]) + "*(z - " + str(reg_eq[2]) + ")) + " + str(reg_eq[3])
+    vals = []
+
+    for i in range(len(x)):
+        z = x[i]
+        try:
+            exec("vals.append(" + eq_str + ")")
+        except:
+            pass
+
+    _rms = rms(vals, y)
+    r2_d2 = r_squared(vals, y)
+
+    return eq_str, _rms, r2_d2
+
+def r_squared(predictions, targets): # assumes equal size inputs
+
+    return metrics.r2_score(np.array(targets), np.array(predictions))
+
+def rms(predictions, targets): # assumes equal size inputs
+
+    _sum = 0
+
+    for i in range(0, len(targets), 1):
+        _sum += (targets[i] - predictions[i]) ** 2
+
+    return float(math.sqrt(_sum / len(targets)))
+
+def calc_overfit(equation, rms_train, r2_train, x_test, y_test):
+
+    #performance overfit = performance(train) - performance(test) where performance is r^2
+    #error overfit = error(train) - error(test) where error is rms; biased towards smaller values
+    #currently only the performance overfit is returned
+
+    vals = []
+
+    for i in range(0, len(x_test), 1):
+
+        z = x_test[i]
+
+        exec("vals.append(" + equation + ")")
+
+    r2_test = r_squared(vals, y_test)
+    rms_test = rms(vals, y_test)
+
+    return r2_train - r2_test
+
+def strip_data(data, mode):
+
+    if mode == "adam": #x is the row number, y are the data
+        pass
+
+    elif mode == "eve": #x are the data, y is the column number
+        pass
+
+    else:
+        raise error("mode error")
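+
+# usage sketch for the regression helpers (illustrative data; each returns an
+# equation string together with its rms error and r^2, and the equation string
+# uses "z" as its free variable so it can be re-evaluated later):
+#
+#   eq, _rms, r2 = poly_regression([1, 2, 3, 4], [1, 4, 9, 16], 2)
+#   z = 5
+#   eval(eq)  # ~25.0 for this quadratic fit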
+
+def optimize_regression(x, y, _range, resolution): #_range is the highest polynomial power tried; resolution is the inverse of the step size for the log/exp bases, which run from 1/resolution to 100
+#usage note: for demonstration purposes only; performance is poor
+    if type(resolution) != int:
+        raise error("resolution must be int")
+
+    x_train = list(x)
+    y_train = []
+
+    for i in range(len(y)):
+        y_train.append(float(y[i]))
+
+    x_test = []
+    y_test = []
+
+    for i in range(0, math.floor(len(x) * 0.5), 1):
+        index = random.randint(0, len(x_train) - 1)
+
+        x_test.append(x_train[index])
+        y_test.append(y_train[index])
+
+        x_train.pop(index)
+        y_train.pop(index)
+
+    #print(x_train, x_test)
+    #print(y_train, y_test)
+
+    eqs = []
+    rmss = []
+    r2s = []
+
+    for i in range(0, _range + 1, 1):
+        try:
+            x, y, z = poly_regression(x_train, y_train, i)
+            eqs.append(x)
+            rmss.append(y)
+            r2s.append(z)
+        except:
+            pass
+
+    for i in range(1, 100 * resolution + 1):
+        try:
+            x, y, z = exp_regression(x_train, y_train, float(i / resolution))
+            eqs.append(x)
+            rmss.append(y)
+            r2s.append(z)
+        except:
+            pass
+
+    for i in range(1, 100 * resolution + 1):
+        try:
+            x, y, z = log_regression(x_train, y_train, float(i / resolution))
+            eqs.append(x)
+            rmss.append(y)
+            r2s.append(z)
+        except:
+            pass
+
+    try:
+        x, y, z = tanh_regression(x_train, y_train)
+
+        eqs.append(x)
+        rmss.append(y)
+        r2s.append(z)
+    except:
+        pass
+
+    for i in range(0, len(eqs), 1): #marks all equations where r2 = 1, as they overfit the data 95% of the time
+        if r2s[i] == 1:
+            eqs[i] = ""
+            rmss[i] = ""
+            r2s[i] = ""
+
+    while True: #removes all equations marked for removal
+        try:
+            eqs.remove('')
+            rmss.remove('')
+            r2s.remove('')
+        except:
+            break
+
+    overfit = []
+
+    for i in range(0, len(eqs), 1):
+
+        overfit.append(calc_overfit(eqs[i], rmss[i], r2s[i], x_test, y_test))
+
+    return eqs, rmss, r2s, overfit
+
+def select_best_regression(eqs, rmss, r2s, overfit, selector):
+
+    b_eq = ""
+    b_rms = 0
+    b_r2 = 0
+    b_overfit = 0
+
+    ind = 0
+
+    if selector == "min_overfit":
+
+        ind = np.argmin(overfit)
+
+        b_eq = eqs[ind]
+        b_rms = rmss[ind]
+        b_r2 = r2s[ind]
+        b_overfit = overfit[ind]
+
+    if selector == "max_r2s":
+
+        ind = np.argmax(r2s)
+        b_eq = eqs[ind]
+        b_rms = rmss[ind]
+        b_r2 = r2s[ind]
+        b_overfit = overfit[ind]
+
+    return b_eq, b_rms, b_r2, b_overfit
+
+def p_value(x, y): #takes 2 1d arrays
+
+    return stats.ttest_ind(x, y)[1]
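+
+# usage sketch: brute-force a family of fits, then pick the least-overfit one
+# (illustrative; higher resolution tries more log/exp bases but runs longer):
+#
+#   eqs, rmss, r2s, overfit = optimize_regression(x, y, _range = 5, resolution = 10)
+#   b_eq, b_rms, b_r2, b_overfit = select_best_regression(eqs, rmss, r2s, overfit, "min_overfit")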
+
+def basic_analysis(data): #assumes that rows are the independent variable and columns are the dependent. also assumes that time flows from lowest column to highest column.
+
+    row = len(data)
+    column = []
+
+    for i in range(0, row, 1):
+        column.append(len(data[i]))
+
+    column_max = max(column)
+    row_b_stats = []
+    row_histo = []
+
+    for i in range(0, row, 1):
+        row_b_stats.append(basic_stats(data, "row", i))
+        row_histo.append(histo_analysis(data[i], 0.67449, -0.67449, 0.67449))
+
+    column_b_stats = []
+
+    for i in range(0, column_max, 1):
+        column_b_stats.append(basic_stats(data, "column", i))
+
+    return [row_b_stats, column_b_stats, row_histo]
+
+
+def benchmark(x, y):
+
+    start_g = time.time()
+    generate_data("data/data.csv", x, y, -10, 10)
+    end_g = time.time()
+
+    start_a = time.time()
+    basic_analysis(load_csv("data/data.csv"))
+    end_a = time.time()
+
+    return [(end_g - start_g), (end_a - start_a)]
+
+def generate_data(filename, x, y, low, high):
+
+    file = open(filename, "w")
+
+    for i in range(0, y, 1):
+        temp = ""
+
+        for j in range(0, x - 1, 1):
+            temp = str(random.uniform(low, high)) + "," + temp
+
+        temp = temp + str(random.uniform(low, high))
+        file.write(temp + "\n")
+
+    file.close()
+
+class StatisticsError(ValueError):
+    pass
+
+def _sum(data, start=0):
+    count = 0
+    n, d = _exact_ratio(start)
+    partials = {d: n}
+    partials_get = partials.get
+    T = _coerce(int, type(start))
+    for typ, values in groupby(data, type):
+        T = _coerce(T, typ) # or raise TypeError
+        for n, d in map(_exact_ratio, values):
+            count += 1
+            partials[d] = partials_get(d, 0) + n
+    if None in partials:
+        # a None denominator marks a NAN or INF value
+        total = partials[None]
+        assert not _isfinite(total)
+    else:
+        # sum the exact fractions, smallest denominators first
+        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
+    return (T, total, count)
+
+def _isfinite(x):
+    try:
+        return x.is_finite() # Likely a Decimal.
+    except AttributeError:
+        return math.isfinite(x) # Coerces to float first.
+
+def _coerce(T, S):
+    # return the common type for the pair of numeric types T and S
+    assert T is not bool, "initial type T is bool"
+
+    if T is S: return T
+
+    if S is int or S is bool: return T
+    if T is int: return S
+
+    if issubclass(S, T): return S
+    if issubclass(T, S): return T
+
+    if issubclass(T, int): return S
+    if issubclass(S, int): return T
+
+    if issubclass(T, Fraction) and issubclass(S, float):
+        return S
+    if issubclass(T, float) and issubclass(S, Fraction):
+        return T
+
+    msg = "don't know how to coerce %s and %s"
+    raise TypeError(msg % (T.__name__, S.__name__))
+
+def _exact_ratio(x):
+    # return the value of x as an exact (numerator, denominator) pair
+    try:
+        # optimize the common cases of float and Decimal
+        if type(x) is float or type(x) is Decimal:
+            return x.as_integer_ratio()
+        try:
+            # x may be a Fraction or another duck-typed rational
+            return (x.numerator, x.denominator)
+        except AttributeError:
+            try:
+                # x may be a float or Decimal subclass
+                return x.as_integer_ratio()
+            except AttributeError:
+                # fall through to the TypeError below
+                pass
+    except (OverflowError, ValueError):
+        # INF or NAN has no exact ratio; signal it with a None denominator
+        assert not _isfinite(x)
+        return (x, None)
+    msg = "can't convert type '{}' to numerator/denominator"
+    raise TypeError(msg.format(type(x).__name__))
+
+def _convert(value, T):
+    # convert value to the given numeric type T
+    if type(value) is T:
+        # no conversion needed
+        return value
+    if issubclass(T, int) and value.denominator != 1:
+        T = float
+    try:
+
+        return T(value)
+    except TypeError:
+        if issubclass(T, Decimal):
+            return T(value.numerator)/T(value.denominator)
+        else:
+            raise
+
+def _counts(data):
+    # return the elements of data that share the highest frequency
+    table = collections.Counter(iter(data)).most_common()
+    if not table:
+        return table
+
+    maxfreq = table[0][1]
+    for i in range(1, len(table)):
+        if table[i][1] != maxfreq:
+            table = table[:i]
+            break
+    return table
+
+
+def _find_lteq(a, x):
+    # locate the leftmost value exactly equal to x
+    i = bisect_left(a, x)
+    if i != len(a) and a[i] == x:
+        return i
+    raise ValueError
+
+
+def _find_rteq(a, l, x):
+    # locate the rightmost value exactly equal to x
+    i = bisect_right(a, x, lo=l)
+    if i != (len(a) + 1) and a[i - 1] == x:
+        return i - 1
+    raise ValueError
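+
+# note (hedged: these helpers mirror the stdlib statistics internals they were
+# copied from): _sum accumulates exact integer ratios per denominator, e.g.
+# _sum([0.1] * 10) returns (float, total, 10) with total an exact Fraction,
+# so mean() only rounds once, at the final conversion back to float.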
+
+def _fail_neg(values, errmsg='negative value'):
+    # iterate over values, raising StatisticsError if any are negative
+    for x in values:
+        if x < 0:
+            raise StatisticsError(errmsg)
+        yield x
+
+def mean(data):
+
+    if iter(data) is data:
+        data = list(data)
+    n = len(data)
+    if n < 1:
+        raise StatisticsError('mean requires at least one data point')
+    T, total, count = _sum(data)
+    assert count == n
+    return _convert(total/n, T)
+
+def median(data):
+
+    data = sorted(data)
+    n = len(data)
+    if n == 0:
+        raise StatisticsError("no median for empty data")
+    if n % 2 == 1:
+        return data[n//2]
+    else:
+        i = n//2
+        return (data[i - 1] + data[i])/2
+
+def mode(data):
+
+    table = _counts(data)
+    if len(table) == 1:
+        return table[0][0]
+    elif table:
+        raise StatisticsError(
+            'no unique mode; found %d equally common values' % len(table)
+            )
+    else:
+        raise StatisticsError('no mode for empty data')
+
+def _ss(data, c=None):
+    # sum of square deviations from c (defaults to the mean), with a
+    # correction term that reduces accumulated rounding error
+    if c is None:
+        c = mean(data)
+    T, total, count = _sum((x-c)**2 for x in data)
+
+    U, total2, count2 = _sum((x-c) for x in data)
+    assert T == U and count == count2
+    total -= total2**2/len(data)
+    assert not total < 0, 'negative sum of square deviations: %f' % total
+    return (T, total)
+
+def variance(data, xbar=None):
+
+    if iter(data) is data:
+        data = list(data)
+    n = len(data)
+    if n < 2:
+        raise StatisticsError('variance requires at least two data points')
+    T, ss = _ss(data, xbar)
+    return _convert(ss/(n - 1), T)
+
+def stdev(data, xbar=None):
+
+    var = variance(data, xbar)
+    try:
+        return var.sqrt()
+    except AttributeError:
+        return math.sqrt(var)
\ No newline at end of file
diff --git a/data analysis/__pycache__/analysis.cpython-36.pyc b/data analysis/__pycache__/analysis.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb37e54948766e3d358b9a78e77e89b1e0060f2a
GIT binary patch
literal 25273
[25273 bytes of base85-encoded binary data omitted: the compiled bytecode for the module above]