# Titan Robotics Team 2022: Data Analysis Module
# Written by Arthur Lu, Jacob Levine, and Dev Singh

# Notes:
# this should be imported as a python module using 'from tra_analysis import analysis'
# this should be included in the local directory or environment variable
# this module has been optimized for multithreaded computing
# current benchmark of optimization: 1.33 times faster
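
# Example usage (a minimal sketch with made-up data; assumes the tra_analysis
# package is installed so the import line above works):
#     from tra_analysis import analysis
#     stats = analysis.basic_stats([1, 2, 3, 4, 5])
#     fits = analysis.regression([0, 1, 2, 3], [1, 3, 5, 7], ['lin'])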

# setup:

__version__ = "2.2.3"

# changelog should be viewed using print(analysis.__changelog__)
__changelog__ = """changelog:
    2.2.3:
        - fixed spelling of RandomForest
        - made n_neighbors required for KNN
        - made n_classifiers required for SVM
    2.2.2:
        - fixed 2.2.1 changelog entry
        - changed regression to return dictionary
    2.2.1:
        - changed all references to parent package analysis to tra_analysis
    2.2.0:
        - added Sort class
        - added several array sorting functions to Sort class including:
            - quick sort
            - merge sort
            - intro(spective) sort
            - heap sort
            - insertion sort
            - tim sort
            - selection sort
            - bubble sort
            - cycle sort
            - cocktail sort
        - tested all sorting algorithms with both lists and numpy arrays
        - deprecated sort function from Array class
        - added warnings as an import
    2.1.4:
        - added sort and search functions to Array class
    2.1.3:
        - changed output of basic_stats and histo_analysis to dictionaries
        - fixed __all__
    2.1.2:
        - renamed ArrayTest class to Array
    2.1.1:
        - added add, mul, neg, and inv functions to ArrayTest class
        - added normalize function to ArrayTest class
        - added dot and cross functions to ArrayTest class
    2.1.0:
        - added ArrayTest class
        - added elementwise mean, median, standard deviation, variance, min, max functions to ArrayTest class
        - added elementwise_stats to ArrayTest which encapsulates elementwise statistics
        - appended to __all__ to reflect changes
    2.0.6:
        - renamed func functions in regression to lin, log, exp, and sig
    2.0.5:
        - moved random_forrest_regressor and random_forrest_classifier to RandomForrest class
        - renamed Metrics to Metric
        - renamed RegressionMetrics to RegressionMetric
        - renamed ClassificationMetrics to ClassificationMetric
        - renamed CorrelationTests to CorrelationTest
        - renamed StatisticalTests to StatisticalTest
        - reflected refactoring to all mentions of above classes/functions
    2.0.4:
        - fixed __all__ to reflect the correct functions and classes
        - fixed CorrelationTests and StatisticalTests class functions to require self invocation
        - added missing math import
        - fixed KNN class functions to require self invocation
        - fixed Metrics class functions to require self invocation
        - various spelling fixes in CorrelationTests and StatisticalTests
    2.0.3:
        - bug fixes with CorrelationTests and StatisticalTests
        - moved glicko2 and trueskill to the metrics subpackage
        - moved elo to a new metrics subpackage
    2.0.2:
        - fixed docs
    2.0.1:
        - fixed docs
    2.0.0:
        - cleaned up wild card imports with scipy and sklearn
        - added CorrelationTests class
        - added StatisticalTests class
        - added several correlation tests to CorrelationTests
        - added several statistical tests to StatisticalTests
    1.13.9:
        - moved elo, glicko2, trueskill functions under class Metrics
    1.13.8:
        - moved Glicko2 to a separate package
    1.13.7:
        - fixed bug with trueskill
    1.13.6:
        - cleaned up imports
    1.13.5:
        - cleaned up package
    1.13.4:
        - small fixes to regression to improve performance
    1.13.3:
        - filtered nans from regression
    1.13.2:
        - removed torch requirement, and moved Regression back to regression.py
    1.13.1:
        - bug fix with linear regression not returning a proper value
        - cleaned up regression
        - fixed bug with polynomial regressions
    1.13.0:
        - fixed all regressions to now properly work
    1.12.6:
        - fixed bug with a division by zero in histo_analysis
    1.12.5:
        - fixed numba issues by removing numba from elo, glicko2 and trueskill
    1.12.4:
        - renamed gliko to glicko
    1.12.3:
        - removed deprecated code
    1.12.2:
        - removed team first time trueskill instantiation in favor of integration in superscript.py
    1.12.1:
        - improved readability of regression outputs by stripping tensor data
        - used map with lambda to achieve the improved readability
        - lost numba jit support with regression, and generated_jit hangs at execution
        - TODO: reimplement correct numba integration in regression
    1.12.0:
        - temporarily fixed polynomial regressions by using sklearn's PolynomialFeatures
    1.11.010:
        - alphabetically ordered import lists
    1.11.9:
        - bug fixes
    1.11.8:
        - bug fixes
    1.11.7:
        - bug fixes
    1.11.6:
        - tested min and max
        - bug fixes
    1.11.5:
        - added min and max in basic_stats
    1.11.4:
        - bug fixes
    1.11.3:
        - bug fixes
    1.11.2:
        - consolidated metrics
        - fixed __all__
    1.11.1:
        - added test/train split to RandomForestClassifier and RandomForestRegressor
    1.11.0:
        - added RandomForestClassifier and RandomForestRegressor
        - note: untested
    1.10.0:
        - added numba.jit to remaining functions
    1.9.2:
        - kernelized PCA and KNN
    1.9.1:
        - fixed bugs with SVM and NaiveBayes
    1.9.0:
        - added SVM class, subclasses, and functions
        - note: untested
    1.8.0:
        - added NaiveBayes classification engine
        - note: untested
    1.7.0:
        - added knn()
        - added confusion matrix to decisiontree()
    1.6.2:
        - changed layout of __changelog to be vscode friendly
    1.6.1:
        - added additional hyperparameters to decisiontree()
    1.6.0:
        - fixed __version__
        - fixed __all__ order
        - added decisiontree()
    1.5.3:
        - added pca
    1.5.2:
        - reduced import list
        - added kmeans clustering engine
    1.5.1:
        - simplified regression by using .to(device)
    1.5.0:
        - added polynomial regression to regression(); untested
    1.4.0:
        - added trueskill()
    1.3.2:
        - renamed regression class to Regression, regression_engine() to regression, gliko2_engine class to Gliko2
    1.3.1:
        - changed glicko2() to return tuple instead of array
    1.3.0:
        - added glicko2_engine class and glicko()
        - verified glicko2() accuracy
    1.2.3:
        - fixed elo()
    1.2.2:
        - added elo()
        - elo() has bugs to be fixed
    1.2.1:
        - readded regression import
    1.2.0:
        - integrated regression.py as regression class
        - removed regression import
        - fixed metadata for regression class
        - fixed metadata for analysis class
    1.1.1:
        - regression_engine() bug fixes, now actually regresses
    1.1.0:
        - added regression_engine()
        - added all regressions except polynomial
    1.0.7:
        - updated _init_device()
    1.0.6:
        - removed useless try statements
    1.0.5:
        - removed impossible outcomes
    1.0.4:
        - added performance metrics (r^2, mse, rms)
    1.0.3:
        - resolved nopython mode for mean, median, stdev, variance
    1.0.2:
        - snapped (removed) majority of unneeded imports
        - forced object mode (bad) on all jit
        - TODO: stop numba complaining about not being able to compile in nopython mode
    1.0.1:
        - removed from sklearn import * to resolve unneeded wildcard imports
    1.0.0:
        - removed c_entities, nc_entities, obstacles, objectives from __all__
        - applied numba.jit to all functions
        - deprecated and removed stdev_z_split
        - cleaned up histo_analysis to include numpy and numba.jit optimizations
        - deprecated and removed all regression functions in favor of future pytorch optimizer
        - deprecated and removed all nonessential functions (basic_analysis, benchmark, strip_data)
        - optimized z_normalize using sklearn.preprocessing.normalize
        - TODO: implement kernel/function based pytorch regression optimizer
    0.9.0:
        - refactored
        - numpyed everything
        - removed stats in favor of numpy functions
    0.8.5:
        - minor fixes
    0.8.4:
        - removed a few unused dependencies
    0.8.3:
        - added p_value function
    0.8.2:
        - updated __all__ correctly to contain changes made in v 0.8.0 and v 0.8.1
    0.8.1:
        - refactors
        - bugfixes
    0.8.0:
        - deprecated histo_analysis_old
        - deprecated debug
        - altered basic_analysis to take array data instead of filepath
        - refactor
        - optimization
    0.7.2:
        - bug fixes
    0.7.1:
        - bug fixes
    0.7.0:
        - added tanh_regression (logistical regression)
        - bug fixes
    0.6.5:
        - added z_normalize function to normalize dataset
        - bug fixes
    0.6.4:
        - bug fixes
    0.6.3:
        - bug fixes
    0.6.2:
        - bug fixes
    0.6.1:
        - corrected __all__ to contain all of the functions
    0.6.0:
        - added calc_overfit, which calculates two measures of overfit, error and performance
        - added calculating overfit to optimize_regression
    0.5.0:
        - added optimize_regression function, which is a sample function to find the optimal regressions
        - optimize_regression function filters out some overfit functions (functions with r^2 = 1)
        - planned addition: overfit detection in the optimize_regression function
    0.4.2:
        - added __changelog__
        - updated debug function with log and exponential regressions
    0.4.1:
        - added log regressions
        - added exponential regressions
        - added log_regression and exp_regression to __all__
    0.3.8:
        - added debug function to further consolidate functions
    0.3.7:
        - added builtin benchmark function
        - added builtin random (linear) data generation function
        - added device initialization (_init_device)
    0.3.6:
        - reorganized the imports list to be in alphabetical order
        - added search and regurgitate functions to c_entities, nc_entities, obstacles, objectives
    0.3.5:
        - major bug fixes
        - updated historical analysis
        - deprecated old historical analysis
    0.3.4:
        - added __version__, __author__, __all__
        - added polynomial regression
        - added root mean squared function
        - added r squared function
    0.3.3:
        - bug fixes
        - added c_entities
    0.3.2:
        - bug fixes
        - added nc_entities, obstacles, objectives
        - consolidated statistics.py to analysis.py
    0.3.1:
        - compiled 1d, column, and row basic stats into basic stats function
    0.3.0:
        - added historical analysis function
    0.2.x:
        - added z score test
    0.1.x:
        - major bug fixes
    0.0.x:
        - added loading csv
        - added 1d, column, row basic stats
"""

__author__ = (
    "Arthur Lu <learthurgo@gmail.com>",
    "Jacob Levine <jlevine@imsa.edu>",
)

__all__ = [
    'load_csv',
    'basic_stats',
    'z_score',
    'z_normalize',
    'histo_analysis',
    'regression',
    'Metric',
    'RegressionMetric',
    'ClassificationMetric',
    'kmeans',
    'pca',
    'decisiontree',
    'KNN',
    'NaiveBayes',
    'SVM',
    'RandomForest',
    'CorrelationTest',
    'StatisticalTest',
    'Array',
    # all statistics functions left out due to integration in other functions
]

# now back to your regularly scheduled programming:

# imports (now in alphabetical order! v 0.3.006):

import csv
from tra_analysis.metrics import elo as Elo
from tra_analysis.metrics import glicko2 as Glicko2
import math
import numba
from numba import jit
import numpy as np
import scipy
from scipy import optimize, stats
import sklearn
from sklearn import preprocessing, pipeline, linear_model, metrics, cluster, decomposition, tree, neighbors, naive_bayes, svm, model_selection, ensemble
from tra_analysis.metrics import trueskill as Trueskill
import warnings

class error(ValueError):

    pass

def load_csv(filepath):

    with open(filepath, newline='') as csvfile:
        file_array = np.array(list(csv.reader(csvfile)))

    return file_array

# expects 1d array
@jit(forceobj=True)
def basic_stats(data):

    data_t = np.array(data).astype(float)

    _mean = mean(data_t)
    _median = median(data_t)
    _stdev = stdev(data_t)
    _variance = variance(data_t)
    _min = npmin(data_t)
    _max = npmax(data_t)

    return {"mean": _mean, "median": _median, "standard-deviation": _stdev, "variance": _variance, "minimum": _min, "maximum": _max}

# returns z score with inputs of point, mean and standard deviation of spread
@jit(forceobj=True)
def z_score(point, mean, stdev):

    score = (point - mean) / stdev
    return score

# expects 2d array, normalizes across all axes
@jit(forceobj=True)
def z_normalize(array, *args):

    array = np.array(array)
    for arg in args:
        array = sklearn.preprocessing.normalize(array, axis=arg)

    return array
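
# Example (hypothetical data; a minimal sketch): each positional argument after
# the array is an axis to L2-normalize over, so passing 1 scales each row to unit norm:
#     z_normalize([[3, 4], [6, 8]], 1)
#     # -> [[0.6, 0.8], [0.6, 0.8]]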

@jit(forceobj=True)
# expects 2d array of [x,y]
def histo_analysis(hist_data):

    if len(hist_data[0]) > 2:

        hist_data = np.array(hist_data)
        t = np.diff(hist_data)
        derivative = t[1] / t[0]
        derivative = np.sort(derivative)

        return {"mean": basic_stats(derivative)["mean"], "deviation": basic_stats(derivative)["standard-deviation"]}

    else:

        return None
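
# Example (hypothetical data; a minimal sketch): hist_data is a 2d array of
# [x, y] series, and the result summarizes the discrete derivative dy/dx:
#     histo_analysis([[0, 1, 2, 3], [0, 2, 4, 6]])
#     # -> {"mean": 2.0, "deviation": 0.0}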

def regression(inputs, outputs, args): # inputs, outputs expects N-D array

    X = np.array(inputs)
    y = np.array(outputs)

    regressions = {}

    if 'lin' in args: # formula: ax + b

        try:

            def lin(x, a, b):

                return a * x + b

            popt, pcov = scipy.optimize.curve_fit(lin, X, y)

            coeffs = popt.flatten().tolist()
            regressions["lin"] = (str(coeffs[0]) + "*x+" + str(coeffs[1]))

        except Exception as e:

            pass

    if 'log' in args: # formula: a log (b(x + c)) + d

        try:

            def log(x, a, b, c, d):

                return a * np.log(b * (x + c)) + d

            popt, pcov = scipy.optimize.curve_fit(log, X, y)

            coeffs = popt.flatten().tolist()
            regressions["log"] = (str(coeffs[0]) + "*log(" + str(coeffs[1]) + "*(x+" + str(coeffs[2]) + "))+" + str(coeffs[3]))

        except Exception as e:

            pass

    if 'exp' in args: # formula: a e ^ (b(x + c)) + d

        try:

            def exp(x, a, b, c, d):

                return a * np.exp(b * (x + c)) + d

            popt, pcov = scipy.optimize.curve_fit(exp, X, y)

            coeffs = popt.flatten().tolist()
            regressions["exp"] = (str(coeffs[0]) + "*e^(" + str(coeffs[1]) + "*(x+" + str(coeffs[2]) + "))+" + str(coeffs[3]))

        except Exception as e:

            pass

    if 'ply' in args: # formula: a + bx^1 + cx^2 + dx^3 + ...

        inputs = np.array([inputs])
        outputs = np.array([outputs])

        plys = {}
        limit = len(outputs[0])

        for i in range(2, limit):

            model = sklearn.preprocessing.PolynomialFeatures(degree=i)
            model = sklearn.pipeline.make_pipeline(model, sklearn.linear_model.LinearRegression())
            model = model.fit(np.rot90(inputs), np.rot90(outputs))

            params = model.steps[1][1].intercept_.tolist()
            params = np.append(params, model.steps[1][1].coef_[0].tolist()[1::])
            params = params.flatten().tolist()

            temp = ""
            counter = 0
            for param in params:
                temp += "(" + str(param) + "*x^" + str(counter) + ")"
                counter += 1
            plys["x^" + str(i)] = temp

        regressions["ply"] = plys

    if 'sig' in args: # formula: a tanh (b(x + c)) + d

        try:

            def sig(x, a, b, c, d):

                return a * np.tanh(b * (x + c)) + d

            popt, pcov = scipy.optimize.curve_fit(sig, X, y)

            coeffs = popt.flatten().tolist()
            regressions["sig"] = (str(coeffs[0]) + "*tanh(" + str(coeffs[1]) + "*(x+" + str(coeffs[2]) + "))+" + str(coeffs[3]))

        except Exception as e:

            pass

    return regressions
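
# Example (hypothetical data; a minimal sketch): args selects which fits to try,
# and each successful fit is returned as a formula string keyed by its name:
#     regression([0, 1, 2, 3], [1, 3, 5, 7], ['lin'])
#     # -> {"lin": "2.0*x+1.0"} (approximately; curve_fit output is floating point)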

class Metric:

    def elo(self, starting_score, opposing_score, observed, N, K):

        return Elo.calculate(starting_score, opposing_score, observed, N, K)

    def glicko2(self, starting_score, starting_rd, starting_vol, opposing_score, opposing_rd, observations):

        player = Glicko2.Glicko2(rating=starting_score, rd=starting_rd, vol=starting_vol)

        player.update_player([x for x in opposing_score], [x for x in opposing_rd], observations)

        return (player.rating, player.rd, player.vol)

    def trueskill(self, teams_data, observations): # teams_data is array of array of tuples ie. [[(mu, sigma), (mu, sigma), (mu, sigma)], [(mu, sigma), (mu, sigma), (mu, sigma)]]

        team_ratings = []

        for team in teams_data:
            team_temp = ()
            for player in team:
                player = Trueskill.Rating(player[0], player[1])
                team_temp = team_temp + (player,)
            team_ratings.append(team_temp)

        return Trueskill.rate(team_ratings, ranks=observations)
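
# Example (hypothetical ratings; a minimal sketch, assuming the metrics
# subpackage accepts lists of opposing scores and observations as shown):
#     m = Metric()
#     new_score = m.elo(1500, [1500], [1], 400, 24)
#     rated = m.trueskill([[(25, 8.33)], [(25, 8.33)]], [0, 1])  # rank 0 beats rank 1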

class RegressionMetric():

    def __new__(cls, predictions, targets):

        return cls.r_squared(cls, predictions, targets), cls.mse(cls, predictions, targets), cls.rms(cls, predictions, targets)

    def r_squared(self, predictions, targets): # assumes equal size inputs

        return sklearn.metrics.r2_score(targets, predictions)

    def mse(self, predictions, targets):

        return sklearn.metrics.mean_squared_error(targets, predictions)

    def rms(self, predictions, targets):

        return math.sqrt(sklearn.metrics.mean_squared_error(targets, predictions))
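
# Example (hypothetical data; a minimal sketch): instantiating the class runs
# __new__ and returns the metrics tuple directly rather than an instance:
#     r2, mse, rms = RegressionMetric([1.0, 2.0, 3.0], [1.0, 2.0, 4.0])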

class ClassificationMetric():

    def __new__(cls, predictions, targets):

        return cls.cm(cls, predictions, targets), cls.cr(cls, predictions, targets)

    def cm(self, predictions, targets):

        return sklearn.metrics.confusion_matrix(targets, predictions)

    def cr(self, predictions, targets):

        return sklearn.metrics.classification_report(targets, predictions)

@jit(nopython=True)
def mean(data):

    return np.mean(data)

@jit(nopython=True)
def median(data):

    return np.median(data)

@jit(nopython=True)
def stdev(data):

    return np.std(data)

@jit(nopython=True)
def variance(data):

    return np.var(data)

@jit(nopython=True)
def npmin(data):

    return np.amin(data)

@jit(nopython=True)
def npmax(data):

    return np.amax(data)

@jit(forceobj=True)
def kmeans(data, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"):

    kernel = sklearn.cluster.KMeans(n_clusters=n_clusters, init=init, n_init=n_init, max_iter=max_iter, tol=tol, precompute_distances=precompute_distances, verbose=verbose, random_state=random_state, copy_x=copy_x, n_jobs=n_jobs, algorithm=algorithm)
    kernel.fit(data)
    predictions = kernel.predict(data)
    centers = kernel.cluster_centers_

    return centers, predictions
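
# Example (hypothetical data; a minimal sketch): returns the fitted cluster
# centers and a per-sample cluster assignment:
#     centers, assignments = kmeans([[0], [0.1], [10], [10.1]], n_clusters=2)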

@jit(forceobj=True)
def pca(data, n_components=None, copy=True, whiten=False, svd_solver="auto", tol=0.0, iterated_power="auto", random_state=None):

    kernel = sklearn.decomposition.PCA(n_components=n_components, copy=copy, whiten=whiten, svd_solver=svd_solver, tol=tol, iterated_power=iterated_power, random_state=random_state)

    return kernel.fit_transform(data)

@jit(forceobj=True)
def decisiontree(data, labels, test_size=0.3, criterion="gini", splitter="best", max_depth=None): # expects *2d data and 1d labels

    data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
    model = sklearn.tree.DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_depth=max_depth)
    model = model.fit(data_train, labels_train)
    predictions = model.predict(data_test)
    metrics = ClassificationMetric(predictions, labels_test)

    return model, metrics
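
# Example (hypothetical data; a minimal sketch): returns the fitted tree and its
# ClassificationMetric (confusion matrix, classification report) on the held-out split:
#     model, metrics = decisiontree([[0], [1], [2], [3]], [0, 0, 1, 1], test_size=0.25)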

class KNN:

    def knn_classifier(self, data, labels, n_neighbors, test_size=0.3, algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, p=2, weights='uniform'): # expects *2d data and 1d labels post-scaling

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        model = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs)
        model.fit(data_train, labels_train)
        predictions = model.predict(data_test)

        return model, ClassificationMetric(predictions, labels_test)

    def knn_regressor(self, data, outputs, n_neighbors, test_size=0.3, weights="uniform", algorithm="auto", leaf_size=30, p=2, metric="minkowski", metric_params=None, n_jobs=None):

        data_train, data_test, outputs_train, outputs_test = sklearn.model_selection.train_test_split(data, outputs, test_size=test_size, random_state=1)
        model = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, leaf_size=leaf_size, p=p, metric=metric, metric_params=metric_params, n_jobs=n_jobs)
        model.fit(data_train, outputs_train)
        predictions = model.predict(data_test)

        return model, RegressionMetric(predictions, outputs_test)
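
# Example (hypothetical data; a minimal sketch): n_neighbors is required, and the
# remaining keyword arguments mirror the underlying sklearn estimators:
#     knn = KNN()
#     model, metrics = knn.knn_regressor([[0], [1], [2], [3]], [0, 1, 2, 3], 2)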

class NaiveBayes:

    def guassian(self, data, labels, test_size=0.3, priors=None, var_smoothing=1e-09):

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        model = sklearn.naive_bayes.GaussianNB(priors=priors, var_smoothing=var_smoothing)
        model.fit(data_train, labels_train)
        predictions = model.predict(data_test)

        return model, ClassificationMetric(predictions, labels_test)

    def multinomial(self, data, labels, test_size=0.3, alpha=1.0, fit_prior=True, class_prior=None):

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        model = sklearn.naive_bayes.MultinomialNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior)
        model.fit(data_train, labels_train)
        predictions = model.predict(data_test)

        return model, ClassificationMetric(predictions, labels_test)

    def bernoulli(self, data, labels, test_size=0.3, alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None):

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        model = sklearn.naive_bayes.BernoulliNB(alpha=alpha, binarize=binarize, fit_prior=fit_prior, class_prior=class_prior)
        model.fit(data_train, labels_train)
        predictions = model.predict(data_test)

        return model, ClassificationMetric(predictions, labels_test)

    def complement(self, data, labels, test_size=0.3, alpha=1.0, fit_prior=True, class_prior=None, norm=False):

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        model = sklearn.naive_bayes.ComplementNB(alpha=alpha, fit_prior=fit_prior, class_prior=class_prior, norm=norm)
        model.fit(data_train, labels_train)
        predictions = model.predict(data_test)

        return model, ClassificationMetric(predictions, labels_test)

class SVM:

    class CustomKernel:

        def __new__(cls, C, kernel, degree, gamma, coef0, shrinking, probability, tol, cache_size, class_weight, verbose, max_iter, decision_function_shape, random_state):

            return sklearn.svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, random_state=random_state)

    class StandardKernel:

        def __new__(cls, kernel, C=1.0, degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None):

            return sklearn.svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, shrinking=shrinking, probability=probability, tol=tol, cache_size=cache_size, class_weight=class_weight, verbose=verbose, max_iter=max_iter, decision_function_shape=decision_function_shape, random_state=random_state)

    class PrebuiltKernel:

        class Linear:

            def __new__(cls):

                return sklearn.svm.SVC(kernel='linear')

        class Polynomial:

            def __new__(cls, power, r_bias):

                return sklearn.svm.SVC(kernel='poly', degree=power, coef0=r_bias)

        class RBF:

            def __new__(cls, gamma):

                return sklearn.svm.SVC(kernel='rbf', gamma=gamma)

        class Sigmoid:

            def __new__(cls, r_bias):

                return sklearn.svm.SVC(kernel='sigmoid', coef0=r_bias)

    def fit(self, kernel, train_data, train_outputs): # expects *2d data, 1d labels or outputs

        return kernel.fit(train_data, train_outputs)

    def eval_classification(self, kernel, test_data, test_outputs):

        predictions = kernel.predict(test_data)

        return ClassificationMetric(predictions, test_outputs)

    def eval_regression(self, kernel, test_data, test_outputs):

        predictions = kernel.predict(test_data)

        return RegressionMetric(predictions, test_outputs)
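
# Example (hypothetical data; a minimal sketch): build a prebuilt kernel, fit it,
# then evaluate with the matching metric wrapper:
#     svm_ = SVM()
#     kernel = SVM.PrebuiltKernel.Linear()
#     kernel = svm_.fit(kernel, [[0], [1], [2], [3]], [0, 0, 1, 1])
#     metrics = svm_.eval_classification(kernel, [[1.5]], [1])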

class RandomForest:

    def random_forest_classifier(self, data, labels, test_size, n_estimators, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):

        data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
        kernel = sklearn.ensemble.RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start, class_weight=class_weight)
        kernel.fit(data_train, labels_train)
        predictions = kernel.predict(data_test)

        return kernel, ClassificationMetric(predictions, labels_test)

    def random_forest_regressor(self, data, outputs, test_size, n_estimators, criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):

        data_train, data_test, outputs_train, outputs_test = sklearn.model_selection.train_test_split(data, outputs, test_size=test_size, random_state=1)
        kernel = sklearn.ensemble.RandomForestRegressor(n_estimators=n_estimators, criterion=criterion, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf, max_features=max_features, max_leaf_nodes=max_leaf_nodes, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, bootstrap=bootstrap, oob_score=oob_score, n_jobs=n_jobs, random_state=random_state, verbose=verbose, warm_start=warm_start)
        kernel.fit(data_train, outputs_train)
        predictions = kernel.predict(data_test)

        return kernel, RegressionMetric(predictions, outputs_test)
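
# Example (hypothetical data; a minimal sketch): test_size and n_estimators are
# required positional arguments here:
#     rf = RandomForest()
#     model, metrics = rf.random_forest_classifier([[0], [1], [2], [3]], [0, 0, 1, 1], 0.25, 10)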

class CorrelationTest:

    def anova_oneway(self, *args): # expects arrays of samples

        results = scipy.stats.f_oneway(*args)
        return {"F-value": results[0], "p-value": results[1]}

    def pearson(self, x, y):

        results = scipy.stats.pearsonr(x, y)
        return {"r-value": results[0], "p-value": results[1]}

    def spearman(self, a, b=None, axis=0, nan_policy='propagate'):

        results = scipy.stats.spearmanr(a, b=b, axis=axis, nan_policy=nan_policy)
        return {"r-value": results[0], "p-value": results[1]}

    def point_biserial(self, x, y):

        results = scipy.stats.pointbiserialr(x, y)
        return {"r-value": results[0], "p-value": results[1]}

    def kendall(self, x, y, initial_lexsort=None, nan_policy='propagate', method='auto'):

        results = scipy.stats.kendalltau(x, y, initial_lexsort=initial_lexsort, nan_policy=nan_policy, method=method)
        return {"tau": results[0], "p-value": results[1]}

    def kendall_weighted(self, x, y, rank=True, weigher=None, additive=True):

        results = scipy.stats.weightedtau(x, y, rank=rank, weigher=weigher, additive=additive)
        return {"tau": results[0], "p-value": results[1]}

    def mgc(self, x, y, compute_distance=None, reps=1000, workers=1, is_twosamp=False, random_state=None):

        results = scipy.stats.multiscale_graphcorr(x, y, compute_distance=compute_distance, reps=reps, workers=workers, is_twosamp=is_twosamp, random_state=random_state)
        return {"k-value": results[0], "p-value": results[1], "data": results[2]} # unsure if MGC test returns a k value

class StatisticalTest:

    def ttest_onesample(self, a, popmean, axis=0, nan_policy='propagate'):

        results = scipy.stats.ttest_1samp(a, popmean, axis=axis, nan_policy=nan_policy)
        return {"t-value": results[0], "p-value": results[1]}

    def ttest_independent(self, a, b, equal=True, nan_policy='propagate'):

        results = scipy.stats.ttest_ind(a, b, equal_var=equal, nan_policy=nan_policy)
        return {"t-value": results[0], "p-value": results[1]}

    def ttest_statistic(self, o1, o2, equal=True):

        results = scipy.stats.ttest_ind_from_stats(o1["mean"], o1["std"], o1["nobs"], o2["mean"], o2["std"], o2["nobs"], equal_var=equal)
        return {"t-value": results[0], "p-value": results[1]}

    def ttest_related(self, a, b, axis=0, nan_policy='propagate'):

        results = scipy.stats.ttest_rel(a, b, axis=axis, nan_policy=nan_policy)
        return {"t-value": results[0], "p-value": results[1]}

    def ks_fitness(self, rvs, cdf, args=(), N=20, alternative='two-sided', mode='approx'):

        results = scipy.stats.kstest(rvs, cdf, args=args, N=N, alternative=alternative, mode=mode)
        return {"ks-value": results[0], "p-value": results[1]}

    def chisquare(self, f_obs, f_exp=None, ddof=None, axis=0):

        results = scipy.stats.chisquare(f_obs, f_exp=f_exp, ddof=ddof, axis=axis)
        return {"chisquared-value": results[0], "p-value": results[1]}

    def powerdivergence(self, f_obs, f_exp=None, ddof=None, axis=0, lambda_=None):

        results = scipy.stats.power_divergence(f_obs, f_exp=f_exp, ddof=ddof, axis=axis, lambda_=lambda_)
        return {"powerdivergence-value": results[0], "p-value": results[1]}

    def ks_twosample(self, x, y, alternative='two-sided', mode='auto'):

        results = scipy.stats.ks_2samp(x, y, alternative=alternative, mode=mode)
        return {"ks-value": results[0], "p-value": results[1]}

    def es_twosample(self, x, y, t=(0.4, 0.8)):

        results = scipy.stats.epps_singleton_2samp(x, y, t=t)
        return {"es-value": results[0], "p-value": results[1]}

    def mw_rank(self, x, y, use_continuity=True, alternative=None):

        results = scipy.stats.mannwhitneyu(x, y, use_continuity=use_continuity, alternative=alternative)
        return {"u-value": results[0], "p-value": results[1]}

    def mw_tiecorrection(self, rank_values):

        results = scipy.stats.tiecorrect(rank_values)
        return {"correction-factor": results}

    def rankdata(self, a, method='average'):

        results = scipy.stats.rankdata(a, method=method)
        return results

    def wilcoxon_ranksum(self, a, b): # this seems to be superseded by the Mann-Whitney-Wilcoxon U test

        results = scipy.stats.ranksums(a, b)
        return {"u-value": results[0], "p-value": results[1]}

    def wilcoxon_signedrank(self, x, y=None, zero_method='wilcox', correction=False, alternative='two-sided'):

        results = scipy.stats.wilcoxon(x, y=y, zero_method=zero_method, correction=correction, alternative=alternative)
        return {"t-value": results[0], "p-value": results[1]}

    def kw_htest(self, *args, nan_policy='propagate'):

        results = scipy.stats.kruskal(*args, nan_policy=nan_policy)
        return {"h-value": results[0], "p-value": results[1]}

    def friedman_chisquare(self, *args):

        results = scipy.stats.friedmanchisquare(*args)
        return {"chisquared-value": results[0], "p-value": results[1]}

    def bm_wtest(self, x, y, alternative='two-sided', distribution='t', nan_policy='propagate'):

        results = scipy.stats.brunnermunzel(x, y, alternative=alternative, distribution=distribution, nan_policy=nan_policy)
        return {"w-value": results[0], "p-value": results[1]}

    def combine_pvalues(self, pvalues, method='fisher', weights=None):

        results = scipy.stats.combine_pvalues(pvalues, method=method, weights=weights)
        return {"combined-statistic": results[0], "p-value": results[1]}

    def jb_fitness(self, x):

        results = scipy.stats.jarque_bera(x)
        return {"jb-value": results[0], "p-value": results[1]}

    def ab_equality(self, x, y):

        results = scipy.stats.ansari(x, y)
        return {"ab-value": results[0], "p-value": results[1]}

    def bartlett_variance(self, *args):

        results = scipy.stats.bartlett(*args)
        return {"t-value": results[0], "p-value": results[1]}

    def levene_variance(self, *args, center='median', proportiontocut=0.05):

        results = scipy.stats.levene(*args, center=center, proportiontocut=proportiontocut)
        return {"w-value": results[0], "p-value": results[1]}

    def sw_normality(self, x):

        results = scipy.stats.shapiro(x)
        return {"w-value": results[0], "p-value": results[1]}

    def shapiro(self, x):

        return "destroyed by facts and logic"

    def ad_onesample(self, x, dist='norm'):

        results = scipy.stats.anderson(x, dist=dist)
        return {"d-value": results[0], "critical-values": results[1], "significance-value": results[2]}

    def ad_ksample(self, samples, midrank=True):

        results = scipy.stats.anderson_ksamp(samples, midrank=midrank)
        return {"d-value": results[0], "critical-values": results[1], "significance-value": results[2]}

    def binomial(self, x, n=None, p=0.5, alternative='two-sided'):

        results = scipy.stats.binom_test(x, n=n, p=p, alternative=alternative)
        return {"p-value": results}

    def fk_variance(self, *args, center='median', proportiontocut=0.05):

        results = scipy.stats.fligner(*args, center=center, proportiontocut=proportiontocut)
        return {"h-value": results[0], "p-value": results[1]} # unknown if the statistic is an h value

    def mood_mediantest(self, *args, ties='below', correction=True, lambda_=1, nan_policy='propagate'):

        results = scipy.stats.median_test(*args, ties=ties, correction=correction, lambda_=lambda_, nan_policy=nan_policy)
        return {"chisquared-value": results[0], "p-value": results[1], "m-value": results[2], "table": results[3]}

    def mood_equalscale(self, x, y, axis=0):

        results = scipy.stats.mood(x, y, axis=axis)
        return {"z-score": results[0], "p-value": results[1]}

    def skewtest(self, a, axis=0, nan_policy='propagate'):

        results = scipy.stats.skewtest(a, axis=axis, nan_policy=nan_policy)
        return {"z-score": results[0], "p-value": results[1]}

    def kurtosistest(self, a, axis=0, nan_policy='propagate'):

        results = scipy.stats.kurtosistest(a, axis=axis, nan_policy=nan_policy)
        return {"z-score": results[0], "p-value": results[1]}

    def normaltest(self, a, axis=0, nan_policy='propagate'):

        results = scipy.stats.normaltest(a, axis=axis, nan_policy=nan_policy)
        return {"z-score": results[0], "p-value": results[1]}

class Array(): # tests on nd arrays independent of basic_stats

    def elementwise_mean(self, *args): # expects arrays that are size normalized
        return np.mean([*args], axis=0)

    def elementwise_median(self, *args):
        return np.median([*args], axis=0)

    def elementwise_stdev(self, *args):
        return np.std([*args], axis=0)

    def elementwise_variance(self, *args):
        return np.var([*args], axis=0)

    def elementwise_npmin(self, *args):
        return np.amin([*args], axis=0)

    def elementwise_npmax(self, *args):
        return np.amax([*args], axis=0)

    def elementwise_stats(self, *args):

        _mean = self.elementwise_mean(*args)
        _median = self.elementwise_median(*args)
        _stdev = self.elementwise_stdev(*args)
        _variance = self.elementwise_variance(*args)
        _min = self.elementwise_npmin(*args)
        _max = self.elementwise_npmax(*args)

        return _mean, _median, _stdev, _variance, _min, _max
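
    # Example (hypothetical data; a minimal sketch): elementwise functions reduce
    # across same-shaped arrays position by position:
    #     arr = Array()
    #     arr.elementwise_mean([1, 2, 3], [3, 4, 5])
    #     # -> array([2., 3., 4.])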
def normalize ( self , array ) :
a = np . atleast_1d ( np . linalg . norm ( array ) )
a [ a == 0 ] = 1
2020-05-13 01:39:23 +00:00
return array / np . expand_dims ( a , - 1 )
	def add(self, *args): # elementwise sum of the given arrays
		temp = np.array(args[0])
		for a in args[1:]:
			temp = temp + a
		return temp
	def mul(self, *args): # elementwise product of the given arrays
		temp = np.array(args[0])
		for a in args[1:]:
			temp = temp * a
		return temp
	def neg(self, array):
		return -array
	def inv(self, array):
		return 1/array
	def dot(self, a, b):
		return np.dot(a, b)
	def cross(self, a, b):
		return np.cross(a, b)
	def sort(self, array): # deprecated in favor of the Sort class
		warnings.warn("Array.sort has been deprecated in favor of Sort")
		array_length = len(array)
		if array_length <= 1:
			return array
		middle_index = int(array_length / 2)
		left = array[0:middle_index]
		right = array[middle_index:]
		left = self.sort(left)
		right = self.sort(right)
		return self.__merge(left, right)
	def __merge(self, left, right):
		sorted_list = []
		left = left[:]
		right = right[:]
		while len(left) > 0 or len(right) > 0:
			if len(left) > 0 and len(right) > 0:
				if left[0] <= right[0]:
					sorted_list.append(left.pop(0))
				else:
					sorted_list.append(right.pop(0))
			elif len(left) > 0:
				sorted_list.append(left.pop(0))
			elif len(right) > 0:
				sorted_list.append(right.pop(0))
		return sorted_list
	def search(self, arr, x): # binary search; expects arr to be sorted in ascending order
		return self.__search(arr, 0, len(arr) - 1, x)
	def __search(self, arr, low, high, x):
		if high >= low:
			mid = (high + low) // 2
			if arr[mid] == x:
				return mid
			elif arr[mid] > x:
				return self.__search(arr, low, mid - 1, x)
			else:
				return self.__search(arr, mid + 1, high, x)
		else:
			return -1
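
# A quick sketch of the Array search API: search returns the index of x, or
# -1 when x is absent, and assumes arr is already sorted in ascending order:
#
#   Array().search([1, 3, 5, 7, 9], 7) # -> 3
#   Array().search([1, 3, 5, 7, 9], 4) # -> -1
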
class Sort: # if you haven't used a sort, then you've never lived

	def quicksort(self, a):

		def sort(array):
			less = []
			equal = []
			greater = []
			if len(array) > 1:
				pivot = array[0]
				for x in array:
					if x < pivot:
						less.append(x)
					elif x == pivot:
						equal.append(x)
					elif x > pivot:
						greater.append(x)
				return sort(less) + equal + sort(greater)
			else:
				return array

		return np.array(sort(a))
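
	# Note (a sketch of the behavioral difference): quicksort builds a new
	# np.array, e.g. Sort().quicksort([3, 1, 2]) -> array([1, 2, 3]), while
	# the other sorts in this class reorder the sequence they are given.
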
	def mergesort(self, a):

		def sort(array): # top-down merge sort; sorts the input in place
			if len(array) > 1:
				middle = len(array) // 2
				L = array[:middle].copy() # copy so the merge never reads from a numpy view of the region it writes
				R = array[middle:].copy()
				sort(L)
				sort(R)
				i = j = k = 0
				while i < len(L) and j < len(R):
					if L[i] < R[j]:
						array[k] = L[i]
						i += 1
					else:
						array[k] = R[j]
						j += 1
					k += 1
				while i < len(L):
					array[k] = L[i]
					i += 1
					k += 1
				while j < len(R):
					array[k] = R[j]
					j += 1
					k += 1
			return array

		return sort(a)
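
	# For example (a sketch): data = [5, 2, 4]; Sort().mergesort(data) returns
	# [2, 4, 5] and leaves data itself reordered, since the merges write back
	# into the original sequence.
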
	def introsort(self, a):

		def sort(array, start, end, maxdepth): # quicksort until the depth limit is hit, then switch to heapsort
			if end - start <= 1:
				return array
			elif maxdepth == 0:
				heapsort(array, start, end)
			else:
				p = partition(array, start, end)
				sort(array, start, p + 1, maxdepth - 1)
				sort(array, p + 1, end, maxdepth - 1)
			return array

		def partition(array, start, end): # Hoare partition around the first element
			pivot = array[start]
			i = start - 1
			j = end
			while True:
				i = i + 1
				while array[i] < pivot:
					i = i + 1
				j = j - 1
				while array[j] > pivot:
					j = j - 1
				if i >= j:
					return j
				swap(array, i, j)

		def swap(array, i, j):
			array[i], array[j] = array[j], array[i]

		def heapsort(array, start, end):
			build_max_heap(array, start, end)
			for i in range(end - 1, start, -1):
				swap(array, start, i)
				max_heapify(array, index=0, start=start, end=i)

		def build_max_heap(array, start, end):
			def parent(i):
				return (i - 1) // 2
			length = end - start
			index = parent(length - 1)
			while index >= 0:
				max_heapify(array, index, start, end)
				index = index - 1

		def max_heapify(array, index, start, end):
			def left(i):
				return 2*i + 1
			def right(i):
				return 2*i + 2
			size = end - start
			l = left(index)
			r = right(index)
			if (l < size and array[start + l] > array[start + index]):
				largest = l
			else:
				largest = index
			if (r < size and array[start + r] > array[start + largest]):
				largest = r
			if largest != index:
				swap(array, start + largest, start + index)
				max_heapify(array, largest, start, end)

		maxdepth = (len(a).bit_length() - 1) * 2 # depth limit of roughly 2*log2(n)
		return sort(a, 0, len(a), maxdepth)
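
	# A rough sketch of why the depth limit matters: with first-element pivots,
	# already-sorted input degrades quicksort to O(n^2) recursion, so once the
	# recursion is about 2*log2(n) deep, introsort falls back to heapsort and
	# keeps the worst case at O(n log n).
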
	def heapsort(self, a):

		def sort(array): # in-place heapsort: build a max heap, then repeatedly extract the max
			n = len(array)
			for i in range(n // 2 - 1, -1, -1):
				heapify(array, n, i)
			for i in range(n - 1, 0, -1):
				array[i], array[0] = array[0], array[i]
				heapify(array, i, 0)
			return array

		def heapify(array, n, i): # sift the element at index i down to restore the heap property
			largest = i
			l = 2*i + 1
			r = 2*i + 2
			if l < n and array[i] < array[l]:
				largest = l
			if r < n and array[largest] < array[r]:
				largest = r
			if largest != i:
				array[i], array[largest] = array[largest], array[i]
				heapify(array, n, largest)
			return array

		return sort(a)
	def insertionsort(self, a):

		def sort(array): # shift larger elements right until each key finds its slot
			for i in range(1, len(array)):
				key = array[i]
				j = i - 1
				while j >= 0 and key < array[j]:
					array[j + 1] = array[j]
					j -= 1
				array[j + 1] = key
			return array

		return sort(a)
	def timsort(self, a, block=32):
		BLOCK = block

		def sort(array, n): # simplified timsort: insertion sort each block, then merge runs of doubling size
			for i in range(0, n, BLOCK):
				insertionsort(array, i, min((i + BLOCK - 1), (n - 1)))
			size = BLOCK
			while size < n:
				for left in range(0, n, 2*size):
					mid = min((left + size - 1), (n - 1))
					right = min((left + 2*size - 1), (n - 1))
					merge(array, left, mid, right)
				size = 2*size
			return array

		def insertionsort(array, left, right): # insertion sort on array[left..right]
			for i in range(left + 1, right + 1):
				temp = array[i]
				j = i - 1
				while j >= left and array[j] > temp:
					array[j + 1] = array[j]
					j -= 1
				array[j + 1] = temp
			return array

		def merge(array, l, m, r): # merge the sorted runs array[l..m] and array[m+1..r]
			len1, len2 = m - l + 1, r - m
			left, right = [], []
			for i in range(0, len1):
				left.append(array[l + i])
			for i in range(0, len2):
				right.append(array[m + 1 + i])
			i, j, k = 0, 0, l
			while i < len1 and j < len2:
				if left[i] <= right[j]:
					array[k] = left[i]
					i += 1
				else:
					array[k] = right[j]
					j += 1
				k += 1
			while i < len1:
				array[k] = left[i]
				k += 1
				i += 1
			while j < len2:
				array[k] = right[j]
				k += 1
				j += 1

		return sort(a, len(a))
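
	# For instance (a sketch): Sort().timsort(list(range(100, 0, -1)))
	# insertion sorts 32-element blocks, then merges runs of 32, 64, ... until
	# the whole list is ordered; a smaller 'block' trades extra merge passes
	# for shorter insertion-sorted runs.
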
	def selectionsort(self, a):
		array = a
		for i in range(len(array)):
			min_idx = i
			for j in range(i + 1, len(array)):
				if array[min_idx] > array[j]:
					min_idx = j
			array[i], array[min_idx] = array[min_idx], array[i]
		return array
	def shellsort(self, a): # gapped insertion sort with gaps n/2, n/4, ..., 1
		array = a
		n = len(array)
		gap = n // 2
		while gap > 0:
			for i in range(gap, n):
				temp = array[i]
				j = i
				while j >= gap and array[j - gap] > temp:
					array[j] = array[j - gap]
					j -= gap
				array[j] = temp
			gap //= 2
		return array
	def bubblesort(self, a):

		def sort(array): # bubble adjacent out-of-order pairs until a full pass makes no swaps
			swapped = True
			while swapped:
				swapped = False
				for i in range(len(array) - 1):
					if array[i + 1] < array[i]:
						array[i], array[i + 1] = array[i + 1], array[i]
						swapped = True
			return array

		return sort(a)
	def cyclesort(self, a):

		def sort(array): # cycle sort: rotate each cycle of misplaced items, minimizing writes
			writes = 0
			for cycleStart in range(0, len(array) - 1):
				item = array[cycleStart]
				pos = cycleStart
				for i in range(cycleStart + 1, len(array)):
					if array[i] < item:
						pos += 1
				if pos == cycleStart:
					continue
				while item == array[pos]:
					pos += 1
				array[pos], item = item, array[pos]
				writes += 1
				while pos != cycleStart:
					pos = cycleStart
					for i in range(cycleStart + 1, len(array)):
						if array[i] < item:
							pos += 1
					while item == array[pos]:
						pos += 1
					array[pos], item = item, array[pos]
					writes += 1
			return array

		return sort(a)
	def cocktailsort(self, a):

		def sort(array): # bidirectional bubble sort: alternate forward and backward passes
			n = len(array)
			swapped = True
			start = 0
			end = n - 1
			while (swapped == True):
				swapped = False
				for i in range(start, end):
					if (array[i] > array[i + 1]):
						array[i], array[i + 1] = array[i + 1], array[i]
						swapped = True
				if (swapped == False):
					break
				swapped = False
				end = end - 1
				for i in range(end - 1, start - 1, -1):
					if (array[i] > array[i + 1]):
						array[i], array[i + 1] = array[i + 1], array[i]
						swapped = True
				start = start + 1
			return array

		return sort(a)
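
# A minimal usage sketch for the Sort class:
#
#   sorter = Sort()
#   sorter.introsort([9, 4, 7, 1, 3]) # -> [1, 3, 4, 7, 9]
#   sorter.cocktailsort([2, 5, 2]) # -> [2, 2, 5]
#
# introsort and timsort are the practical general-purpose choices here; the
# quadratic sorts (bubble, cocktail, selection, cycle) are mostly educational.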