Merge pull request #84 from titanscouting/typehinting-docstrings

Grab docstrings for Analysis to analysis-v4
This commit is contained in:
Arthur Lu 2021-11-18 01:30:51 -08:00 committed by GitHub
commit a50be44c18
11 changed files with 143 additions and 54 deletions

View File

@ -7,10 +7,15 @@
# current benchmark of optimization: 1.33 times faster # current benchmark of optimization: 1.33 times faster
# setup: # setup:
__version__ = "3.0.4" __version__ = "3.0.6"
# changelog should be viewed using print(analysis.__changelog__) # changelog should be viewed using print(analysis.__changelog__)
__changelog__ = """changelog: __changelog__ = """changelog:
3.0.6:
- added docstrings
3.0.5:
- removed extra submodule imports
- fixed/optimized header
3.0.4: 3.0.4:
- removed -_obj imports - removed -_obj imports
3.0.3: 3.0.3:
@ -361,7 +366,6 @@ __all__ = [
'histo_analysis', 'histo_analysis',
'regression', 'regression',
'Metric', 'Metric',
'kmeans',
'pca', 'pca',
'decisiontree', 'decisiontree',
# all statistics functions left out due to integration in other functions # all statistics functions left out due to integration in other functions
@ -374,34 +378,39 @@ __all__ = [
import csv import csv
from tra_analysis.metrics import elo as Elo from tra_analysis.metrics import elo as Elo
from tra_analysis.metrics import glicko2 as Glicko2 from tra_analysis.metrics import glicko2 as Glicko2
import math
import numpy as np import numpy as np
import scipy import scipy
from scipy import optimize, stats import sklearn, sklearn.cluster
import sklearn
from sklearn import preprocessing, pipeline, linear_model, metrics, cluster, decomposition, tree, neighbors, naive_bayes, svm, model_selection, ensemble
from tra_analysis.metrics import trueskill as Trueskill from tra_analysis.metrics import trueskill as Trueskill
import warnings
# import submodules # import submodules
from .Array import Array
from .ClassificationMetric import ClassificationMetric from .ClassificationMetric import ClassificationMetric
from .RegressionMetric import RegressionMetric
from . import SVM
class error(ValueError): class error(ValueError):
pass pass
def load_csv(filepath): def load_csv(filepath):
"""
Loads csv file into 2D numpy array. Does not check csv file validity.
parameters:
filepath: String path to the csv file
return:
2D numpy array of values stored in csv file
"""
with open(filepath, newline='') as csvfile: with open(filepath, newline='') as csvfile:
file_array = np.array(list(csv.reader(csvfile))) file_array = np.array(list(csv.reader(csvfile)))
csvfile.close() csvfile.close()
return file_array return file_array
# expects 1d array
def basic_stats(data): def basic_stats(data):
"""
Calculates mean, median, standard deviation, variance, minimum, maximum of a simple set of elements.
parameters:
data: List representing set of unordered elements
return:
Dictionary with (mean, median, standard-deviation, variance, minimum, maximum) as keys and corresponding values
"""
data_t = np.array(data).astype(float) data_t = np.array(data).astype(float)
_mean = mean(data_t) _mean = mean(data_t)
@ -413,24 +422,43 @@ def basic_stats(data):
return {"mean": _mean, "median": _median, "standard-deviation": _stdev, "variance": _variance, "minimum": _min, "maximum": _max} return {"mean": _mean, "median": _median, "standard-deviation": _stdev, "variance": _variance, "minimum": _min, "maximum": _max}
# returns z score with inputs of point, mean and standard deviation of spread
def z_score(point, mean, stdev): def z_score(point, mean, stdev):
"""
Calculates z score of a specific point given mean and standard deviation of data.
parameters:
point: Real value corresponding to a single point of data
mean: Real value corresponding to the mean of the dataset
stdev: Real value corresponding to the standard deviation of the dataset
return:
Real value that is the point's z score
"""
score = (point - mean) / stdev score = (point - mean) / stdev
return score return score
# expects 2d array, normalizes across all axes
def z_normalize(array, *args): def z_normalize(array, *args):
"""
Applies sklearn.preprocessing.normalize(array, axis = arg) once for each axis given in args, on any arraylike parseable by numpy.
parameters:
array: array like structure of reals aka nested indexables
*args: arguments relating to axis normalized against
return:
numpy array of normalized values from ArrayLike input
"""
array = np.array(array) array = np.array(array)
for arg in args: for arg in args:
array = sklearn.preprocessing.normalize(array, axis = arg) array = sklearn.preprocessing.normalize(array, axis = arg)
return array return array
# expects 2d array of [x,y]
def histo_analysis(hist_data): def histo_analysis(hist_data):
"""
Calculates the mean and standard deviation of derivatives of (x,y) points. Requires at least 2 points to compute.
parameters:
hist_data: list of real coordinate point data (x, y)
return:
Dictionary with (mean, deviation) as keys to corresponding values
"""
if len(hist_data[0]) > 2: if len(hist_data[0]) > 2:
hist_data = np.array(hist_data) hist_data = np.array(hist_data)
@ -446,7 +474,15 @@ def histo_analysis(hist_data):
return None return None
def regression(inputs, outputs, args): # inputs, outputs expects N-D array def regression(inputs, outputs, args): # inputs, outputs expects N-D array
"""
Applies specified regression kernels onto input, output data pairs.
parameters:
inputs: List of Reals representing independent variable values of each point
outputs: List of Reals representing dependent variable values of each point
args: List of Strings from values (lin, log, exp, ply, sig)
return:
Dictionary with (lin, log, exp, ply, sig) as keys to corresponding regression models
"""
X = np.array(inputs) X = np.array(inputs)
y = np.array(outputs) y = np.array(outputs)
@ -550,13 +586,39 @@ def regression(inputs, outputs, args): # inputs, outputs expects N-D array
return regressions return regressions
class Metric: class Metric:
"""
The Metric class wraps the metrics models. Methods are instance methods (they take self), so call them on an instance as Metric().<method>(...)
"""
def elo(self, starting_score, opposing_score, observed, N, K): def elo(self, starting_score, opposing_score, observed, N, K):
"""
Calculates an adjusted ELO score given a player's current score, the opponent's score, and the outcome of the match.
reference: https://en.wikipedia.org/wiki/Elo_rating_system
parameters:
starting_score: Real value representing player's ELO score before a match
opposing_score: Real value representing opponent's score before the match
observed: Array of Real values representing multiple sequential match outcomes against the same opponent. 1 for match win, 0.5 for tie, 0 for loss.
N: Real value representing the normal or mean score expected (usually 1200)
K: Real value representing a system constant, determines how quickly players will change scores (usually 24)
return:
Real value representing the player's new ELO score
"""
return Elo.calculate(starting_score, opposing_score, observed, N, K) return Elo.calculate(starting_score, opposing_score, observed, N, K)
def glicko2(self, starting_score, starting_rd, starting_vol, opposing_score, opposing_rd, observations): def glicko2(self, starting_score, starting_rd, starting_vol, opposing_score, opposing_rd, observations):
"""
Calculates an adjusted Glicko-2 score given a player's current score, multiple opponents' scores, and the outcomes of several matches.
reference: http://www.glicko.net/glicko/glicko2.pdf
parameters:
starting_score: Real value representing the player's Glicko-2 score
starting_rd: Real value representing the player's RD
starting_vol: Real value representing the player's volatility
opposing_score: List of Real values representing multiple opponent's Glicko-2 scores
opposing_rd: List of Real values representing multiple opponent's RD
opposing_vol: List of Real values representing multiple opponent's volatility
observations: List of Real values representing the outcome of several matches, where each match's opponent corresponds with the opposing_score, opposing_rd, opposing_vol values of the same index. Outcomes can be a score, presuming greater score is better.
return:
Tuple of 3 Real values representing the player's new score, rd, and vol
"""
player = Glicko2.Glicko2(rating = starting_score, rd = starting_rd, vol = starting_vol) player = Glicko2.Glicko2(rating = starting_score, rd = starting_rd, vol = starting_vol)
player.update_player([x for x in opposing_score], [x for x in opposing_rd], observations) player.update_player([x for x in opposing_score], [x for x in opposing_rd], observations)
@ -564,7 +626,15 @@ class Metric:
return (player.rating, player.rd, player.vol) return (player.rating, player.rd, player.vol)
def trueskill(self, teams_data, observations): # teams_data is array of array of tuples ie. [[(mu, sigma), (mu, sigma), (mu, sigma)], [(mu, sigma), (mu, sigma), (mu, sigma)]] def trueskill(self, teams_data, observations): # teams_data is array of array of tuples ie. [[(mu, sigma), (mu, sigma), (mu, sigma)], [(mu, sigma), (mu, sigma), (mu, sigma)]]
"""
Calculates the score changes for multiple teams playing in a single match according to the trueskill algorithm.
reference: https://trueskill.org/
parameters:
teams_data: List of List of Tuples of 2 Real values representing multiple player ratings. List of teams, which is a List of players. Each player rating is a Tuple of 2 Real values (mu, sigma).
observations: List of Real values representing the match outcome. Each value in the List is the score corresponding to the team at the same index in teams_data.
return:
List of List of Tuples of 2 Real values representing new player ratings. Same structure as teams_data.
"""
team_ratings = [] team_ratings = []
for team in teams_data: for team in teams_data:
@ -599,24 +669,32 @@ def npmin(data):
def npmax(data): def npmax(data):
return np.amax(data) return np.amax(data)
""" need to decide what to do with this function
def kmeans(data, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"):
kernel = sklearn.cluster.KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, tol = tol, precompute_distances = precompute_distances, verbose = verbose, random_state = random_state, copy_x = copy_x, n_jobs = n_jobs, algorithm = algorithm)
kernel.fit(data)
predictions = kernel.predict(data)
centers = kernel.cluster_centers_
return centers, predictions
"""
def pca(data, n_components = None, copy = True, whiten = False, svd_solver = "auto", tol = 0.0, iterated_power = "auto", random_state = None): def pca(data, n_components = None, copy = True, whiten = False, svd_solver = "auto", tol = 0.0, iterated_power = "auto", random_state = None):
"""
Performs a principal component analysis on the input data.
reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
parameters:
data: Arraylike of Reals representing the set of data to perform PCA on
* : refer to reference for usage, parameters follow same usage
return:
Arraylike of Reals representing the set of data that has had PCA performed. The dimensionality of the Arraylike may be smaller or equal.
"""
kernel = sklearn.decomposition.PCA(n_components = n_components, copy = copy, whiten = whiten, svd_solver = svd_solver, tol = tol, iterated_power = iterated_power, random_state = random_state) kernel = sklearn.decomposition.PCA(n_components = n_components, copy = copy, whiten = whiten, svd_solver = svd_solver, tol = tol, iterated_power = iterated_power, random_state = random_state)
return kernel.fit_transform(data) return kernel.fit_transform(data)
def decisiontree(data, labels, test_size = 0.3, criterion = "gini", splitter = "default", max_depth = None): #expects *2d data and 1d labels def decisiontree(data, labels, test_size = 0.3, criterion = "gini", splitter = "default", max_depth = None): #expects *2d data and 1d labels
"""
Generates a decision tree classifier fitted to the given data.
reference: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
parameters:
data: List of values representing each data point of multiple axes
labels: List of values representing the labels corresponding to the same index in data
* : refer to reference for usage, parameters follow same usage
return:
DecisionTreeClassifier model and corresponding classification accuracy metrics
"""
data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1) data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
model = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth) model = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
model = model.fit(data_train,labels_train) model = model.fit(data_train,labels_train)

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import ClassificationMetric' # this should be imported as a python module using 'from tra_analysis import ClassificationMetric'
# setup: # setup:
__version__ = "1.0.1" __version__ = "1.0.2"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.2:
- optimized imports
1.0.1: 1.0.1:
- fixed __all__ - fixed __all__
1.0.0: 1.0.0:
@ -22,7 +24,6 @@ __all__ = [
] ]
import sklearn import sklearn
from sklearn import metrics
class ClassificationMetric(): class ClassificationMetric():

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import CorrelationTest' # this should be imported as a python module using 'from tra_analysis import CorrelationTest'
# setup: # setup:
__version__ = "1.0.1" __version__ = "1.0.2"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.2:
- optimized imports
1.0.1: 1.0.1:
- fixed __all__ - fixed __all__
1.0.0: 1.0.0:
@ -29,7 +31,6 @@ __all__ = [
] ]
import scipy import scipy
from scipy import stats
def anova_oneway(*args): #expects arrays of samples def anova_oneway(*args): #expects arrays of samples

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import KNN' # this should be imported as a python module using 'from tra_analysis import KNN'
# setup: # setup:
__version__ = "1.0.0" __version__ = "1.0.1"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.1:
- optimized imports
1.0.0: 1.0.0:
- ported analysis.KNN() here - ported analysis.KNN() here
- removed classness - removed classness
@ -23,7 +25,6 @@ __all__ = [
] ]
import sklearn import sklearn
from sklearn import model_selection, neighbors
from . import ClassificationMetric, RegressionMetric from . import ClassificationMetric, RegressionMetric
def knn_classifier(data, labels, n_neighbors = 5, test_size = 0.3, algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, p=2, weights='uniform'): #expects *2d data and 1d labels post-scaling def knn_classifier(data, labels, n_neighbors = 5, test_size = 0.3, algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, p=2, weights='uniform'): #expects *2d data and 1d labels post-scaling

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import NaiveBayes' # this should be imported as a python module using 'from tra_analysis import NaiveBayes'
# setup: # setup:
__version__ = "1.0.0" __version__ = "1.0.1"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.1:
- optimized imports
1.0.0: 1.0.0:
- ported analysis.NaiveBayes() here - ported analysis.NaiveBayes() here
- removed classness - removed classness
@ -24,8 +26,7 @@ __all__ = [
] ]
import sklearn import sklearn
from sklearn import model_selection, naive_bayes from . import ClassificationMetric
from . import ClassificationMetric, RegressionMetric
def gaussian(data, labels, test_size = 0.3, priors = None, var_smoothing = 1e-09): def gaussian(data, labels, test_size = 0.3, priors = None, var_smoothing = 1e-09):

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import RandomForest' # this should be imported as a python module using 'from tra_analysis import RandomForest'
# setup: # setup:
__version__ = "1.0.1" __version__ = "1.0.2"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.2:
- optimized imports
1.0.1: 1.0.1:
- fixed __all__ - fixed __all__
1.0.0: 1.0.0:
@ -23,8 +25,7 @@ __all__ = [
"random_forest_regressor", "random_forest_regressor",
] ]
import sklearn import sklearn, sklearn.ensemble, sklearn.naive_bayes
from sklearn import ensemble, model_selection
from . import ClassificationMetric, RegressionMetric from . import ClassificationMetric, RegressionMetric
def random_forest_classifier(data, labels, test_size, n_estimators, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None): def random_forest_classifier(data, labels, test_size, n_estimators, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import RegressionMetric' # this should be imported as a python module using 'from tra_analysis import RegressionMetric'
# setup: # setup:
__version__ = "1.0.0" __version__ = "1.0.1"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.1:
- optimized imports
1.0.0: 1.0.0:
- ported analysis.RegressionMetric() here - ported analysis.RegressionMetric() here
""" """
@ -21,7 +23,6 @@ __all__ = [
import numpy as np import numpy as np
import sklearn import sklearn
from sklearn import metrics
class RegressionMetric(): class RegressionMetric():

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import SVM' # this should be imported as a python module using 'from tra_analysis import SVM'
# setup: # setup:
__version__ = "1.0.2" __version__ = "1.0.3"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.3:
- optimized imports
1.0.2: 1.0.2:
- fixed __all__ - fixed __all__
1.0.1: 1.0.1:
@ -30,7 +32,6 @@ __all__ = [
] ]
import sklearn import sklearn
from sklearn import svm
from . import ClassificationMetric, RegressionMetric from . import ClassificationMetric, RegressionMetric
class CustomKernel: class CustomKernel:

View File

@ -16,7 +16,7 @@ __changelog__ = """changelog:
__author__ = ( __author__ = (
"Arthur Lu <learthurgo@gmail.com>", "Arthur Lu <learthurgo@gmail.com>",
"James Pan <zpan@imsa.edu>" "James Pan <zpan@imsa.edu>",
) )
__all__ = [ __all__ = [

View File

@ -4,9 +4,11 @@
# this should be imported as a python module using 'from tra_analysis import StatisticalTest' # this should be imported as a python module using 'from tra_analysis import StatisticalTest'
# setup: # setup:
__version__ = "1.0.2" __version__ = "1.0.3"
__changelog__ = """changelog: __changelog__ = """changelog:
1.0.3:
- optimized imports
1.0.2: 1.0.2:
- added tukey_multicomparison - added tukey_multicomparison
- fixed styling - fixed styling
@ -61,7 +63,6 @@ __all__ = [
import numpy as np import numpy as np
import scipy import scipy
from scipy import stats, interpolate
def ttest_onesample(a, popmean, axis = 0, nan_policy = 'propagate'): def ttest_onesample(a, popmean, axis = 0, nan_policy = 'propagate'):
@ -279,9 +280,9 @@ def get_tukeyQcrit(k, df, alpha=0.05):
cv001 = c[:, 2::2] cv001 = c[:, 2::2]
if alpha == 0.05: if alpha == 0.05:
intp = interpolate.interp1d(crows, cv005[:,k-2]) intp = scipy.interpolate.interp1d(crows, cv005[:,k-2])
elif alpha == 0.01: elif alpha == 0.01:
intp = interpolate.interp1d(crows, cv001[:,k-2]) intp = scipy.interpolate.interp1d(crows, cv001[:,k-2])
else: else:
raise ValueError('only implemented for alpha equal to 0.01 and 0.05') raise ValueError('only implemented for alpha equal to 0.01 and 0.05')
return intp(df) return intp(df)

View File

@ -16,6 +16,8 @@ __changelog__ = """changelog:
- deprecated titanlearn.py - deprecated titanlearn.py
- deprecated visualization.py - deprecated visualization.py
- removed matplotlib from requirements - removed matplotlib from requirements
- removed extra submodule imports in Analysis
- added typehinting, docstrings for each function
3.0.0: 3.0.0:
- incremented version to release 3.0.0 - incremented version to release 3.0.0
3.0.0-rc2: 3.0.0-rc2:
@ -45,6 +47,7 @@ __all__ = [
"Analysis", "Analysis",
"Array", "Array",
"ClassificationMetric", "ClassificationMetric",
"Clustering",
"CorrelationTest", "CorrelationTest",
"Expression", "Expression",
"Fit", "Fit",