17 Commits

Author SHA1 Message Date
Arthur Lu
93a6962819 Merge pull request #85 from titanscouting/analysis-v4
Analysis v4
2022-03-13 21:15:40 -07:00
Arthur Lu
df6362c52a fixed unit-test action
(added symbolic link to requirements.txt)
2022-02-08 09:50:04 +00:00
Arthur Lu
f793a77660 removed unessasary comments in unit tests 2022-02-08 07:39:51 +00:00
Arthur Lu
9187f1e7da Merge pull request #86 from titanscouting/improve-devdocker
Improve devdocker
2022-02-07 23:34:04 -08:00
Arthur Lu
6647dcfd72 generalized keyword argument handling for:
Clustering.py, CorrelationTest.py, KNN.py, NaiveBayes.py
2022-02-08 07:29:47 +00:00
Arthur Lu
8ae1593861 add pylint pytest to requirements.txt 2022-02-08 07:20:31 +00:00
Arthur Lu
1372aa03f9 switch docker image to python:slim,
move requirements.txt to .devcontainer/
2022-02-04 09:19:39 +00:00
Arthur Lu
7284851091 fix minor bugs in RandomForest.py 2022-02-04 08:50:56 +00:00
Arthur Lu
9232cc31be fixed import error in Analysis,
updated unit tests
2021-11-18 09:51:39 +00:00
Arthur Lu
6fb0eefbc0 Merge pull request #84 from titanscouting/typehinting-docstrings
Grab docstrings for Analysis to analysis-v4
2021-11-18 01:30:51 -08:00
Arthur Lu
9167e50858 removed unessasary comments,
updated version and changelog
2021-11-18 09:24:52 +00:00
Arthur Lu
55707fa0ca finished Analysis docstrings,
removed typehinting to rework
2021-11-18 09:23:19 +00:00
Arthur Lu
33c462570d added type hinting for a few functions,
added typedef module to hold custom typings

Signed-off-by: Arthur Lu <learthurgo@gmail.com>
2021-11-16 20:17:46 +00:00
Arthur Lu
4f71c21471 populated __init__.py for metrics submodule 2021-11-12 07:42:20 +00:00
Arthur Lu
5d5d6c4c5e fixed/optimized imports,
fixed headers

Signed-off-by: Arthur Lu <learthurgo@gmail.com>
2021-11-09 22:52:04 +00:00
Arthur Lu
a48ef20ef2 Merge pull request #82 from titanscouting/improve-clustering
Added new clustering tools and reorganize existing ones
2021-09-27 15:32:12 -07:00
Arthur Lu
9fe3bd4567 removed matplotlib from requirements 2021-05-27 21:39:50 +00:00
20 changed files with 255 additions and 124 deletions

View File

@@ -1,7 +1,6 @@
-FROM ubuntu:20.04
+FROM python:slim
 WORKDIR /
-RUN apt-get -y update
-RUN DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata
-RUN apt-get install -y python3 python3-dev git python3-pip python3-kivy python-is-python3 libgl1-mesa-dev build-essential
-RUN ln -s $(which pip3) /usr/bin/pip
-RUN pip install pymongo pandas numpy scipy scikit-learn matplotlib pylint kivy
+RUN apt-get -y update; apt-get -y upgrade
+RUN apt-get -y install git
+COPY requirements.txt .
+RUN pip install -r requirements.txt

View File

@@ -1,2 +0,0 @@
-FROM titanscout2022/tra-analysis-base:latest
-WORKDIR /

View File

@@ -1,28 +1,22 @@
 {
 	"name": "TRA Analysis Development Environment",
 	"build": {
-		"dockerfile": "dev-dockerfile",
+		"dockerfile": "Dockerfile",
 	},
 	"settings": {
 		"terminal.integrated.shell.linux": "/bin/bash",
-		"python.pythonPath": "/usr/local/bin/python",
+		"python.pythonPath": "",
 		"python.linting.enabled": true,
 		"python.linting.pylintEnabled": true,
-		"python.formatting.autopep8Path": "/usr/local/py-utils/bin/autopep8",
-		"python.formatting.blackPath": "/usr/local/py-utils/bin/black",
-		"python.formatting.yapfPath": "/usr/local/py-utils/bin/yapf",
-		"python.linting.banditPath": "/usr/local/py-utils/bin/bandit",
-		"python.linting.flake8Path": "/usr/local/py-utils/bin/flake8",
-		"python.linting.mypyPath": "/usr/local/py-utils/bin/mypy",
-		"python.linting.pycodestylePath": "/usr/local/py-utils/bin/pycodestyle",
-		"python.linting.pydocstylePath": "/usr/local/py-utils/bin/pydocstyle",
-		"python.linting.pylintPath": "/usr/local/py-utils/bin/pylint",
-		"python.testing.pytestPath": "/usr/local/py-utils/bin/pytest"
+		"python.linting.pylintPath": "",
+		"python.testing.pytestPath": "",
+		"editor.tabSize": 4,
+		"editor.insertSpaces": false
 	},
 	"extensions": [
 		"mhutchie.git-graph",
 		"ms-python.python",
 		"waderyan.gitblame"
 	],
-	"postCreateCommand": "/usr/bin/pip3 install -r ${containerWorkspaceFolder}/analysis-master/requirements.txt && /usr/bin/pip3 install --no-cache-dir pylint && /usr/bin/pip3 install pytest"
+	"postCreateCommand": ""
 }

View File

@@ -0,0 +1,8 @@
+numpy
+scipy
+scikit-learn
+six
+pyparsing
+pylint
+pytest

View File

@@ -10,12 +10,12 @@ on:
     branches: [ master ]
 jobs:
-  build:
+  unittest:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [3.7, 3.8]
+        python-version: ["3.7", "3.8", "3.9", "3.10"]
     env:
       working-directory: ./analysis-master/

View File

@@ -2,5 +2,7 @@ numpy
 scipy
 scikit-learn
 six
-matplotlib
 pyparsing
+pylint
+pytest

View File

@@ -9,6 +9,7 @@ from tra_analysis import Clustering
 from tra_analysis import CorrelationTest
 from tra_analysis import Fit
 from tra_analysis import KNN
+from tra_analysis import metrics as m
 from tra_analysis import NaiveBayes
 from tra_analysis import RandomForest
 from tra_analysis import RegressionMetric
@@ -27,7 +28,7 @@ x_data_circular = []
 y_data_circular = []
 y_data_ccu = [1, 3, 7, 14, 21]
-y_data_ccd = [1, 5, 7, 8.5, 8.66]
+y_data_ccd = [8.66, 8.5, 7, 5, 1]
 test_data_scrambled = [-32, 34, 19, 72, -65, -11, -43, 6, 85, -17, -98, -26, 12, 20, 9, -92, -40, 98, -78, 17, -20, 49, 93, -27, -24, -66, 40, 84, 1, -64, -68, -25, -42, -46, -76, 43, -3, 30, -14, -34, -55, -13, 41, -30, 0, -61, 48, 23, 60, 87, 80, 77, 53, 73, 79, 24, -52, 82, 8, -44, 65, 47, -77, 94, 7, 37, -79, 36, -94, 91, 59, 10, 97, -38, -67, 83, 54, 31, -95, -63, 16, -45, 21, -12, 66, -48, -18, -96, -90, -21, -83, -74, 39, 64, 69, -97, 13, 55, 27, -39]
 test_data_sorted = [-98, -97, -96, -95, -94, -92, -90, -83, -79, -78, -77, -76, -74, -68, -67, -66, -65, -64, -63, -61, -55, -52, -48, -46, -45, -44, -43, -42, -40, -39, -38, -34, -32, -30, -27, -26, -25, -24, -21, -20, -18, -17, -14, -13, -12, -11, -3, 0, 1, 6, 7, 8, 9, 10, 12, 13, 16, 17, 19, 20, 21, 23, 24, 27, 30, 31, 34, 36, 37, 39, 40, 41, 43, 47, 48, 49, 53, 54, 55, 59, 60, 64, 65, 66, 69, 72, 73, 77, 79, 80, 82, 83, 84, 85, 87, 91, 93, 94, 97, 98]
@@ -48,16 +49,25 @@ def test_basicstats():
 def test_regression():
 	assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccu, ["lin"])) == True
-	#assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccd, ["log"])) == True
-	#assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccu, ["exp"])) == True
-	#assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccu, ["ply"])) == True
-	#assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccd, ["sig"])) == True
+	assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccd, ["log"])) == True
+	assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccu, ["exp"])) == True
+	assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccu, ["ply"])) == True
+	assert all(isinstance(item, str) for item in an.regression(test_data_linear, y_data_ccd, ["sig"])) == True
 def test_metrics():
 	assert an.Metric().elo(1500, 1500, [1, 0], 400, 24) == 1512.0
 	assert an.Metric().glicko2(1500, 250, 0.06, [1500, 1400], [250, 240], [1, 0]) == (1478.864307445517, 195.99122679202452, 0.05999602937563585)
-	#assert an.Metric().trueskill([[(25, 8.33), (24, 8.25), (32, 7.5)], [(25, 8.33), (25, 8.33), (21, 6.5)]], [1, 0]) == [(metrics.trueskill.Rating(mu=21.346, sigma=7.875), metrics.trueskill.Rating(mu=20.415, sigma=7.808), metrics.trueskill.Rating(mu=29.037, sigma=7.170)), (metrics.trueskill.Rating(mu=28.654, sigma=7.875), metrics.trueskill.Rating(mu=28.654, sigma=7.875), metrics.trueskill.Rating(mu=23.225, sigma=6.287))]
+	e = [[(21.346, 7.875), (20.415, 7.808), (29.037, 7.170)], [(28.654, 7.875), (28.654, 7.875), (23.225, 6.287)]]
+	r = an.Metric().trueskill([[(25, 8.33), (24, 8.25), (32, 7.5)], [(25, 8.33), (25, 8.33), (21, 6.5)]], [1, 0])
+	i = 0
+	for group in r:
+		j = 0
+		for team in group:
+			assert abs(team.mu - e[i][j][0]) < 0.001
+			assert abs(team.sigma - e[i][j][1]) < 0.001
+			j+=1
+		i+=1
 def test_array():
@@ -143,14 +153,9 @@ def test_sort():
 	assert all(a == b for a, b in zip(sort(test_data_scrambled), test_data_sorted))
 def test_statisticaltest():
-	#print(StatisticalTest.tukey_multicomparison([test_data_linear, test_data_linear2, test_data_linear3]))
 	assert StatisticalTest.tukey_multicomparison([test_data_linear, test_data_linear2, test_data_linear3]) == \
 	{'group 1 and group 2': [0.32571517201527916, False], 'group 1 and group 3': [0.977145516045838, False], 'group 2 and group 3': [0.6514303440305589, False]}
-	#assert all(np.isclose([i[0] for i in list(StatisticalTest.tukey_multicomparison([test_data_linear, test_data_linear2, test_data_linear3]).values],
-	#	[0.32571517201527916, 0.977145516045838, 0.6514303440305589]))
-	#assert [i[1] for i in StatisticalTest.tukey_multicomparison([test_data_linear, test_data_linear2, test_data_linear3]).values] == \
-	#	[False, False, False]
 def test_svm():
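The tolerance loop introduced above checks each TrueSkill rating to within 0.001 of the expected (mu, sigma) pair. A minimal equivalent sketch using pytest.approx (assuming the suite's existing `an` alias for the Analysis module; expected values copied from the diff above):

import pytest

def test_metrics_trueskill_approx():
	expected = [[(21.346, 7.875), (20.415, 7.808), (29.037, 7.170)], [(28.654, 7.875), (28.654, 7.875), (23.225, 6.287)]]
	result = an.Metric().trueskill([[(25, 8.33), (24, 8.25), (32, 7.5)], [(25, 8.33), (25, 8.33), (21, 6.5)]], [1, 0])
	for group, expected_group in zip(result, expected):
		for rating, (mu, sigma) in zip(group, expected_group):
			# same 0.001 tolerance as the explicit loop in the test file
			assert rating.mu == pytest.approx(mu, abs = 1e-3)
			assert rating.sigma == pytest.approx(sigma, abs = 1e-3)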

View File

@@ -7,10 +7,15 @@
 #    current benchmark of optimization: 1.33 times faster
 # setup:
-__version__ = "3.0.4"
+__version__ = "3.0.6"
 # changelog should be viewed using print(analysis.__changelog__)
 __changelog__ = """changelog:
+	3.0.6:
+		- added docstrings
+	3.0.5:
+		- removed extra submodule imports
+		- fixed/optimized header
 	3.0.4:
 		- removed -_obj imports
 	3.0.3:
@@ -361,7 +366,6 @@ __all__ = [
 	'histo_analysis',
 	'regression',
 	'Metric',
-	'kmeans',
 	'pca',
 	'decisiontree',
 	# all statistics functions left out due to integration in other functions
@@ -374,34 +378,39 @@ __all__ = [
 import csv
 from tra_analysis.metrics import elo as Elo
 from tra_analysis.metrics import glicko2 as Glicko2
-import math
 import numpy as np
 import scipy
-from scipy import optimize, stats
-import sklearn
-from sklearn import preprocessing, pipeline, linear_model, metrics, cluster, decomposition, tree, neighbors, naive_bayes, svm, model_selection, ensemble
+import sklearn, sklearn.cluster, sklearn.pipeline
 from tra_analysis.metrics import trueskill as Trueskill
-import warnings
 # import submodules
-from .Array import Array
 from .ClassificationMetric import ClassificationMetric
-from .RegressionMetric import RegressionMetric
-from . import SVM
 class error(ValueError):
 	pass
 def load_csv(filepath):
+	"""
+	Loads csv file into 2D numpy array. Does not check csv file validity.
+	parameters:
+		filepath: String path to the csv file
+	return:
+		2D numpy array of values stored in csv file
+	"""
 	with open(filepath, newline='') as csvfile:
 		file_array = np.array(list(csv.reader(csvfile)))
 		csvfile.close()
 	return file_array
-# expects 1d array
 def basic_stats(data):
+	"""
+	Calculates mean, median, standard deviation, variance, minimum, maximum of a simple set of elements.
+	parameters:
+		data: List representing set of unordered elements
+	return:
+		Dictionary with (mean, median, standard-deviation, variance, minimum, maximum) as keys and corresponding values
+	"""
 	data_t = np.array(data).astype(float)
 	_mean = mean(data_t)
@@ -413,24 +422,43 @@ def basic_stats(data):
 	return {"mean": _mean, "median": _median, "standard-deviation": _stdev, "variance": _variance, "minimum": _min, "maximum": _max}
-# returns z score with inputs of point, mean and standard deviation of spread
 def z_score(point, mean, stdev):
+	"""
+	Calculates z score of a specific point given mean and standard deviation of data.
+	parameters:
+		point: Real value corresponding to a single point of data
+		mean: Real value corresponding to the mean of the dataset
+		stdev: Real value corresponding to the standard deviation of the dataset
+	return:
+		Real value that is the point's z score
+	"""
 	score = (point - mean) / stdev
 	return score
-# expects 2d array, normalizes across all axes
 def z_normalize(array, *args):
+	"""
+	Applies sklearn.normalize(array, axis = args) on any arraylike parseable by numpy.
+	parameters:
+		array: array-like structure of reals, i.e. nested indexables
+		*args: arguments relating to axis normalized against
+	return:
+		numpy array of normalized values from ArrayLike input
+	"""
 	array = np.array(array)
 	for arg in args:
 		array = sklearn.preprocessing.normalize(array, axis = arg)
 	return array
-# expects 2d array of [x,y]
 def histo_analysis(hist_data):
+	"""
+	Calculates the mean and standard deviation of derivatives of (x,y) points. Requires at least 2 points to compute.
+	parameters:
+		hist_data: list of real coordinate point data (x, y)
+	return:
+		Dictionary with (mean, deviation) as keys to corresponding values
+	"""
 	if len(hist_data[0]) > 2:
 		hist_data = np.array(hist_data)
@@ -446,7 +474,15 @@ def histo_analysis(hist_data):
 	return None
 def regression(inputs, outputs, args): # inputs, outputs expects N-D array
+	"""
+	Applies specified regression kernels onto input, output data pairs.
+	parameters:
+		inputs: List of Reals representing independent variable values of each point
+		outputs: List of Reals representing dependent variable values of each point
+		args: List of Strings from values (lin, log, exp, ply, sig)
+	return:
+		Dictionary with keys (lin, log, exp, ply, sig) as keys to corresponding regression models
+	"""
 	X = np.array(inputs)
 	y = np.array(outputs)
@@ -550,13 +586,39 @@ def regression(inputs, outputs, args): # inputs, outputs expects N-D array
 	return regressions
 class Metric:
+	"""
+	The Metric class wraps the metrics models. Call without instantiation as Metric.<method>(...)
+	"""
 	def elo(self, starting_score, opposing_score, observed, N, K):
+		"""
+		Calculates an adjusted ELO score given a player's current score, opponent's score, and outcome of match.
+		reference: https://en.wikipedia.org/wiki/Elo_rating_system
+		parameters:
+			starting_score: Real value representing player's ELO score before a match
+			opposing_score: Real value representing opponent's score before the match
+			observed: Array of Real values representing multiple sequential match outcomes against the same opponent. 1 for match win, 0.5 for tie, 0 for loss.
+			N: Real value representing the normal or mean score expected (usually 1200)
+			K: Real value representing a system constant, determines how quickly players will change scores (usually 24)
+		return:
+			Real value representing the player's new ELO score
+		"""
 		return Elo.calculate(starting_score, opposing_score, observed, N, K)
 	def glicko2(self, starting_score, starting_rd, starting_vol, opposing_score, opposing_rd, observations):
+		"""
+		Calculates an adjusted Glicko-2 score given a player's current score, multiple opponents' scores, and the outcomes of several matches.
+		reference: http://www.glicko.net/glicko/glicko2.pdf
+		parameters:
+			starting_score: Real value representing the player's Glicko-2 score
+			starting_rd: Real value representing the player's RD
+			starting_vol: Real value representing the player's volatility
+			opposing_score: List of Real values representing multiple opponents' Glicko-2 scores
+			opposing_rd: List of Real values representing multiple opponents' RD
+			opposing_vol: List of Real values representing multiple opponents' volatility
+			observations: List of Real values representing the outcome of several matches, where each match's opponent corresponds with the opposing_score, opposing_rd, opposing_vol values at the same index. Outcomes can be a score, presuming greater score is better.
+		return:
+			Tuple of 3 Real values representing the player's new score, rd, and vol
+		"""
 		player = Glicko2.Glicko2(rating = starting_score, rd = starting_rd, vol = starting_vol)
 		player.update_player([x for x in opposing_score], [x for x in opposing_rd], observations)
@@ -564,7 +626,15 @@ class Metric:
 		return (player.rating, player.rd, player.vol)
 	def trueskill(self, teams_data, observations): # teams_data is array of array of tuples ie. [[(mu, sigma), (mu, sigma), (mu, sigma)], [(mu, sigma), (mu, sigma), (mu, sigma)]]
+		"""
+		Calculates the score changes for multiple teams playing in a single match according to the TrueSkill algorithm.
+		reference: https://trueskill.org/
+		parameters:
+			teams_data: List of Lists of Tuples of 2 Real values representing multiple player ratings. A List of teams, each of which is a List of players. Each player rating is a Tuple of 2 Real values (mu, sigma).
+			observations: List of Real values representing the match outcome. Each value in the List is the score corresponding to the team at the same index in teams_data.
+		return:
+			List of Lists of Tuples of 2 Real values representing new player ratings. Same structure as teams_data.
+		"""
 		team_ratings = []
 		for team in teams_data:
@@ -599,24 +669,32 @@ def npmin(data):
 def npmax(data):
 	return np.amax(data)
-""" need to decide what to do with this function
-def kmeans(data, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"):
-	kernel = sklearn.cluster.KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, tol = tol, precompute_distances = precompute_distances, verbose = verbose, random_state = random_state, copy_x = copy_x, n_jobs = n_jobs, algorithm = algorithm)
-	kernel.fit(data)
-	predictions = kernel.predict(data)
-	centers = kernel.cluster_centers_
-	return centers, predictions
-"""
 def pca(data, n_components = None, copy = True, whiten = False, svd_solver = "auto", tol = 0.0, iterated_power = "auto", random_state = None):
+	"""
+	Performs a principal component analysis on the input data.
+	reference: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
+	parameters:
+		data: Arraylike of Reals representing the set of data to perform PCA on
+		* : refer to reference for usage, parameters follow same usage
+	return:
+		Arraylike of Reals representing the set of data that has had PCA performed. The dimensionality of the Arraylike may be smaller or equal.
+	"""
 	kernel = sklearn.decomposition.PCA(n_components = n_components, copy = copy, whiten = whiten, svd_solver = svd_solver, tol = tol, iterated_power = iterated_power, random_state = random_state)
 	return kernel.fit_transform(data)
 def decisiontree(data, labels, test_size = 0.3, criterion = "gini", splitter = "default", max_depth = None): #expects *2d data and 1d labels
+	"""
+	Generates a decision tree classifier fitted to the given data.
+	reference: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
+	parameters:
+		data: List of values representing each data point of multiple axes
+		labels: List of values representing the labels corresponding to the same index at data
+		* : refer to reference for usage, parameters follow same usage
+	return:
+		DecisionTreeClassifier model and corresponding classification accuracy metrics
+	"""
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
 	model = sklearn.tree.DecisionTreeClassifier(criterion = criterion, splitter = splitter, max_depth = max_depth)
 	model = model.fit(data_train,labels_train)
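As a usage sketch of the Metric wrappers documented above (the `from tra_analysis import Analysis as an` import path is assumed from the test suite; expected values come from the unit tests earlier in this diff):

from tra_analysis import Analysis as an  # assumed import path

metric = an.Metric()
print(metric.elo(1500, 1500, [1, 0], 400, 24))  # 1512.0 per the unit tests
print(metric.glicko2(1500, 250, 0.06, [1500, 1400], [250, 240], [1, 0]))  # (rating, rd, vol) tuple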

View File

@@ -4,9 +4,11 @@
 #    this should be imported as a python module using 'from tra_analysis import ClassificationMetric'
 # setup:
-__version__ = "1.0.1"
+__version__ = "1.0.2"
 __changelog__ = """changelog:
+	1.0.2:
+		- optimized imports
 	1.0.1:
 		- fixed __all__
 	1.0.0:
@@ -22,7 +24,6 @@ __all__ = [
 ]
 import sklearn
-from sklearn import metrics
 class ClassificationMetric():

View File

@@ -4,10 +4,12 @@
 #    this should be imported as a python module using 'from tra_analysis import Clustering'
 # setup:
-__version__ = "2.0.1"
+__version__ = "2.0.2"
 # changelog should be viewed using print(analysis.__changelog__)
 __changelog__ = """changelog:
+	2.0.2:
+		- generalized optional args to **kwargs
 	2.0.1:
 		- added normalization preprocessing to clustering, expects instance of sklearn.preprocessing.Normalizer()
 	2.0.0:
@@ -30,32 +32,32 @@ __all__ = [
 import sklearn
-def kmeans(data, normalizer = None, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"):
+def kmeans(data, normalizer = None, **kwargs):
 	if normalizer != None:
 		data = normalizer.transform(data)
-	kernel = sklearn.cluster.KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, tol = tol, precompute_distances = precompute_distances, verbose = verbose, random_state = random_state, copy_x = copy_x, n_jobs = n_jobs, algorithm = algorithm)
+	kernel = sklearn.cluster.KMeans(**kwargs)
 	kernel.fit(data)
 	predictions = kernel.predict(data)
 	centers = kernel.cluster_centers_
 	return centers, predictions
-def dbscan(data, normalizer=None, eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None):
+def dbscan(data, normalizer=None, **kwargs):
 	if normalizer != None:
 		data = normalizer.transform(data)
-	model = sklearn.cluster.DBSCAN(eps = eps, min_samples = min_samples, metric = metric, metric_params = metric_params, algorithm = algorithm, leaf_size = leaf_size, p = p, n_jobs = n_jobs).fit(data)
+	model = sklearn.cluster.DBSCAN(**kwargs).fit(data)
 	return model.labels_
-def spectral(data, normalizer=None, n_clusters=8, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False):
+def spectral(data, normalizer=None, **kwargs):
 	if normalizer != None:
 		data = normalizer.transform(data)
-	model = sklearn.cluster.SpectralClustering(n_clusters = n_clusters, eigen_solver = eigen_solver, n_components = n_components, random_state = random_state, n_init = n_init, gamma = gamma, affinity = affinity, n_neighbors = n_neighbors, eigen_tol = eigen_tol, assign_labels = assign_labels, degree = degree, coef0 = coef0, kernel_params = kernel_params, n_jobs = n_jobs).fit(data)
+	model = sklearn.cluster.SpectralClustering(**kwargs).fit(data)
 	return model.labels_
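A short sketch of how the generalized signatures are intended to be called: any keyword accepted by the underlying sklearn estimator is forwarded unchanged. Data and parameter values here are illustrative.

import sklearn.preprocessing
from tra_analysis import Clustering

data = [[1, 2], [1, 4], [10, 2], [10, 4]]
# optional normalization step, per the 2.0.1 changelog entry
normalizer = sklearn.preprocessing.Normalizer()
centers, predictions = Clustering.kmeans(data, normalizer = normalizer, n_clusters = 2, random_state = 0)
labels = Clustering.dbscan(data, eps = 3.0, min_samples = 2)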

View File

@@ -4,9 +4,13 @@
 #    this should be imported as a python module using 'from tra_analysis import CorrelationTest'
 # setup:
-__version__ = "1.0.1"
+__version__ = "1.0.3"
 __changelog__ = """changelog:
+	1.0.3:
+		- generalized optional args to **kwargs
+	1.0.2:
+		- optimized imports
 	1.0.1:
 		- fixed __all__
 	1.0.0:
@@ -29,7 +33,6 @@ __all__ = [
 ]
 import scipy
-from scipy import stats
 def anova_oneway(*args): #expects arrays of samples
@@ -41,9 +44,9 @@ def pearson(x, y):
 	results = scipy.stats.pearsonr(x, y)
 	return {"r-value": results[0], "p-value": results[1]}
-def spearman(a, b = None, axis = 0, nan_policy = 'propagate'):
-	results = scipy.stats.spearmanr(a, b = b, axis = axis, nan_policy = nan_policy)
+def spearman(a, b = None, **kwargs):
+	results = scipy.stats.spearmanr(a, b = b, **kwargs)
 	return {"r-value": results[0], "p-value": results[1]}
 def point_biserial(x, y):
@@ -51,17 +54,17 @@ def point_biserial(x, y):
 	results = scipy.stats.pointbiserialr(x, y)
 	return {"r-value": results[0], "p-value": results[1]}
-def kendall(x, y, initial_lexsort = None, nan_policy = 'propagate', method = 'auto'):
-	results = scipy.stats.kendalltau(x, y, initial_lexsort = initial_lexsort, nan_policy = nan_policy, method = method)
+def kendall(x, y, **kwargs):
+	results = scipy.stats.kendalltau(x, y, **kwargs)
 	return {"tau": results[0], "p-value": results[1]}
-def kendall_weighted(x, y, rank = True, weigher = None, additive = True):
-	results = scipy.stats.weightedtau(x, y, rank = rank, weigher = weigher, additive = additive)
+def kendall_weighted(x, y, **kwargs):
+	results = scipy.stats.weightedtau(x, y, **kwargs)
 	return {"tau": results[0], "p-value": results[1]}
-def mgc(x, y, compute_distance = None, reps = 1000, workers = 1, is_twosamp = False, random_state = None):
-	results = scipy.stats.multiscale_graphcorr(x, y, compute_distance = compute_distance, reps = reps, workers = workers, is_twosamp = is_twosamp, random_state = random_state)
+def mgc(x, y, **kwargs):
+	results = scipy.stats.multiscale_graphcorr(x, y, **kwargs)
 	return {"k-value": results[0], "p-value": results[1], "data": results[2]} # unsure if MGC test returns a k value

View File

@@ -4,9 +4,13 @@
 #    this should be imported as a python module using 'from tra_analysis import KNN'
 # setup:
-__version__ = "1.0.0"
+__version__ = "1.0.2"
 __changelog__ = """changelog:
+	1.0.2:
+		- generalized optional args to **kwargs
+	1.0.1:
+		- optimized imports
 	1.0.0:
 		- ported analysis.KNN() here
 		- removed classness
@@ -23,22 +27,21 @@ __all__ = [
 ]
 import sklearn
-from sklearn import model_selection, neighbors
 from . import ClassificationMetric, RegressionMetric
-def knn_classifier(data, labels, n_neighbors = 5, test_size = 0.3, algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=None, p=2, weights='uniform'): #expects *2d data and 1d labels post-scaling
+def knn_classifier(data, labels, n_neighbors = 5, test_size = 0.3, **kwargs): #expects *2d data and 1d labels post-scaling
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	model = sklearn.neighbors.KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights, algorithm = algorithm, leaf_size = leaf_size, p = p, metric = metric, metric_params = metric_params, n_jobs = n_jobs)
+	model = sklearn.neighbors.KNeighborsClassifier(n_neighbors = n_neighbors, **kwargs)
 	model.fit(data_train, labels_train)
 	predictions = model.predict(data_test)
 	return model, ClassificationMetric(predictions, labels_test)
-def knn_regressor(data, outputs, n_neighbors = 5, test_size = 0.3, weights = "uniform", algorithm = "auto", leaf_size = 30, p = 2, metric = "minkowski", metric_params = None, n_jobs = None):
+def knn_regressor(data, outputs, n_neighbors = 5, test_size = 0.3, **kwargs):
 	data_train, data_test, outputs_train, outputs_test = sklearn.model_selection.train_test_split(data, outputs, test_size=test_size, random_state=1)
-	model = sklearn.neighbors.KNeighborsRegressor(n_neighbors = n_neighbors, weights = weights, algorithm = algorithm, leaf_size = leaf_size, p = p, metric = metric, metric_params = metric_params, n_jobs = n_jobs)
+	model = sklearn.neighbors.KNeighborsRegressor(n_neighbors = n_neighbors, **kwargs)
 	model.fit(data_train, outputs_train)
 	predictions = model.predict(data_test)
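A usage sketch under the new signature, with illustrative toy data: n_neighbors and test_size keep explicit defaults, while everything else is forwarded to sklearn.neighbors.KNeighborsClassifier.

from tra_analysis import KNN

data = [[0, 0], [0, 1], [1, 0], [1, 1]] * 5  # illustrative toy data
labels = [0, 1, 1, 0] * 5
model, cmetric = KNN.knn_classifier(data, labels, n_neighbors = 3, weights = "distance", p = 1)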

View File

@@ -4,9 +4,13 @@
 #    this should be imported as a python module using 'from tra_analysis import NaiveBayes'
 # setup:
-__version__ = "1.0.0"
+__version__ = "1.0.2"
 __changelog__ = """changelog:
+	1.0.2:
+		- generalized optional args to **kwargs
+	1.0.1:
+		- optimized imports
 	1.0.0:
 		- ported analysis.NaiveBayes() here
 		- removed classness
@@ -18,46 +22,45 @@ __author__ = (
 __all__ = [
 	'gaussian',
-	'multinomial'
+	'multinomial',
 	'bernoulli',
-	'complement'
+	'complement',
 ]
 import sklearn
-from sklearn import model_selection, naive_bayes
-from . import ClassificationMetric, RegressionMetric
+from . import ClassificationMetric
-def gaussian(data, labels, test_size = 0.3, priors = None, var_smoothing = 1e-09):
+def gaussian(data, labels, test_size = 0.3, **kwargs):
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	model = sklearn.naive_bayes.GaussianNB(priors = priors, var_smoothing = var_smoothing)
+	model = sklearn.naive_bayes.GaussianNB(**kwargs)
 	model.fit(data_train, labels_train)
 	predictions = model.predict(data_test)
 	return model, ClassificationMetric(predictions, labels_test)
-def multinomial(data, labels, test_size = 0.3, alpha=1.0, fit_prior=True, class_prior=None):
+def multinomial(data, labels, test_size = 0.3, **kwargs):
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	model = sklearn.naive_bayes.MultinomialNB(alpha = alpha, fit_prior = fit_prior, class_prior = class_prior)
+	model = sklearn.naive_bayes.MultinomialNB(**kwargs)
 	model.fit(data_train, labels_train)
 	predictions = model.predict(data_test)
 	return model, ClassificationMetric(predictions, labels_test)
-def bernoulli(data, labels, test_size = 0.3, alpha=1.0, binarize=0.0, fit_prior=True, class_prior=None):
+def bernoulli(data, labels, test_size = 0.3, **kwargs):
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	model = sklearn.naive_bayes.BernoulliNB(alpha = alpha, binarize = binarize, fit_prior = fit_prior, class_prior = class_prior)
+	model = sklearn.naive_bayes.BernoulliNB(**kwargs)
 	model.fit(data_train, labels_train)
 	predictions = model.predict(data_test)
 	return model, ClassificationMetric(predictions, labels_test)
-def complement(data, labels, test_size = 0.3, alpha=1.0, fit_prior=True, class_prior=None, norm=False):
+def complement(data, labels, test_size = 0.3, **kwargs):
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	model = sklearn.naive_bayes.ComplementNB(alpha = alpha, fit_prior = fit_prior, class_prior = class_prior, norm = norm)
+	model = sklearn.naive_bayes.ComplementNB(**kwargs)
 	model.fit(data_train, labels_train)
 	predictions = model.predict(data_test)
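A sketch of the generalized wrappers: options like alpha and fit_prior are now plain sklearn kwargs rather than named parameters. Data is illustrative (non-negative counts, as MultinomialNB requires).

from tra_analysis import NaiveBayes

data = [[1, 0], [2, 1], [0, 3], [4, 1], [2, 2], [3, 0]] * 4
labels = [0, 0, 1, 0, 1, 0] * 4
model, cmetric = NaiveBayes.multinomial(data, labels, test_size = 0.3, alpha = 0.5, fit_prior = False)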

View File

@@ -4,9 +4,14 @@
 #    this should be imported as a python module using 'from tra_analysis import RandomForest'
 # setup:
-__version__ = "1.0.1"
+__version__ = "1.0.3"
 __changelog__ = """changelog:
+	1.0.3:
+		- updated RandomForestClassifier and RandomForestRegressor parameters to match sklearn v 1.0.2
+		- changed default values for kwargs to rely on sklearn
+	1.0.2:
+		- optimized imports
 	1.0.1:
 		- fixed __all__
 	1.0.0:
@@ -23,23 +28,22 @@ __all__ = [
 	"random_forest_regressor",
 ]
-import sklearn
-from sklearn import ensemble, model_selection
+import sklearn, sklearn.ensemble, sklearn.naive_bayes
 from . import ClassificationMetric, RegressionMetric
-def random_forest_classifier(data, labels, test_size, n_estimators, criterion="gini", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None):
+def random_forest_classifier(data, labels, test_size, n_estimators, **kwargs):
 	data_train, data_test, labels_train, labels_test = sklearn.model_selection.train_test_split(data, labels, test_size=test_size, random_state=1)
-	kernel = sklearn.ensemble.RandomForestClassifier(n_estimators = n_estimators, criterion = criterion, max_depth = max_depth, min_samples_split = min_samples_split, min_samples_leaf = min_samples_leaf, min_weight_fraction_leaf = min_weight_fraction_leaf, max_leaf_nodes = max_leaf_nodes, min_impurity_decrease = min_impurity_decrease, bootstrap = bootstrap, oob_score = oob_score, n_jobs = n_jobs, random_state = random_state, verbose = verbose, warm_start = warm_start, class_weight = class_weight)
+	kernel = sklearn.ensemble.RandomForestClassifier(n_estimators = n_estimators, **kwargs)
 	kernel.fit(data_train, labels_train)
 	predictions = kernel.predict(data_test)
 	return kernel, ClassificationMetric(predictions, labels_test)
-def random_forest_regressor(data, outputs, test_size, n_estimators, criterion="mse", max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False):
+def random_forest_regressor(data, outputs, test_size, n_estimators, **kwargs):
 	data_train, data_test, outputs_train, outputs_test = sklearn.model_selection.train_test_split(data, outputs, test_size=test_size, random_state=1)
-	kernel = sklearn.ensemble.RandomForestRegressor(n_estimators = n_estimators, criterion = criterion, max_depth = max_depth, min_samples_split = min_samples_split, min_weight_fraction_leaf = min_weight_fraction_leaf, max_features = max_features, max_leaf_nodes = max_leaf_nodes, min_impurity_decrease = min_impurity_decrease, min_impurity_split = min_impurity_split, bootstrap = bootstrap, oob_score = oob_score, n_jobs = n_jobs, random_state = random_state, verbose = verbose, warm_start = warm_start)
+	kernel = sklearn.ensemble.RandomForestRegressor(n_estimators = n_estimators, **kwargs)
 	kernel.fit(data_train, outputs_train)
 	predictions = kernel.predict(data_test)
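A sketch of the updated signature with illustrative data: test_size and n_estimators stay positional, and all remaining options are forwarded as sklearn kwargs.

from tra_analysis import RandomForest

data = [[0, 0], [0, 1], [1, 0], [1, 1]] * 10
labels = [0, 1, 1, 0] * 10
kernel, cmetric = RandomForest.random_forest_classifier(data, labels, 0.3, 100, criterion = "entropy", max_depth = 5, random_state = 0)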

View File

@@ -4,9 +4,11 @@
 #    this should be imported as a python module using 'from tra_analysis import RegressionMetric'
 # setup:
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 __changelog__ = """changelog:
+	1.0.1:
+		- optimized imports
 	1.0.0:
 		- ported analysis.RegressionMetric() here
 """
@@ -21,7 +23,6 @@ __all__ = [
 import numpy as np
 import sklearn
-from sklearn import metrics
 class RegressionMetric():

View File

@@ -4,9 +4,11 @@
 #    this should be imported as a python module using 'from tra_analysis import SVM'
 # setup:
-__version__ = "1.0.2"
+__version__ = "1.0.3"
 __changelog__ = """changelog:
+	1.0.3:
+		- optimized imports
 	1.0.2:
 		- fixed __all__
 	1.0.1:
@@ -30,7 +32,6 @@ __all__ = [
 ]
 import sklearn
-from sklearn import svm
 from . import ClassificationMetric, RegressionMetric
class CustomKernel: class CustomKernel:

View File

@@ -16,7 +16,7 @@ __changelog__ = """changelog:
 __author__ = (
 	"Arthur Lu <learthurgo@gmail.com>",
-	"James Pan <zpan@imsa.edu>"
+	"James Pan <zpan@imsa.edu>",
 )
 __all__ = [

View File

@@ -4,9 +4,11 @@
 #    this should be imported as a python module using 'from tra_analysis import StatisticalTest'
 # setup:
-__version__ = "1.0.2"
+__version__ = "1.0.3"
 __changelog__ = """changelog:
+	1.0.3:
+		- optimized imports
 	1.0.2:
 		- added tukey_multicomparison
 		- fixed styling
@@ -61,7 +63,6 @@ __all__ = [
 import numpy as np
 import scipy
-from scipy import stats, interpolate
 def ttest_onesample(a, popmean, axis = 0, nan_policy = 'propagate'):
@@ -279,9 +280,9 @@ def get_tukeyQcrit(k, df, alpha=0.05):
 	cv001 = c[:, 2::2]
 	if alpha == 0.05:
-		intp = interpolate.interp1d(crows, cv005[:,k-2])
+		intp = scipy.interpolate.interp1d(crows, cv005[:,k-2])
 	elif alpha == 0.01:
-		intp = scipy.interpolate.interp1d(crows, cv001[:,k-2])
+		intp = scipy.interpolate.interp1d(crows, cv001[:,k-2])
 	else:
 		raise ValueError('only implemented for alpha equal to 0.01 and 0.05')
 	return intp(df)
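An illustrative call of the Tukey multicomparison backed by the interpolation above; note that get_tukeyQcrit only supports alpha of 0.05 or 0.01, per the ValueError branch. The sample data and the unpacking of the [Q, significant] result pairs are assumptions from the unit test's output format.

from tra_analysis import StatisticalTest

groups = [[1, 2, 3, 4, 5], [2, 3, 4, 5, 6], [3, 4, 5, 6, 7]]
results = StatisticalTest.tukey_multicomparison(groups)
for pair, (q_value, significant) in results.items():
	print(pair, q_value, significant)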

View File

@@ -15,6 +15,9 @@ __changelog__ = """changelog:
 		- deprecated all *_obj.py compatibility modules
 		- deprecated titanlearn.py
 		- deprecated visualization.py
+		- removed matplotlib from requirements
+		- removed extra submodule imports in Analysis
+		- added typehinting, docstrings for each function
 	3.0.0:
 		- incremented version to release 3.0.0
 	3.0.0-rc2:
@@ -44,6 +47,7 @@ __all__ = [
 	"Analysis",
 	"Array",
 	"ClassificationMetric",
+	"Clustering",
 	"CorrelationTest",
 	"Expression",
 	"Fit",

View File

@@ -0,0 +1,24 @@
+# Titan Robotics Team 2022: Metrics submodule
+# Written by Arthur Lu
+# Notes:
+#    this should be imported as a python module using 'from tra_analysis import metrics'
+# setup:
+__version__ = "1.0.0"
+__changelog__ = """changelog:
+	1.0.0:
+		- implemented elo, glicko2, trueskill
+"""
+__author__ = (
+	"Arthur Lu <learthurgo@gmail.com>",
+)
+__all__ = {
+	"Expression"
+}
+from . import elo
+from . import glicko2
+from . import trueskill
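A minimal import sketch for the new submodule; the argument order of elo.calculate follows its use in Analysis.py (Elo.calculate(starting_score, opposing_score, observed, N, K)), and the values are illustrative.

from tra_analysis import metrics

new_score = metrics.elo.calculate(1500, 1500, [1, 0], 400, 24)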