added normalization preprocessing to Clustering

added unit tests for normalized clustering
This commit is contained in:
Arthur Lu 2021-07-26 18:17:42 +00:00
parent 3e99869d5d
commit 3606a072c4
2 changed files with 20 additions and 5 deletions

View File

@ -235,10 +235,14 @@ def test_equation():
def test_clustering(): def test_clustering():
normalizer = sklearn.preprocessing.Normalizer()
data = X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]]) data = X = np.array([[1, 2], [2, 2], [2, 3], [8, 7], [8, 8], [25, 80]])
assert Clustering.dbscan(data, eps=3, min_samples=2).tolist() == [0, 0, 0, 1, 1, -1] assert Clustering.dbscan(data, eps=3, min_samples=2).tolist() == [0, 0, 0, 1, 1, -1]
assert Clustering.dbscan(data, normalizer=normalizer, eps=3, min_samples=2).tolist() == [0, 0, 0, 0, 0, 0]
data = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]]) data = np.array([[1, 1], [2, 1], [1, 0], [4, 7], [3, 5], [3, 6]])
assert Clustering.spectral(data, n_clusters=2, assign_labels='discretize', random_state=0).tolist() == [1, 1, 1, 0, 0, 0] assert Clustering.spectral(data, n_clusters=2, assign_labels='discretize', random_state=0).tolist() == [1, 1, 1, 0, 0, 0]
assert Clustering.spectral(data, normalizer=normalizer, n_clusters=2, assign_labels='discretize', random_state=0).tolist() == [0, 1, 1, 0, 0, 0]

View File

@ -4,10 +4,12 @@
# this should be imported as a python module using 'from tra_analysis import Clustering' # this should be imported as a python module using 'from tra_analysis import Clustering'
# setup: # setup:
__version__ = "2.0.0" __version__ = "2.0.1"
# changelog should be viewed using print(analysis.__changelog__) # changelog should be viewed using print(analysis.__changelog__)
__changelog__ = """changelog: __changelog__ = """changelog:
2.0.1:
- added normalization preprocessing to clustering, expects instance of sklearn.preprocessing.Normalizer()
2.0.0: 2.0.0:
- added dbscan clustering algo - added dbscan clustering algo
- added spectral clustering algo - added spectral clustering algo
@ -28,7 +30,10 @@ __all__ = [
import sklearn import sklearn
def kmeans(data, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"): def kmeans(data, normalizer = None, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.0001, precompute_distances="auto", verbose=0, random_state=None, copy_x=True, n_jobs=None, algorithm="auto"):
if normalizer != None:
data = normalizer.transform(data)
kernel = sklearn.cluster.KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, tol = tol, precompute_distances = precompute_distances, verbose = verbose, random_state = random_state, copy_x = copy_x, n_jobs = n_jobs, algorithm = algorithm) kernel = sklearn.cluster.KMeans(n_clusters = n_clusters, init = init, n_init = n_init, max_iter = max_iter, tol = tol, precompute_distances = precompute_distances, verbose = verbose, random_state = random_state, copy_x = copy_x, n_jobs = n_jobs, algorithm = algorithm)
kernel.fit(data) kernel.fit(data)
@ -37,13 +42,19 @@ def kmeans(data, n_clusters=8, init="k-means++", n_init=10, max_iter=300, tol=0.
return centers, predictions return centers, predictions
def dbscan(data, normalizer=None, eps=0.5, min_samples=5, metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None):
    """Cluster `data` with sklearn's DBSCAN and return the per-sample labels.

    Args:
        data: samples to cluster; passed to `normalizer.transform` (when a
            normalizer is given) and then to `DBSCAN.fit`.
        normalizer: optional preprocessing object exposing `transform(data)`,
            e.g. an instance of `sklearn.preprocessing.Normalizer`. When
            None (the default), `data` is clustered as-is.
        eps, min_samples, metric, metric_params, algorithm, leaf_size, p,
            n_jobs: forwarded unchanged to `sklearn.cluster.DBSCAN`.

    Returns:
        The fitted model's `labels_` attribute (one cluster label per sample).
    """
    # Identity comparison: `!= None` would dispatch to a user-defined __ne__,
    # which can return a non-boolean or misleading result for some objects.
    if normalizer is not None:
        data = normalizer.transform(data)
    # NOTE(review): relies on the `sklearn.cluster` submodule being loaded;
    # the visible module-level import is only `import sklearn` -- confirm.
    model = sklearn.cluster.DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=metric,
        metric_params=metric_params,
        algorithm=algorithm,
        leaf_size=leaf_size,
        p=p,
        n_jobs=n_jobs,
    ).fit(data)
    return model.labels_
def spectral(data, n_clusters=8, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False): def spectral(data, normalizer=None, n_clusters=8, eigen_solver=None, n_components=None, random_state=None, n_init=10, gamma=1.0, affinity='rbf', n_neighbors=10, eigen_tol=0.0, assign_labels='kmeans', degree=3, coef0=1, kernel_params=None, n_jobs=None, verbose=False):
if normalizer != None:
data = normalizer.transform(data)
model = sklearn.cluster.SpectralClustering(n_clusters = n_clusters, eigen_solver = eigen_solver, n_components = n_components, random_state = random_state, n_init = n_init, gamma = gamma, affinity = affinity, n_neighbors = n_neighbors, eigen_tol = eigen_tol, assign_labels = assign_labels, degree = degree, coef0 = coef0, kernel_params = kernel_params, n_jobs = n_jobs).fit(data) model = sklearn.cluster.SpectralClustering(n_clusters = n_clusters, eigen_solver = eigen_solver, n_components = n_components, random_state = random_state, n_init = n_init, gamma = gamma, affinity = affinity, n_neighbors = n_neighbors, eigen_tol = eigen_tol, assign_labels = assign_labels, degree = degree, coef0 = coef0, kernel_params = kernel_params, n_jobs = n_jobs).fit(data)