From 2140ea8f7787737fa72482834579faef178a862a Mon Sep 17 00:00:00 2001 From: jlevine18 Date: Thu, 20 Dec 2018 21:45:05 -0600 Subject: [PATCH] started visualization module --- data analysis/visualization.py | 130 +++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 data analysis/visualization.py diff --git a/data analysis/visualization.py b/data analysis/visualization.py new file mode 100644 index 00000000..b888ef3b --- /dev/null +++ b/data analysis/visualization.py @@ -0,0 +1,130 @@ +#Titan Robotics Team 2022: Visualization Module +#Written by Arthur Lu & Jacob Levine +#Notes: +# this should be imported as a python module using 'import visualization' +# this should be included in the local directory or environment variable +# this module has not been optimized for multhreaded computing +#Number of easter eggs: Jake is Jewish and does not observe easter. +#setup: + +__version__ = "1.0.0.001" + +#changelog should be viewed using print(analysis.__changelog__) +__changelog__ = """changelog: +1.0.0.xxx: + -added basic plotting, clustering, and regression comparisons""" +__author__ = ( + "Arthur Lu , " + "Jacob Levine ," + ) +__all__ = [ + 'affinity_prop', + 'bar_graph', + 'dbscan', + 'kmeans', + 'line_plot', + 'pca_comp', + 'regression_comp', + 'scatter_plot', + 'spectral', + 'vis_2d' + ] +#imports +import matplotlib.pyplot as plt +from sklearn.decomposition import PCA, KernelPCA, IncrementalPCA +from sklearn.preprocessing import StandardScaler +from sklearn.cluster import AffinityPropagation, DBSCAN, KMeans, SpectralClustering +import statistics + +#bar of x,y +def bar_graph(x,y): + x=np.asarray(x) + y=np.asarray(y) + plt.bar(x,y) + plt.show() + +#scatter of x,y +def scatter_plot(x,y): + x=np.asarray(x) + y=np.asarray(y) + plt.scatter(x,y) + plt.show() + +#line of x,y +def line_plot(x,y): + x=np.asarray(x) + y=np.asarray(y) + plt.scatter(x,y) + plt.show() + +#plot data + regression fit +def regression_comp(x,y,reg): + x=np.asarray(x) + y=np.asarray(y) + regx=np.arange(x.min(),x.max(),(x.max()-x.min())/1000)) + regy=[] + for i in regx: + regy.append(eval(reg[0].replace("z",str(i)))) + regy=np.asarray(regy) + plt.scatter(x,y) + plt.plot(regx,regy,color="orange",linewidth=3) + plt.text(.85*max([x.max(),regx.max()]),.95*max([y.max(),regy.max()]), + u"R\u00b2="+str(round(reg[2],5)), + horizontalalignment='center', verticalalignment='center') + plt.text(.85*max([x.max(),regx.max()]),.85*max([y.max(),regy.max()]), + "MSE="+str(round(reg[1],5)), + horizontalalignment='center', verticalalignment='center') + plt.show() + +#PCA to compress down to 2d +def pca_comp(big_multidim): + pca=PCA(n_components=2) + td_norm=StandardScaler().fit_transform(big_multidim) + td_pca=pca.fit_transform(td_norm) + return td_pca + +#one-stop visualization of multidim datasets +def vis_2d(big_multidim): + td_pca=pca_comp(big_multidim) + plt.scatter(td_pca[:,0], td_pca[:,1]) + +def cluster_vis(data, cluster_assign): + pca=PCA(n_components=2) + td_norm=StandardScaler().fit_transform(data) + td_pca=pca.fit_transform(td_norm) + colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', + '#f781bf', '#a65628', '#984ea3', + '#999999', '#e41a1c', '#dede00']), + int(max(clu) + 1)))) + colors = np.append(colors, ["#000000"]) + plt.figure(figsize=(8, 8)) + plt.scatter(td_norm[:, 0], td_norm[:, 1], s=10, color=colors[cluster_assign]) + plt.show() + +#affinity prop- slow, but ok if you don't have any idea how many you want +def affinity_prop(data, damping=.77, preference=-70): + td_norm=StandardScaler().fit_transform(data) + db = AffinityPropagation(damping=damping,preference=preference).fit(td) + y=db.predict(td_norm) + return y + +#DBSCAN- slightly faster but can label your dataset as all outliers +def dbscan(data, eps=.3): + td_norm=StandardScaler().fit_transform(data) + db = DBSCAN(eps=eps).fit(td) + y=db.labels_.astype(np.int) + return y + +#K-means clustering- the classic +def kmeans(data, num_clusters): + td_norm=StandardScaler().fit_transform(data) + db = KMeans(n_clusters=num_clusters).fit(td) + y=db.labels_.astype(np.int) + return y + +#Spectral Clustering- Seems to work really well +def spectral(data, num_clusters): + td_norm=StandardScaler().fit_transform(data) + db = SpectralClustering(n_clusters=num_clusters).fit(td) + y=db.labels_.astype(np.int) + return y