Add PCA for dimensionality reduction on clustering
continuous-integration/drone/push Build is passing Details

pull/23/head
remrem 10 months ago
parent 69aa8c58b2
commit 816bf3a237

@ -3,6 +3,7 @@ import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, DBSCAN from sklearn.cluster import KMeans, DBSCAN
from sklearn.datasets import make_blobs, make_moons from sklearn.datasets import make_blobs, make_moons
from mpl_toolkits.mplot3d import Axes3D from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
def visualize_clusters_2d(X, labels, centers=None, title="Clusters"): def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
plt.figure(figsize=(10, 7)) plt.figure(figsize=(10, 7))
@ -56,8 +57,11 @@ def calculate_cluster_statistics_dbscan(X, labels):
}) })
return stats return stats
def launch_cluster_knn(df, array_columns, n=3): def launch_cluster_knn(df, array_columns, n=3, dimensions=2):
X = df[array_columns].values X = df[array_columns].values
if len(array_columns) > 3:
pca = PCA(dimensions)
X = pca.fit_transform(df)
kmeans = KMeans(n_clusters=n, random_state=42) kmeans = KMeans(n_clusters=n, random_state=42)
labels_kmeans = kmeans.fit_predict(X) labels_kmeans = kmeans.fit_predict(X)
@ -66,19 +70,23 @@ def launch_cluster_knn(df, array_columns, n=3):
# print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}") # print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}")
stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans) stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
if len(array_columns) == 3: if dimensions == 3:
return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D") return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
else: else:
return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering") return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
def launch_cluster_dbscan(df, array_columns, dimensions=2):
    """Run DBSCAN on the selected columns and return the cluster plot.

    The selected columns are extracted from ``df``. When more columns are
    selected than the requested number of plot dimensions, the data is first
    projected onto ``dimensions`` principal components with PCA so that the
    clustering and the visualization operate on the same space.

    Args:
        df: DataFrame holding the data; only ``array_columns`` are used.
        array_columns: Names of the columns to cluster on.
        dimensions: Target dimensionality for the plot (2 or 3). Defaults
            to 2.

    Returns:
        Whatever ``visualize_clusters_3d`` (when ``dimensions == 3``) or
        ``visualize_clusters_2d`` (otherwise) returns -- presumably a
        matplotlib figure, since the caller passes it to ``st.pyplot``;
        confirm against those helpers.
    """
    X = df[array_columns].values

    # Reduce only when there is something to reduce. The projection must be
    # fitted on the selected columns (X), not on the whole DataFrame, which
    # may contain unselected or non-numeric columns.
    if len(array_columns) > dimensions:
        pca = PCA(n_components=dimensions)
        X = pca.fit_transform(X)

    dbscan = DBSCAN(eps=0.2, min_samples=5)
    labels_dbscan = dbscan.fit_predict(X)

    # Statistics are computed but not returned yet; kept so the call stays
    # in place for debugging/inspection as in the original code.
    stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)

    if dimensions == 3:
        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
    return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")

@ -30,18 +30,22 @@ if 'df' in st.session_state:
with tab1: with tab1:
st.header("Clustering") st.header("Clustering")
selected_columns = handle_column_multiselect(df, "clustering") selected_columns = handle_column_multiselect(df, "clustering")
if len(selected_columns) >= 3:
dimensions = st.radio("Reduce to dimensions X with PCA:",[2,3],index=0)
else:
dimensions = 2
tab_names = ["K-means", "DBSCAN"] tab_names = ["K-means", "DBSCAN"]
tab11, tab12 = st.tabs(tab_names) tab11, tab12 = st.tabs(tab_names)
with tab11: with tab11:
if st.button(f"Start {tab_names[0]}"): if st.button(f"Start {tab_names[0]}"):
st.pyplot(cc.launch_cluster_knn(df, selected_columns)) st.pyplot(cc.launch_cluster_knn(df, selected_columns, dimensions=dimensions))
with tab12: with tab12:
if st.button(f"Start {tab_names[1]}"): if st.button(f"Start {tab_names[1]}"):
st.pyplot(cc.launch_cluster_dbscan(df, selected_columns)) st.pyplot(cc.launch_cluster_dbscan(df, selected_columns, dimensions))
with tab2: with tab2:
st.header("Predictions") st.header("Predictions")

Loading…
Cancel
Save