From 13d26c716b42c63d5453bc68ade87929c9815ab5 Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Wed, 19 Jun 2024 08:40:14 +0200
Subject: [PATCH] clustering 2D + 3D

---
 main.py                    |  10 ++-
 src/back/clustering_csv.py | 131 +++++++++++++++++++++++++++++++++++++
 src/back/load_csv.py       |   2 +-
 3 files changed, 139 insertions(+), 4 deletions(-)
 create mode 100644 src/back/clustering_csv.py

diff --git a/main.py b/main.py
index 31aa73b..a80ee2d 100755
--- a/main.py
+++ b/main.py
@@ -1,17 +1,21 @@
 #!/usr/bin/env python3
+
 import sys
 sys.path.append('./src/back/')
 
 import load_csv as l
 import show_csv as s
+import clustering_csv as c
 
 df = l.return_csv("./data.csv")
 
 l.csv_value(df)
 l.csv_value(df)
 
-# l.csv_stadardisation_Z(df,"Vehicle Year")
+# l.csv_standardisation_Z(df,"Vehicle Year")
+
+# s.histo_col(df,"Speed Limit")
 
-s.histo_col(df,"Speed Limit")
+# s.plotBoxWhisker(df)
 
-s.plotBoxWhisker(df)
+c.launch_cluster(df,['Speed Limit','Vehicle Year'])
diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
new file mode 100644
--- /dev/null
+++ b/src/back/clustering_csv.py
@@ -0,0 +1,131 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.datasets import make_blobs, make_moons
+from mpl_toolkits.mplot3d import Axes3D
+
+
+def _axis_labels(feature_names, count):
+    """Return `count` axis labels, using "Feature N" where no name is given."""
+    names = list(feature_names) if feature_names is not None else []
+    return [names[i] if i < len(names) else f"Feature {i + 1}" for i in range(count)]
+
+
+def visualize_clusters_2d(X, labels, centers=None, title="Clusters", feature_names=None):
+    """Scatter-plot the first two columns of X, coloured by cluster label.
+
+    X: 2-D array, one row per sample and one column per feature.
+    labels: one cluster label per row of X, used as the point colour.
+    centers: optional array of cluster centres, drawn as large red markers.
+    title: figure title.
+    feature_names: optional axis labels; defaults to "Feature 1", "Feature 2".
+    """
+    x_label, y_label = _axis_labels(feature_names, 2)
+    plt.figure(figsize=(10, 7))
+    plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
+    plt.title(title)
+    plt.xlabel(x_label)
+    plt.ylabel(y_label)
+    plt.show()
+
+
+def visualize_clusters_3d(X, labels, centers=None, title="Clusters", feature_names=None):
+    """Scatter-plot the first three columns of X in 3D, coloured by cluster label.
+
+    Takes the same arguments as visualize_clusters_2d; feature_names defaults
+    to "Feature 1", "Feature 2", "Feature 3".
+    """
+    x_label, y_label, z_label = _axis_labels(feature_names, 3)
+    fig = plt.figure(figsize=(10, 7))
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='red', s=200, alpha=0.75)
+    ax.set_title(title)
+    ax.set_xlabel(x_label)
+    ax.set_ylabel(y_label)
+    ax.set_zlabel(z_label)
+    plt.show()
+
+
+def calculate_cluster_statistics_kmeans(X, labels, centers):
+    """Return one dict per K-Means cluster: its label, point count and centre.
+
+    X is not needed to count the points but stays in the signature so
+    existing callers keep working.
+    """
+    unique_labels, counts = np.unique(labels, return_counts=True)
+    return [
+        {'cluster': label, 'num_points': int(count), 'center': centers[label]}
+        for label, count in zip(unique_labels, counts)
+    ]
+
+
+def calculate_cluster_statistics_dbscan(X, labels):
+    """Return one dict per DBSCAN cluster: its label, point count and density.
+
+    Density is the point count divided by the volume of the cluster's
+    axis-aligned bounding box. Noise points (label -1) are skipped.
+    """
+    stats = []
+    for label in np.unique(labels):
+        if label == -1:
+            continue  # Ignore noise
+        cluster_points = X[labels == label]
+        num_points = len(cluster_points)
+        extent = np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)
+        volume = float(np.prod(extent))
+        # A cluster that is flat along any axis (for example identical values
+        # in a discrete column) has zero volume; report an infinite density
+        # explicitly rather than dividing by zero.
+        density = num_points / volume if volume > 0 else float('inf')
+        stats.append({
+            'cluster': label,
+            'num_points': num_points,
+            'density': density
+        })
+    return stats
+
+
+def launch_cluster(df, array_columns, n_clusters=4, eps=0.2, min_samples=5, random_state=42):
+    """Cluster the selected columns with K-Means and DBSCAN and plot both results.
+
+    df: DataFrame holding the data.
+    array_columns: names of the numeric columns to cluster on. Exactly three
+        columns give 3D plots; any other count plots the first two columns.
+    n_clusters, random_state: forwarded to KMeans.
+    eps, min_samples: forwarded to DBSCAN. eps is a distance in the units of
+        the data, so standardise the columns first if their scales differ.
+
+    Returns a dict with the per-cluster statistics under 'kmeans' and 'dbscan'.
+    Raises ValueError if fewer than two columns are given.
+    """
+    if len(array_columns) < 2:
+        raise ValueError("launch_cluster needs at least two columns to plot")
+
+    # KMeans and DBSCAN reject NaN values, so drop incomplete rows first.
+    X = df[array_columns].dropna().values
+
+    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
+    labels_kmeans = kmeans.fit_predict(X)
+    centers_kmeans = kmeans.cluster_centers_
+    stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
+
+    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
+    labels_dbscan = dbscan.fit_predict(X)
+    stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)
+
+    if len(array_columns) == 3:
+        visualize_clusters_3d(X, labels_kmeans, centers_kmeans,
+                              title="K-Means Clustering 3D", feature_names=array_columns)
+        visualize_clusters_3d(X, labels_dbscan,
+                              title="DBSCAN Clustering 3D", feature_names=array_columns)
+    else:
+        visualize_clusters_2d(X, labels_kmeans, centers_kmeans,
+                              title="K-Means Clustering", feature_names=array_columns)
+        visualize_clusters_2d(X, labels_dbscan,
+                              title="DBSCAN Clustering", feature_names=array_columns)
+
+    return {'kmeans': stats_kmeans, 'dbscan': stats_dbscan}
diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 870fe72..7b8eeeb 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -28,7 +28,7 @@ def csv_norm_min_max(df,col):
     df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
     return df
 
-def csv_stadardisation_Z(df,col):
+def csv_standardisation_Z(df,col):
     mean_col1 = df[col].mean()
     std_col1 = df[col].std()
     df[col] = (df[col] - mean_col1) / std_col1