From 1d67cad31a9d196fcc71664f64ea8bc634250191 Mon Sep 17 00:00:00 2001
From: dohodin
Date: Wed, 5 Jun 2024 09:50:36 +0200
Subject: [PATCH 1/7] moving assets folder to root

---
 .../assets => assets}/PickaxePowBackground.png | Bin
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename {Documentation/assets => assets}/PickaxePowBackground.png (100%)

diff --git a/Documentation/assets/PickaxePowBackground.png b/assets/PickaxePowBackground.png
similarity index 100%
rename from Documentation/assets/PickaxePowBackground.png
rename to assets/PickaxePowBackground.png

From 89d7d4dd9437b05b46945dc4c6db7d347c9bc0fd Mon Sep 17 00:00:00 2001
From: Dorian HODIN
Date: Fri, 7 Jun 2024 10:13:13 +0200
Subject: [PATCH 2/7] Update 'README.md'
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 76eb91c..9666161 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-<img src="Documentation/assets/PickaxePowBackground.png">
+<img src="assets/PickaxePowBackground.png">
 
 # **Pow**
From f83f35c498fcc7f8e58368d050b349b914625197 Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Fri, 7 Jun 2024 10:56:19 +0200
Subject: [PATCH 3/7] added data normalization

---
 main.py              |  7 ++++++-
 src/back/load_csv.py | 41 ++++++++++++++++++++++++++++-------------
 2 files changed, 34 insertions(+), 14 deletions(-)

diff --git a/main.py b/main.py
index 1f987b5..8a8212c 100755
--- a/main.py
+++ b/main.py
@@ -4,4 +4,9 @@ sys.path.append('./src/back/')
 
 import load_csv as l
 
-l.csv_value()
+df = l.return_csv("./data.csv")
+l.csv_value(df)
+
+l.csv_value(df)
+
+l.csv_stadadisation_Z(df,"Vehicle Year")

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 03db63a..4438e2b 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -1,20 +1,35 @@
 import pandas as pd
+import numpy as np
 
 
+def return_csv(path):
+    df = pd.read_csv(path)
+    return df
 
-def csv_value():
-    df = pd.read_csv('./data.csv')
-    # print(df.head())
-
+def csv_value(df):
     # print all details
-    # df.info()
-
+    df.info()
     # print the number of missing values for each column
-    # print(df.isna().sum())
-
+    print(df.isna().sum())
     # Useless values
-    # Off-Road Description -> 156170
-    # Municipality -> 152979
-    # Related Non-Motorist -> 166642
-    # Non-Motorist Substance Abuse -> 167788
-    # Circumstance -> 140746
+
+
+def csv_check(df):
+    for col in df:
+        print("-"*12)
+        print(col)
+        print("-"*12)
+        print(df[col].unique())
+
+
+def csv_norm_min_max(df,col):
+    max_value = df[col].max()
+    min_value = df[col].min()
+    df[col] = (df[col] - min_value) / (max_value - min_value)
     return df
+
+def csv_stadadisation_Z(df,col):
+    mean_col1 = df[col].mean()
+    std_col1 = df[col].std()
+    df[col] = (df[col] - mean_col1) / std_col1
+    return df[col]
+
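Note: the two helpers added here both rescale a single column in place: csv_norm_min_max maps values onto [0, 1], and csv_stadadisation_Z (renamed csv_standardisation_Z later in the series) centers to zero mean and unit variance. A minimal usage sketch, assuming the data.csv and "Vehicle Year" column that main.py already references:

    import sys
    sys.path.append('./src/back/')
    import load_csv as l

    df = l.return_csv("./data.csv")

    # min-max scaling: maps the column onto [0, 1]
    df = l.csv_norm_min_max(df, "Vehicle Year")

    # Z-score standardization: zero mean, unit variance
    # (both helpers mutate df[col] in place, so this standardizes
    # the already min-max-scaled values)
    z = l.csv_stadadisation_Z(df, "Vehicle Year")
    print(z.head())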
From c1dfaa7b6836222abbbfd7fd64a152d4de9a4ebe Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Wed, 19 Jun 2024 08:08:00 +0200
Subject: [PATCH 4/7] adding histogram and box-plot helpers

---
 main.py              |  7 ++++++-
 src/back/load_csv.py |  5 +++--
 src/back/show_csv.py | 16 ++++++++++++++++
 3 files changed, 25 insertions(+), 3 deletions(-)
 create mode 100644 src/back/show_csv.py

diff --git a/main.py b/main.py
index 8a8212c..31aa73b 100755
--- a/main.py
+++ b/main.py
@@ -3,10 +3,15 @@ import sys
 sys.path.append('./src/back/')
 
 import load_csv as l
+import show_csv as s
 
 df = l.return_csv("./data.csv")
 l.csv_value(df)
 
 l.csv_value(df)
 
-l.csv_stadadisation_Z(df,"Vehicle Year")
+# l.csv_stadardisation_Z(df,"Vehicle Year")
+
+s.histo_col(df,"Speed Limit")
+
+s.plotBoxWhisker(df)

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 4438e2b..870fe72 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import matplotlib.pyplot as plt
 
 def return_csv(path):
     df = pd.read_csv(path)
@@ -27,9 +28,9 @@ def csv_norm_min_max(df,col):
     df[col] = (df[col] - min_value) / (max_value - min_value)
     return df
 
-def csv_stadadisation_Z(df,col):
+def csv_stadardisation_Z(df,col):
     mean_col1 = df[col].mean()
     std_col1 = df[col].std()
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
-
+

diff --git a/src/back/show_csv.py b/src/back/show_csv.py
new file mode 100644
index 0000000..93d9973
--- /dev/null
+++ b/src/back/show_csv.py
@@ -0,0 +1,16 @@
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+def histo_col(df,colonne):
+    plt.figure()
+    plt.hist(df[colonne], bins=max(int(df[colonne].nunique()/4), 1), alpha=0.7, color='blue', edgecolor='black')
+    plt.title(f"Histogram of column '{colonne}'")
+    plt.xlabel(colonne)
+    plt.ylabel("Frequency")
+    plt.grid(True)
+    plt.show()
+
+def plotBoxWhisker(df):
+    df.plot(kind='box', subplots=True, sharex=False, sharey=False)
+    plt.show()
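Note: histo_col derives its bin count from the column's distinct values (roughly nunique/4, clamped to at least 1), and plotBoxWhisker draws one box-and-whisker subplot per numeric column. A short driving sketch, again assuming the data.csv and the "Speed Limit" column used in main.py:

    import sys
    sys.path.append('./src/back/')
    import load_csv as l
    import show_csv as s

    df = l.return_csv("./data.csv")

    # histogram of one column, bins chosen from its cardinality
    s.histo_col(df, "Speed Limit")

    # box plots for every numeric column in the DataFrame
    s.plotBoxWhisker(df)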
From 13d26c716b42c63d5453bc68ade87929c9815ab5 Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Wed, 19 Jun 2024 08:40:14 +0200
Subject: [PATCH 5/7] clustering 2D + 3D

---
 main.py                    | 10 +++--
 src/back/clustering_csv.py | 82 ++++++++++++++++++++++++++++++++++++++
 src/back/load_csv.py       |  2 +-
 3 files changed, 90 insertions(+), 4 deletions(-)
 create mode 100644 src/back/clustering_csv.py

diff --git a/main.py b/main.py
index 31aa73b..a80ee2d 100755
--- a/main.py
+++ b/main.py
@@ -1,17 +1,21 @@
 #!/usr/bin/env python3
+
 import sys
 sys.path.append('./src/back/')
 
 import load_csv as l
 import show_csv as s
+import clustering_csv as c
 
 df = l.return_csv("./data.csv")
 l.csv_value(df)
 
 l.csv_value(df)
 
-# l.csv_stadardisation_Z(df,"Vehicle Year")
+# l.csv_standardisation_Z(df,"Vehicle Year")
+
+# s.histo_col(df,"Speed Limit")
 
-s.histo_col(df,"Speed Limit")
+# s.plotBoxWhisker(df)
 
-s.plotBoxWhisker(df)
+c.launch_cluster(df,['Speed Limit','Vehicle Year'])

diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
new file mode 100644
index 0000000..94ca7d9
--- /dev/null
+++ b/src/back/clustering_csv.py
@@ -0,0 +1,82 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans, DBSCAN
+from sklearn.datasets import make_blobs, make_moons
+from mpl_toolkits.mplot3d import Axes3D
+
+def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
+    plt.figure(figsize=(10, 7))
+    plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
+    plt.title(title)
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()
+
+def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
+    fig = plt.figure(figsize=(10, 7))
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='red', s=200, alpha=0.75)
+    ax.set_title(title)
+    ax.set_xlabel("Feature 1")
+    ax.set_ylabel("Feature 2")
+    ax.set_zlabel("Feature 3")
+    plt.show()
+
+def calculate_cluster_statistics_kmeans(X, labels, centers):
+    unique_labels = np.unique(labels)
+    stats = []
+    for label in unique_labels:
+        cluster_points = X[labels == label]
+        num_points = len(cluster_points)
+        center = centers[label]
+        stats.append({
+            'cluster': label,
+            'num_points': num_points,
+            'center': center
+        })
+    return stats
+
+def calculate_cluster_statistics_dbscan(X, labels):
+    unique_labels = np.unique(labels)
+    stats = []
+    for label in unique_labels:
+        if label == -1:
+            continue  # ignore noise
+        cluster_points = X[labels == label]
+        num_points = len(cluster_points)
+        density = num_points / (np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)).prod()
+        stats.append({
+            'cluster': label,
+            'num_points': num_points,
+            'density': density
+        })
+    return stats
+
+def launch_cluster(df,array_columns):
+    X = df[array_columns].values
+
+    kmeans = KMeans(n_clusters=4, random_state=42)
+    labels_kmeans = kmeans.fit_predict(X)
+    centers_kmeans = kmeans.cluster_centers_
+
+    stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
+    # for stat in stats_kmeans:
+    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}")
+
+    # Apply DBSCAN
+    dbscan = DBSCAN(eps=0.2, min_samples=5)
+    labels_dbscan = dbscan.fit_predict(X)
+    # stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)
+    # for stat in stats_dbscan:
+    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
+    if len(array_columns) == 3:
+        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
+        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
+    else:
+        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
+        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
+

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 870fe72..7b8eeeb 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -28,7 +28,7 @@ def csv_norm_min_max(df,col):
     df[col] = (df[col] - min_value) / (max_value - min_value)
     return df
 
-def csv_stadardisation_Z(df,col):
+def csv_standardisation_Z(df,col):
     mean_col1 = df[col].mean()
     std_col1 = df[col].std()
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
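Note: launch_cluster runs K-Means (k=4) and DBSCAN on the selected columns and switches between 2D and 3D scatter plots based on how many columns are passed. DBSCAN's eps=0.2 only makes sense when the features are on comparable scales, so a usage sketch that normalizes first (file name and columns assumed from main.py; the dropna is an extra precaution, since scikit-learn's KMeans rejects NaN input):

    import sys
    sys.path.append('./src/back/')
    import load_csv as l
    import clustering_csv as c

    df = l.return_csv("./data.csv")
    df = df.dropna(subset=["Speed Limit", "Vehicle Year"])

    # bring both features onto [0, 1] so eps=0.2 is a sensible radius
    df = l.csv_norm_min_max(df, "Speed Limit")
    df = l.csv_norm_min_max(df, "Vehicle Year")

    # two columns -> 2D plots; a third column would take the 3D path
    c.launch_cluster(df, ["Speed Limit", "Vehicle Year"])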
From 629e9c7023ddccd1aee2276c658fb92212f91317 Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Wed, 19 Jun 2024 09:17:36 +0200
Subject: [PATCH 6/7] norm_robust

---
 main.py              |  4 +++-
 src/back/load_csv.py | 12 ++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index a80ee2d..b00f3e0 100755
--- a/main.py
+++ b/main.py
@@ -14,8 +14,10 @@ l.csv_value(df)
 
 # l.csv_standardisation_Z(df,"Vehicle Year")
 
+# l.csv_robust_normalize(df,"Speed Limit")
+
 # s.histo_col(df,"Speed Limit")
 
 # s.plotBoxWhisker(df)
 
-c.launch_cluster(df,['Speed Limit','Vehicle Year'])
+# c.launch_cluster(df,['Speed Limit','Vehicle Year'])

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 7b8eeeb..5aa10f2 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -34,3 +34,15 @@ def csv_standardisation_Z(df,col):
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
 
+def csv_robust_normalize(df, column):
+    # compute the median and the IQR
+    median = df[column].median()
+    q1 = df[column].quantile(0.25)
+    q3 = df[column].quantile(0.75)
+    iqr = q3 - q1
+
+    # apply robust normalization
+    normalized_column = (df[column] - median) / iqr
+    df[column] = normalized_column
+    print(normalized_column)
+    return normalized_column

From 49786aedb52dd7114faab4f4e21574d5943489be Mon Sep 17 00:00:00 2001
From: "aurian.jault"
Date: Wed, 19 Jun 2024 09:31:51 +0200
Subject: [PATCH 7/7] stat clusters

---
 main.py                    | 2 +-
 src/back/clustering_csv.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/main.py b/main.py
index b00f3e0..51e80d2 100755
--- a/main.py
+++ b/main.py
@@ -20,4 +20,4 @@ l.csv_value(df)
 
 # s.plotBoxWhisker(df)
 
-# c.launch_cluster(df,['Speed Limit','Vehicle Year'])
+c.launch_cluster(df,['Speed Limit','Vehicle Year'])

diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
index 94ca7d9..497d38f 100644
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -70,7 +70,7 @@ def launch_cluster(df,array_columns):
     # Apply DBSCAN
     dbscan = DBSCAN(eps=0.2, min_samples=5)
     labels_dbscan = dbscan.fit_predict(X)
-    # stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)
+    stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)
     # for stat in stats_dbscan:
     #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
     if len(array_columns) == 3:
@@ -79,4 +79,5 @@ def launch_cluster(df,array_columns):
     else:
         visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
         visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
+    return stats_kmeans,stats_dbscan
 
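Note: with this final patch, launch_cluster returns both statistics lists, so the reporting loops that remain commented out inside clustering_csv.py can live in the caller instead. A sketch mirroring those same print formats (file name and columns assumed as before):

    import sys
    sys.path.append('./src/back/')
    import load_csv as l
    import clustering_csv as c

    df = l.return_csv("./data.csv").dropna(subset=["Speed Limit", "Vehicle Year"])
    stats_kmeans, stats_dbscan = c.launch_cluster(df, ["Speed Limit", "Vehicle Year"])

    # per-cluster summaries computed by calculate_cluster_statistics_*
    for stat in stats_kmeans:
        print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}")

    for stat in stats_dbscan:
        print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")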