diff --git a/README.md b/README.md
index 76eb91c..9666161 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-
+# **Pow**
diff --git a/Documentation/assets/PickaxePowBackground.png b/assets/PickaxePowBackground.png
similarity index 100%
rename from Documentation/assets/PickaxePowBackground.png
rename to assets/PickaxePowBackground.png
diff --git a/main.py b/main.py
index 1f987b5..51e80d2 100755
--- a/main.py
+++ b/main.py
@@ -1,7 +1,23 @@
 #!/usr/bin/env python3
+
 import sys
 sys.path.append('./src/back/')
 import load_csv as l
+import show_csv as s
+import clustering_csv as c
+
+df = l.return_csv("./data.csv")
+l.csv_value(df)
+
+# l.csv_standardisation_Z(df, "Vehicle Year")
+
+# l.csv_robust_normalize(df, "Speed Limit")
+
+# s.histo_col(df, "Speed Limit")
+
+# s.plotBoxWhisker(df)
-l.csv_value()
+c.launch_cluster(df, ['Speed Limit', 'Vehicle Year'])
diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
new file mode 100644
index 0000000..497d38f
--- /dev/null
+++ b/src/back/clustering_csv.py
@@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.cluster import KMeans, DBSCAN
+from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 (registers the '3d' projection)
+
+def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
+    plt.figure(figsize=(10, 7))
+    plt.scatter(X[:, 0], X[:, 1], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, alpha=0.75)
+    plt.title(title)
+    plt.xlabel("Feature 1")
+    plt.ylabel("Feature 2")
+    plt.show()
+
+def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
+    fig = plt.figure(figsize=(10, 7))
+    ax = fig.add_subplot(111, projection='3d')
+    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels, s=50, cmap='viridis')
+    if centers is not None:
+        ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c='red', s=200, alpha=0.75)
+    ax.set_title(title)
+    ax.set_xlabel("Feature 1")
+    ax.set_ylabel("Feature 2")
+    ax.set_zlabel("Feature 3")
+    plt.show()
+
+def calculate_cluster_statistics_kmeans(X, labels, centers):
+    unique_labels = np.unique(labels)
+    stats = []
+    for label in unique_labels:
+        cluster_points = X[labels == label]
+        num_points = len(cluster_points)
+        center = centers[label]
+        stats.append({
+            'cluster': label,
+            'num_points': num_points,
+            'center': center
+        })
+    return stats
+
+def calculate_cluster_statistics_dbscan(X, labels):
+    unique_labels = np.unique(labels)
+    stats = []
+    for label in unique_labels:
+        if label == -1:
+            continue  # Ignore noise
+        cluster_points = X[labels == label]
+        num_points = len(cluster_points)
+        # Density = points per unit of bounding-box volume
+        density = num_points / (np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)).prod()
+        stats.append({
+            'cluster': label,
+            'num_points': num_points,
+            'density': density
+        })
+    return stats
+
+def launch_cluster(df, array_columns):
+    X = df[array_columns].values
+
+    kmeans = KMeans(n_clusters=4, random_state=42)
+    labels_kmeans = kmeans.fit_predict(X)
+    centers_kmeans = kmeans.cluster_centers_
+
+    stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
+    # for stat in stats_kmeans:
+    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}")
+
+    # Apply DBSCAN
+    dbscan = DBSCAN(eps=0.2, min_samples=5)
+    labels_dbscan = dbscan.fit_predict(X)
+    stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)
+    # for stat in stats_dbscan:
+    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
+    if len(array_columns) == 3:
+        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
+        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
+    else:
+        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
+        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
+    return stats_kmeans, stats_dbscan
+
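Note: launch_cluster clusters the raw column values directly. 'Speed Limit' and 'Vehicle Year' live on very different scales, so DBSCAN with eps=0.2 will typically flag almost every point as noise, and K-Means distances will be dominated by the larger-magnitude column. A minimal sketch of scaling the features first (hypothetical usage; assumes the data.csv columns referenced in main.py):

    from sklearn.preprocessing import StandardScaler
    import load_csv as l
    import clustering_csv as c

    df = l.return_csv("./data.csv")
    cols = ['Speed Limit', 'Vehicle Year']
    df = df.dropna(subset=cols)                          # both algorithms reject NaN rows
    df[cols] = StandardScaler().fit_transform(df[cols])  # zero mean, unit variance per column
    stats_kmeans, stats_dbscan = c.launch_cluster(df, cols)
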
title="K-Means Clustering 3D") + visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D") + else: + visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering") + visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering") + return stats_kmeans,stats_dbscan + diff --git a/src/back/load_csv.py b/src/back/load_csv.py index 03db63a..5aa10f2 100644 --- a/src/back/load_csv.py +++ b/src/back/load_csv.py @@ -1,20 +1,48 @@ import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +def return_csv(path): + df = pd.read_csv(path) + return df -def csv_value(): - df = pd.read_csv('./data.csv') - # print(df.head()) - +def csv_value(df): #print all detail - # df.info() - + df.info() # Print number of missing value for each column - # print(df.isna().sum()) - + print(df.isna().sum()) # Useless values - # Off-Road Description -> 156170 - # Municipality -> 152979 - # Related Non-Motorist -> 166642 - # Non-Motorist Substance Abuse -> 167788 - # Circumstance -> 140746 + + +def csv_check(df): + for col in df: + print("-"*12) + print(col) + print("-"*12) + print(df[col].unique()) + + +def csv_norm_min_max(df,col): + maValue = df[col].max + miValue = df[col].min + df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min()) return df + +def csv_standardisation_Z(df,col): + mean_col1 = df[col].mean() + std_col1 = df[col].std() + df[col] = (df[col] - mean_col1) / std_col1 + return df[col] + +def csv_robust_normalize(df, column): + # Calcul de la médiane et de l'IQR + median = df[column].median() + q1 = df[column].quantile(0.25) + q3 = df[column].quantile(0.75) + iqr = q3 - q1 + + # Application de la normalisation robuste + normalized_column = (df[column] - median) / iqr + df[column] = normalized_column + print (normalized_column) + return normalized_column diff --git a/src/back/show_csv.py b/src/back/show_csv.py new file mode 100644 index 0000000..93d9973 --- /dev/null +++ b/src/back/show_csv.py @@ -0,0 +1,16 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +def histo_col(df,colonne): + plt.figure() + plt.hist(df[colonne], bins=int(df[colonne].nunique()/4), alpha=0.7, color='blue', edgecolor='black') + plt.title(f"Histogramme de la colonne '{colonne}'") + plt.xlabel(colonne) + plt.ylabel("Fréquence") + plt.grid(True) + plt.show() + +def plotBoxWhisker(df): + df.plot(kind='box', subplots=True, sharex=False, sharey=False) + plt.show()