commit
384f815b7e
Before Width: | Height: | Size: 140 KiB After Width: | Height: | Size: 140 KiB |
@ -1,7 +1,23 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
sys.path.append('./src/back/')
|
sys.path.append('./src/back/')
|
||||||
|
|
||||||
import load_csv as l
|
import load_csv as l
|
||||||
|
import show_csv as s
|
||||||
|
import clustering_csv as c
|
||||||
|
|
||||||
|
# Load the dataset once; every step below works on this DataFrame.
df = l.return_csv("./data.csv")

# Print the DataFrame summary and the per-column missing-value counts.
# (The pre-commit call `l.csv_value()` is gone: csv_value now requires `df`,
# and the summary only needs to be printed once.)
l.csv_value(df)

# Optional preprocessing and exploratory plots; uncomment to run.
# l.csv_standardisation_Z(df,"Vehicle Year")
# l.csv_robust_normalize(df,"Speed Limit")
# s.histo_col(df,"Speed Limit")
# s.plotBoxWhisker(df)

# Run K-Means and DBSCAN on the two selected columns and plot the result.
c.launch_cluster(df,['Speed Limit','Vehicle Year'])
|
||||||
|
@ -0,0 +1,83 @@
|
|||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
from sklearn.cluster import KMeans, DBSCAN
|
||||||
|
from sklearn.datasets import make_blobs, make_moons
|
||||||
|
from mpl_toolkits.mplot3d import Axes3D
|
||||||
|
|
||||||
|
def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
    """Show a 2D scatter plot of the first two columns of ``X``, coloured by label.

    Args:
        X: 2D array of points; columns 0 and 1 are plotted.
        labels: per-point values used to colour the points.
        centers: optional 2D array of cluster centres, drawn as large red
            markers when given.
        title: figure title.
    """
    plt.figure(figsize=(10, 7))

    point_style = dict(c=labels, s=50, cmap='viridis')
    plt.scatter(X[:, 0], X[:, 1], **point_style)

    # Centres are optional (e.g. DBSCAN has none).
    if centers is not None:
        center_style = dict(c='red', s=200, alpha=0.75)
        plt.scatter(centers[:, 0], centers[:, 1], **center_style)

    for apply_text, text in (
        (plt.title, title),
        (plt.xlabel, "Feature 1"),
        (plt.ylabel, "Feature 2"),
    ):
        apply_text(text)

    plt.show()
|
||||||
|
|
||||||
|
def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
    """Show a 3D scatter plot of the first three columns of ``X``, coloured by label.

    Args:
        X: 2D array of points; columns 0, 1 and 2 are plotted.
        labels: per-point values used to colour the points.
        centers: optional 2D array of cluster centres, drawn as large red
            markers when given.
        title: axes title.
    """
    figure = plt.figure(figsize=(10, 7))
    axes = figure.add_subplot(111, projection='3d')

    point_style = dict(c=labels, s=50, cmap='viridis')
    axes.scatter(X[:, 0], X[:, 1], X[:, 2], **point_style)

    # Centres are optional (e.g. DBSCAN has none).
    if centers is not None:
        center_style = dict(c='red', s=200, alpha=0.75)
        axes.scatter(centers[:, 0], centers[:, 1], centers[:, 2], **center_style)

    for apply_text, text in (
        (axes.set_title, title),
        (axes.set_xlabel, "Feature 1"),
        (axes.set_ylabel, "Feature 2"),
        (axes.set_zlabel, "Feature 3"),
    ):
        apply_text(text)

    plt.show()
|
||||||
|
|
||||||
|
def calculate_cluster_statistics_kmeans(X, labels, centers):
    """Summarise each cluster by its label, size and centre.

    Args:
        X: array of points, indexable by a boolean mask built from ``labels``.
        labels: array with one cluster label per point.
        centers: array of cluster centres, indexed by cluster label.

    Returns:
        A list with one dict per distinct label (in ``np.unique`` order),
        holding the keys ``'cluster'``, ``'num_points'`` and ``'center'``.
    """
    return [
        {
            'cluster': cluster_id,
            # Number of points whose label equals this cluster id.
            'num_points': len(X[labels == cluster_id]),
            'center': centers[cluster_id],
        }
        for cluster_id in np.unique(labels)
    ]
|
||||||
|
|
||||||
|
def calculate_cluster_statistics_dbscan(X, labels):
    """Summarise each DBSCAN cluster by its label, size and bounding-box density.

    Density is the number of points divided by the volume of the cluster's
    axis-aligned bounding box (product of the per-feature max - min).

    Args:
        X: array of points, indexable by a boolean mask built from ``labels``.
        labels: array with one cluster label per point; ``-1`` marks noise.

    Returns:
        A list with one dict per non-noise label (in ``np.unique`` order),
        holding the keys ``'cluster'``, ``'num_points'`` and ``'density'``.
        ``'density'`` is ``inf`` when the bounding box has zero volume.
    """
    stats = []
    for label in np.unique(labels):
        if label == -1:
            continue  # Ignore noise

        cluster_points = X[labels == label]
        num_points = len(cluster_points)

        extent = np.max(cluster_points, axis=0) - np.min(cluster_points, axis=0)
        volume = extent.prod()
        # A single-point cluster, or one where some feature is constant, has a
        # zero-volume box. Report infinite density explicitly instead of
        # dividing by zero, which emits a RuntimeWarning.
        density = num_points / volume if volume != 0 else np.inf

        stats.append({
            'cluster': label,
            'num_points': num_points,
            'density': density,
        })
    return stats
|
||||||
|
|
||||||
|
def launch_cluster(df, array_columns, n_clusters=4, eps=0.2, min_samples=5):
    """Cluster the selected columns with K-Means and DBSCAN, plot and summarise.

    Args:
        df: DataFrame holding the data.
        array_columns: list of column labels used as features, in plotting
            order.
        n_clusters: number of K-Means clusters (defaults to the previously
            hard-coded 4).
        eps: DBSCAN neighbourhood radius (defaults to the previously
            hard-coded 0.2).
        min_samples: DBSCAN minimum neighbourhood size (defaults to the
            previously hard-coded 5).

    Returns:
        A tuple ``(stats_kmeans, stats_dbscan)`` with the per-cluster
        statistics produced by ``calculate_cluster_statistics_kmeans`` and
        ``calculate_cluster_statistics_dbscan``.
    """
    X = df[array_columns].values

    # K-Means; fixed random_state keeps the result reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels_kmeans = kmeans.fit_predict(X)
    centers_kmeans = kmeans.cluster_centers_
    stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)

    # Apply DBSCAN
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    labels_dbscan = dbscan.fit_predict(X)
    stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan)

    if len(array_columns) == 3:
        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
    elif len(array_columns) >= 2:
        # Any other column count is shown in 2D using the first two features.
        # With fewer than two columns there is nothing to plot, so skip the
        # figures rather than fail on X[:, 1] after the clustering is done.
        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")

    return stats_kmeans, stats_dbscan
|
||||||
|
|
@ -1,20 +1,48 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def return_csv(path):
    """Read the CSV at ``path`` and return it as a pandas DataFrame."""
    return pd.read_csv(path)
|
||||||
|
|
||||||
def csv_value(df):
    """Print a summary of ``df`` followed by its per-column missing-value counts.

    Args:
        df: DataFrame to describe.
    """
    # Print all details of the DataFrame.
    df.info()

    # Print the number of missing values for each column.
    missing_per_column = df.isna().sum()
    print(missing_per_column)
|
||||||
|
|
||||||
# Useless values
|
# Useless values
|
||||||
# Off-Road Description -> 156170
|
|
||||||
# Municipality -> 152979
|
|
||||||
# Related Non-Motorist -> 166642
|
def csv_check(df):
    """Print every column name between two dashed rules, then its unique values.

    Args:
        df: DataFrame whose columns are listed.
    """
    rule = "-" * 12
    for column_name in df:
        # Header: rule, column name, rule - one per line.
        print(rule, column_name, rule, sep="\n")
        distinct_values = df[column_name].unique()
        print(distinct_values)
|
||||||
|
|
||||||
|
|
||||||
|
def csv_norm_min_max(df,col):
    """Min-max scale column ``col`` of ``df`` in place to the range [0, 1].

    Each value becomes ``(x - min) / (max - min)``.

    Args:
        df: DataFrame to modify.
        col: label of the column to rescale.

    Returns:
        The same ``df`` object, with ``col`` replaced by its scaled values.
        A constant column (max == min) maps to 0.0 instead of NaN.
    """
    lowest = df[col].min()
    span = df[col].max() - lowest
    # A constant column has zero span; dividing by 1 instead turns 0/0 (NaN)
    # into 0.0 while leaving genuinely missing values as NaN.
    if span == 0:
        span = 1
    df[col] = (df[col] - lowest) / span
    return df
|
||||||
|
|
||||||
|
def csv_standardisation_Z(df,col):
    """Z-score standardise column ``col`` of ``df`` in place.

    Each value becomes ``(x - mean) / std``, using the column's pandas
    ``mean()`` and ``std()``.

    Args:
        df: DataFrame to modify.
        col: label of the column to standardise.

    Returns:
        The standardised column, read back from ``df``.
    """
    series = df[col]
    centred = series - series.mean()
    df[col] = centred / series.std()
    return df[col]
|
||||||
|
|
||||||
|
def csv_robust_normalize(df, column):
    """Robust-scale ``column`` of ``df`` in place: ``(x - median) / IQR``.

    The result is also printed to stdout.

    Args:
        df: DataFrame to modify.
        column: label of the column to rescale.

    Returns:
        The scaled column as a Series. When the IQR is zero the values are
        only centred on the median instead of being divided by zero.
    """
    values = df[column]

    # Compute the median and the interquartile range (IQR).
    median = values.median()
    iqr = values.quantile(0.75) - values.quantile(0.25)

    # If at least half of the values are identical the IQR can be zero, and
    # dividing by it would give inf/NaN; fall back to a unit scale.
    scale = iqr if iqr != 0 else 1.0

    # Apply the robust normalisation.
    normalized_column = (values - median) / scale
    df[column] = normalized_column
    print(normalized_column)
    return normalized_column
|
||||||
|
@ -0,0 +1,16 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
|
def histo_col(df,colonne):
    """Show a histogram of column ``colonne`` of ``df``.

    The number of bins is a quarter of the column's distinct-value count,
    with a minimum of one bin.

    Args:
        df: DataFrame holding the data.
        colonne: label of the column to plot.
    """
    # With fewer than four distinct values the quotient is 0, and hist
    # rejects bins=0; clamp to at least one bin.
    n_bins = max(1, df[colonne].nunique() // 4)

    plt.figure()
    plt.hist(df[colonne], bins=n_bins, alpha=0.7, color='blue', edgecolor='black')
    plt.title(f"Histogramme de la colonne '{colonne}'")
    plt.xlabel(colonne)
    plt.ylabel("Fréquence")
    plt.grid(True)
    plt.show()
|
||||||
|
|
||||||
|
def plotBoxWhisker(df):
    """Show a box-and-whisker plot of ``df`` with one subplot per column.

    Args:
        df: DataFrame to plot; the subplots do not share x or y axes.
    """
    box_options = {
        'kind': 'box',
        'subplots': True,
        'sharex': False,
        'sharey': False,
    }
    df.plot(**box_options)
    plt.show()
|
Loading…
Reference in new issue