From 267a2b8013439d45978a81db5e296610c9b329d3 Mon Sep 17 00:00:00 2001 From: dorian Date: Wed, 19 Jun 2024 09:17:43 +0200 Subject: [PATCH 1/4] Adding managing missing values --- src/back/managing_missing_values.py | 83 +++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 src/back/managing_missing_values.py diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py new file mode 100644 index 0000000..745641d --- /dev/null +++ b/src/back/managing_missing_values.py @@ -0,0 +1,83 @@ +import pandas as pd +from sklearn.impute import KNNImputer +from sklearn.linear_model import LinearRegression +import numpy as np +import load_csv as l + +def convert_categorical_to_numeric(data): + for column in data.columns: + if data[column].nunique() <= 15: + data[column] = data[column].astype('category') + data[column] = data[column].cat.codes.replace(-1, np.nan) + 1 + else: + data = data.drop(column, axis=1) + return data + +def drop_high_null_percentage(data, threshold=0.5): + missing_percentage = data.isnull().mean() + data = data.loc[:, missing_percentage <= threshold] + return data + + + +def replace_with_mean(data, column): + data[column] = data[column].fillna(data[column].mean()) + return data + +def replace_with_median(data, column): + data[column] = data[column].fillna(data[column].median()) + return data + +def replace_with_mode(data, column): + mode_value = data[column].mode() + if not mode_value.empty: + data[column] = data[column].fillna(mode_value[0]) + return data + +def impute_with_knn(data, column, n_neighbors=5): + imputer = KNNImputer(n_neighbors=n_neighbors) + data[[column]] = imputer.fit_transform(data[[column]]) + return data + +def impute_with_regression(data, column): + if data[column].isnull().sum() > 0: + train_data = data[data[column].notna()] + test_data = data[data[column].isna()] + if not train_data.empty and not test_data.empty: + regressor = LinearRegression() + regressor.fit(train_data.drop(column, axis=1), train_data[column]) + data.loc[data[column].isna(), column] = regressor.predict(test_data.drop(column, axis=1)) + return data + + +""" + Parameters: + - data: Pandas DataFrame with the data + - method: Method to handle missing values ('drop', 'mean', 'median', 'mode', 'knn', 'regression') + - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn') +""" +def handle_missing_values(data, method, column, n_neighbors=5): + + data = drop_high_null_percentage(data) + data = convert_categorical_to_numeric(data) + + if method == 'mean': + return replace_with_mean(data, column) + elif method == 'median': + return replace_with_median(data, column) + elif method == 'mode': + return replace_with_mode(data, column) + elif method == 'knn': + return impute_with_knn(data, column, n_neighbors=n_neighbors) + elif method == 'regression': + return impute_with_regression(data, column) + elif method == 'drop_high_null': + return drop_high_null_percentage(data) + else: + raise ValueError("Unknown method") + + + +data = l.return_csv('./data.csv') +cleaned_data = handle_missing_values(data, method='mode', column='Route Type') +print(cleaned_data) From 022437dad86dabf2371569dec55284c34150e506 Mon Sep 17 00:00:00 2001 From: dorian Date: Wed, 19 Jun 2024 09:22:48 +0200 Subject: [PATCH 2/4] Mean median and other function effect all data and now only column, removing useless columns --- src/back/managing_missing_values.py | 62 +++++++++++------------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py index 745641d..8360cf1 100644 --- a/src/back/managing_missing_values.py +++ b/src/back/managing_missing_values.py @@ -19,34 +19,28 @@ def drop_high_null_percentage(data, threshold=0.5): return data +def replace_with_mean(data): + return data.apply(lambda col: col.fillna(col.mean()) if col.dtype.kind in 'biufc' else col) -def replace_with_mean(data, column): - data[column] = data[column].fillna(data[column].mean()) - return data - -def replace_with_median(data, column): - data[column] = data[column].fillna(data[column].median()) - return data +def replace_with_median(data): + return data.apply(lambda col: col.fillna(col.median()) if col.dtype.kind in 'biufc' else col) -def replace_with_mode(data, column): - mode_value = data[column].mode() - if not mode_value.empty: - data[column] = data[column].fillna(mode_value[0]) - return data +def replace_with_mode(data): + return data.apply(lambda col: col.fillna(col.mode()[0]) if col.mode().size > 0 else col) -def impute_with_knn(data, column, n_neighbors=5): +def impute_with_knn(data, n_neighbors=5): imputer = KNNImputer(n_neighbors=n_neighbors) - data[[column]] = imputer.fit_transform(data[[column]]) - return data + return pd.DataFrame(imputer.fit_transform(data), columns=data.columns) -def impute_with_regression(data, column): - if data[column].isnull().sum() > 0: - train_data = data[data[column].notna()] - test_data = data[data[column].isna()] - if not train_data.empty and not test_data.empty: - regressor = LinearRegression() - regressor.fit(train_data.drop(column, axis=1), train_data[column]) - data.loc[data[column].isna(), column] = regressor.predict(test_data.drop(column, axis=1)) +def impute_with_regression(data): + for column in data.columns: + if data[column].isnull().sum() > 0: + train_data = data[data[column].notna()] + test_data = data[data[column].isna()] + if not train_data.empty and not test_data.empty: + regressor = LinearRegression() + regressor.fit(train_data.drop(column, axis=1), train_data[column]) + data.loc[data[column].isna(), column] = regressor.predict(test_data.drop(column, axis=1)) return data @@ -56,28 +50,20 @@ def impute_with_regression(data, column): - method: Method to handle missing values ('drop', 'mean', 'median', 'mode', 'knn', 'regression') - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn') """ -def handle_missing_values(data, method, column, n_neighbors=5): +def handle_missing_values(data, method, n_neighbors=5): data = drop_high_null_percentage(data) - data = convert_categorical_to_numeric(data) - + data = convert_categorical_to_numeric(data) if method == 'mean': - return replace_with_mean(data, column) + return replace_with_mean(data) elif method == 'median': - return replace_with_median(data, column) + return replace_with_median(data) elif method == 'mode': - return replace_with_mode(data, column) + return replace_with_mode(data) elif method == 'knn': - return impute_with_knn(data, column, n_neighbors=n_neighbors) + return impute_with_knn(data, n_neighbors=n_neighbors) elif method == 'regression': - return impute_with_regression(data, column) - elif method == 'drop_high_null': - return drop_high_null_percentage(data) + return impute_with_regression(data) else: raise ValueError("Unknown method") - - -data = l.return_csv('./data.csv') -cleaned_data = handle_missing_values(data, method='mode', column='Route Type') -print(cleaned_data) From 2cbcbeaf7ecb10808a3ee97f42ad4bdf019b047e Mon Sep 17 00:00:00 2001 From: dorian Date: Fri, 21 Jun 2024 14:31:19 +0200 Subject: [PATCH 3/4] Adding prediction file --- src/back/prediction.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 src/back/prediction.py diff --git a/src/back/prediction.py b/src/back/prediction.py new file mode 100644 index 0000000..09c7556 --- /dev/null +++ b/src/back/prediction.py @@ -0,0 +1,19 @@ +from sklearn.model_selection import train_test_split +from sklearn.linear_model import LinearRegression +from sklearn.ensemble import RandomForestRegressor + + +def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction): + predictors = df[columns] + target = df[columnGoal] + X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42) + + if algoOfPrediction == "Régression Linéaire": + model = LinearRegression() + elif algoOfPrediction == "Forêt Aléatoire": + model = RandomForestRegressor(n_estimators=100) + else: + raise NameError("No method name : \"" + algoOfPrediction + "\"") + + model.fit(X_train, y_train) + return model.predict(X_test) \ No newline at end of file From efab9853027da0435f9388dc50138fd7191763bc Mon Sep 17 00:00:00 2001 From: "aurian.jault" Date: Fri, 21 Jun 2024 15:56:37 +0200 Subject: [PATCH 4/4] split function --- main.py | 2 +- src/back/clustering_csv.py | 31 ++++++++++++++++++++++++++++++- src/back/load_csv.py | 3 +++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/main.py b/main.py index 51e80d2..d39f93a 100755 --- a/main.py +++ b/main.py @@ -20,4 +20,4 @@ l.csv_value(df) # s.plotBoxWhisker(df) -c.launch_cluster(df,['Speed Limit','Vehicle Year']) +c.launch_cluster(df,['Speed Limit','Vehicle Year','Longitude']) diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py index 497d38f..dcb170d 100644 --- a/src/back/clustering_csv.py +++ b/src/back/clustering_csv.py @@ -56,6 +56,35 @@ def calculate_cluster_statistics_dbscan(X, labels): }) return stats +def launch_cluster_knn(df,array_columns,n): + X = df[array_columns].values + + kmeans = KMeans(n_clusters=n, random_state=42) + labels_kmeans = kmeans.fit_predict(X) + centers_kmeans = kmeans.cluster_centers_ + # for stat in stats_kmeans: + # print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Center: {stat['center']}") + + stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans) + if len(array_columns) == 3: + visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D") + else: + visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering") + return stats_kmeans + +def launch_cluster_DBSCAN(df, array_columns): + X = df[array_columns].values + dbscan = DBSCAN(eps=0.2, min_samples=5) + labels_dbscan = dbscan.fit_predict(X) + stats_dbscan = calculate_cluster_statistics_dbscan(X, labels_dbscan) + # for stat in stats_dbscan: + # print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}") + if len(array_columns) == 3: + visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D") + else: + visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering") + return stats_dbscan + def launch_cluster(df,array_columns): X = df[array_columns].values @@ -76,7 +105,7 @@ def launch_cluster(df,array_columns): if len(array_columns) == 3: visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D") visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D") - else: + else: visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering") visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering") return stats_kmeans,stats_dbscan diff --git a/src/back/load_csv.py b/src/back/load_csv.py index 5aa10f2..25d5ad9 100644 --- a/src/back/load_csv.py +++ b/src/back/load_csv.py @@ -46,3 +46,6 @@ def csv_robust_normalize(df, column): df[column] = normalized_column print (normalized_column) return normalized_column + + +