From 036f46eb91304db50b6c20c19086b584900c13b1 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 22:14:45 +0200
Subject: [PATCH 1/7] function for prediction result display

---
 src/home.py             |  6 +++---
 src/pages/prediction.py | 27 +++++++++++++--------------
 2 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/home.py b/src/home.py
index 754dc65..5d45647 100644
--- a/src/home.py
+++ b/src/home.py
@@ -1,6 +1,6 @@
 import streamlit as st
 from io import StringIO
-from ydata_profiling import ProfileReport
+# from ydata_profiling import ProfileReport
 import pandas as pd
 
 def statistics(df):
@@ -44,8 +44,8 @@ def main():
         st.write("## Statistics")
         statistics(df)
 
-        profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
-        profile.to_widgets()
+        # profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
+        # profile.to_widgets()
 
     if st.button("Next"):
         st.switch_page("pages/clean.py")
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
index 6892c69..6a87c01 100644
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -6,8 +6,18 @@ sys.path.append('./back/')
 import clustering_csv as cc
 import prediction as p
 
-if 'df' in st.session_state:
+def display_prediction_results(df, targetCol):
+    df_cols.remove(col)
+    original_col = df[col]
+    predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+
+    new_df = pd.DataFrame()
+    new_df['Original'] = original_col
+    new_df['Predicted'] = predicted_col
+
+    st.dataframe(new_df)
+if 'df' in st.session_state:
 
     df = st.session_state.df
     df_cols = df.columns.tolist()
 
@@ -21,21 +31,10 @@ if 'df' in st.session_state:
 
     if st.button("Linear Regression"):
         col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+        display_prediction_results(df, col)
 
     if st.button("Random Forest"):
         col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Random Forest")
-
-        ndf = pd.DataFrame()
-        ndf['Original'] = original_col
-        ndf['Predicted'] = predicted_col
-
-        st.dataframe(ndf)
-
+        display_prediction_results(df, col)
 else:
     st.write("Please clean your dataset.")

From a6fb8d2b35ec019df347ee2e55959ab6e303d0ce Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:16:58 +0200
Subject: [PATCH 2/7] prediction/clustering page with user columns choice

---
 src/pages/evaluate.py   |  0
 src/pages/prediction.py | 61 +++++++++++++++++++++++++++++++----------
 2 files changed, 46 insertions(+), 15 deletions(-)
 delete mode 100644 src/pages/evaluate.py

diff --git a/src/pages/evaluate.py b/src/pages/evaluate.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
index 6a87c01..e2a95ab 100644
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -6,10 +6,13 @@ sys.path.append('./back/')
 import clustering_csv as cc
 import prediction as p
 
-def display_prediction_results(df, targetCol):
-    df_cols.remove(col)
-    original_col = df[col]
-    predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+def handle_column_multiselect(df, method_name):
+    selected_columns = st.multiselect(f"Select the columns you want for {method_name}:", df.columns.tolist(), placeholder="Select dataset columns")
+    return selected_columns
+
+def display_prediction_results(df, targetCol, sourceColumns, method):
+    original_col = df[targetCol]
+    predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)
 
     new_df = pd.DataFrame()
     new_df['Original'] = original_col
@@ -19,22 +22,50 @@ def display_prediction_results(df, targetCol):
 if 'df' in st.session_state:
 
     df = st.session_state.df
-    df_cols = df.columns.tolist()
 
     st.write("# 🔮 Prediction")
 
-    if st.button("K-means"):
-        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+    tab1, tab2 = st.tabs(["Clustering", "Predictions"])
+
+    with tab1:
+        st.header("Clustering")
+        selected_columns = handle_column_multiselect(df, "clustering")
+
+
+        tab_names = ["K-means", "DBSCAN"]
+        tab11, tab12 = st.tabs(tab_names)
+
+        with tab11:
+            if st.button(f"Start {tab_names[0]}"):
+                st.pyplot(cc.launch_cluster_knn(df, selected_columns))
+
+        with tab12:
+            if st.button(f"Start {tab_names[1]}"):
+                st.pyplot(cc.launch_cluster_dbscan(df, selected_columns))
+
+    with tab2:
+        st.header("Predictions")
+        target_column = st.selectbox(
+            "Target column:",
+            df.columns.tolist(),
+            index=None,
+            placeholder="Select target column"
+        )
 
-    if st.button("DBSCAN"):
-        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+        if target_column != None:
+            selected_columns_p = handle_column_multiselect(df, "predictions")
+
+            tab_names = ["Linear Regression", "Random Forest"]
+            tab21, tab22 = st.tabs(tab_names)
 
-    if st.button("Linear Regression"):
-        col = "Route Type"
-        display_prediction_results(df, col)
+            with tab21:
+                if st.button(f"Start {tab_names[0]}"):
+                    st.write(target_column)
+                    st.write(selected_columns_p)
+                    display_prediction_results(df, target_column, selected_columns_p, tab_names[0])
 
-    if st.button("Random Forest"):
-        col = "Route Type"
-        display_prediction_results(df, col)
+            with tab22:
+                if st.button(f"Start {tab_names[1]}"):
+                    display_prediction_results(df, target_column, selected_columns_p, tab_names[1])
 else:
     st.write("Please clean your dataset.")

From 1de7f3c5128b9e8c06507e60681cd26767355703 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:19:25 +0200
Subject: [PATCH 3/7] remove unused page_link

---
 src/home.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/home.py b/src/home.py
index 8c46aa8..d989c21 100644
--- a/src/home.py
+++ b/src/home.py
@@ -23,7 +23,6 @@ def nav_bar():
     st.page_link("pages/clean.py", label="Clean", icon="🧼", help=None)
     st.page_link("pages/visualize.py", label="Visualize", icon="👁️", help=None)
     st.page_link("pages/prediction.py", label="Predict", icon="🔮", help=None)
-    st.page_link("pages/evaluate.py", label="Evaluate", icon=None, help=None)
 
 def clean_dataframe(line):
     # Call to function to clean data

From b5feac89cb065a81f1a360f377dd955ae9987e54 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:28:11 +0200
Subject: [PATCH 4/7] fix typos

---
 src/back/managing_missing_values.py | 2 +-
 src/pages/clean.py                  | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index 24ac773..b12c463 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -48,7 +48,7 @@ def impute_with_regression(data):
             model = LinearRegression()
             model.fit(X_complete, y_complete)
             y_pred = model.predict(X_missing)
-            data.loc[df[col].isnull(), col] = y_pred
+            data.loc[data[col].isnull(), col] = y_pred
 
     return data
 
diff --git a/src/pages/clean.py b/src/pages/clean.py
index f64bd49..8fd0b97 100644
--- a/src/pages/clean.py
+++ b/src/pages/clean.py
@@ -13,7 +13,6 @@ if 'original_df' in st.session_state:
     st.write("## Missing data")
     rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
 
-    st.write("#### Replace missing values")
     replace_methods = ["mean","median","mode","knn","regression"]
     replace_method = st.radio('Choose an option:', replace_methods)
 

From 0910dfae21a420da7ee8d6989a4888047822e7f3 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:36:26 +0200
Subject: [PATCH 5/7] rewrite function handle_normalization

---
 src/back/load_csv.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 5a6170f..b1ce91a 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -20,11 +20,6 @@ def csv_check(df):
         print("-"*12)
         print(df[col].unique())
 
-def do_for_columns(df):
-    for col_name in df:
-        df[col_name] = function(df[col_name])
-
-
 def csv_norm_min_max(df, col):
     max = df[col].max()
     min = df[col].min()
@@ -50,17 +45,13 @@ def csv_robust_normalize(df, col):
     return normalized_column
 
 def handle_normalization(df, norm_method):
-    if norm_method == "min-max":
-        for col_name in df:
+    for col_name in df:
+        if norm_method == "min-max":
             df[col_name] = csv_norm_min_max(df, col_name)
-        return df
-    elif norm_method == "z-score":
-        for col_name in df:
+        elif norm_method == "z-score":
             df[col_name] = csv_standardisation_Z(df, col_name)
-        return df
-    elif norm_method == "robust":
-        for col_name in df:
+        elif norm_method == "robust":
             df[col_name] = csv_robust_normalize(df, col_name)
-        return df
-    else:
-        raise ValueError("Unknown method")
+        else:
+            raise ValueError("Unknown method")
+    return df

From 1a862c1ed66e48e622c88e5d3cca9dd5a7876ea5 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:45:52 +0200
Subject: [PATCH 6/7] fix robust normalize

---
 src/back/load_csv.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index b1ce91a..8b05ea2 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import RobustScaler
 
 def return_csv(path):
     df = pd.read_csv(path)
@@ -32,17 +33,16 @@ def csv_standardisation_Z(df, col):
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
 
-def csv_robust_normalize(df, col):
-    # Calcul de la médiane et de l'IQR
-    median = df[col].median()
-    q1 = df[col].quantile(0.25)
-    q3 = df[col].quantile(0.75)
-    iqr = q3 - q1
-
-    # Application de la normalisation robuste
-    normalized_column = (df[col] - median) / iqr
-    df[col] = normalized_column
-    return normalized_column
+def robust_normalize_column(df, column_name):
+    # Extract the column data
+    column_data = df[column_name].values.reshape(-1, 1)
+
+    # Fit and transform the column data
+    scaler = RobustScaler()
+    normalized_data = scaler.fit_transform(column_data)
+    df[column_name] = normalized_data
+
+    return normalized_data
 
 def handle_normalization(df, norm_method):
     for col_name in df:
@@ -51,7 +51,7 @@ def handle_normalization(df, norm_method):
         elif norm_method == "z-score":
             df[col_name] = csv_standardisation_Z(df, col_name)
         elif norm_method == "robust":
-            df[col_name] = csv_robust_normalize(df, col_name)
+            df[col_name] = robust_normalize_column(df, col_name)
         else:
             raise ValueError("Unknown method")
     return df

From 9ef632dcc8d2fca5003d314c418c9b5880695b41 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 23:50:59 +0200
Subject: [PATCH 7/7] remove empty lines + fix 3d cluster visualisation

---
 src/back/clustering_csv.py          | 2 +-
 src/back/managing_missing_values.py | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
index fb93b4e..4b3b6fb 100644
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -24,7 +24,7 @@ def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
     ax.set_xlabel("Feature 1")
     ax.set_ylabel("Feature 2")
     ax.set_zlabel("Feature 3")
-    plt.show()
+    return plt.gcf()
 
 def calculate_cluster_statistics_kmeans(X, labels, centers):
     unique_labels = np.unique(labels)
diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index b12c463..2ecdb28 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -18,7 +18,6 @@ def drop_high_null_percentage(data, threshold=0.5):
     data = data.loc[:, missing_percentage <= threshold]
     return data
 
-
 def replace_with_mean(data):
     return data.apply(lambda col: col.fillna(col.mean()) if col.dtype.kind in 'biufc' else col)
 
@@ -52,7 +51,6 @@ def impute_with_regression(data):
 
     return data
 
-
 """
 Parameters:
 - data: Pandas DataFrame with the data