diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
index fb93b4e..4b3b6fb 100644
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -24,7 +24,7 @@ def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
     ax.set_xlabel("Feature 1")
     ax.set_ylabel("Feature 2")
     ax.set_zlabel("Feature 3")
-    plt.show()
+    return plt.gcf()
 
 def calculate_cluster_statistics_kmeans(X, labels, centers):
     unique_labels = np.unique(labels)
diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 5a6170f..8b05ea2 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import RobustScaler
 
 def return_csv(path):
     df = pd.read_csv(path)
@@ -20,11 +21,6 @@ def csv_check(df):
         print("-"*12)
         print(df[col].unique())
 
-def do_for_columns(df):
-    for col_name in df:
-        df[col_name] = function(df[col_name])
-
-
 def csv_norm_min_max(df, col):
     max = df[col].max()
     min = df[col].min()
@@ -37,30 +33,25 @@ def csv_standardisation_Z(df, col):
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
 
-def csv_robust_normalize(df, col):
-    # Calcul de la médiane et de l'IQR
-    median = df[col].median()
-    q1 = df[col].quantile(0.25)
-    q3 = df[col].quantile(0.75)
-    iqr = q3 - q1
-
-    # Application de la normalisation robuste
-    normalized_column = (df[col] - median) / iqr
-    df[col] = normalized_column
-    return normalized_column
+def robust_normalize_column(df, column_name):
+    # Extract the column data as a 2-D array for the scaler
+    column_data = df[column_name].values.reshape(-1, 1)
+
+    # Fit and transform the column data
+    scaler = RobustScaler()
+    normalized_data = scaler.fit_transform(column_data)
+    df[column_name] = normalized_data
+
+    return normalized_data
 
 def handle_normalization(df, norm_method):
-    if norm_method == "min-max":
-        for col_name in df:
+    for col_name in df:
+        if norm_method == "min-max":
             df[col_name] = csv_norm_min_max(df, col_name)
-        return df
-    elif norm_method == "z-score":
-        for col_name in df:
+        elif norm_method == "z-score":
             df[col_name] = csv_standardisation_Z(df, col_name)
-        return df
-    elif norm_method == "robust":
-        for col_name in df:
-            df[col_name] = csv_robust_normalize(df, col_name)
-        return df
-    else:
-        raise ValueError("Unknown method")
+        elif norm_method == "robust":
+            df[col_name] = robust_normalize_column(df, col_name)
+        else:
+            raise ValueError("Unknown method")
+    return df
diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index 24ac773..2ecdb28 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -18,7 +18,6 @@ def drop_high_null_percentage(data, threshold=0.5):
     data = data.loc[:, missing_percentage <= threshold]
     return data
 
-
 def replace_with_mean(data):
     return data.apply(lambda col: col.fillna(col.mean()) if col.dtype.kind in 'biufc' else col)
 
@@ -48,11 +47,10 @@ def impute_with_regression(data):
             model = LinearRegression()
             model.fit(X_complete, y_complete)
             y_pred = model.predict(X_missing)
-            data.loc[df[col].isnull(), col] = y_pred
+            data.loc[data[col].isnull(), col] = y_pred
 
     return data
 
-
 """
 Parameters:
 - data: Pandas DataFrame with the data
diff --git a/src/home.py b/src/home.py
index 67bbe40..d989c21 100644
--- a/src/home.py
+++ b/src/home.py
@@ -1,6 +1,6 @@
 import streamlit as st
 from io import StringIO
-from ydata_profiling import ProfileReport
+# from ydata_profiling import ProfileReport
 import pandas as pd
 
 def statistics(df):
@@ -23,7 +23,6 @@ def nav_bar():
     st.page_link("pages/clean.py", label="Clean", icon="🧼", help=None)
     st.page_link("pages/visualize.py", label="Visualize", icon="👁️", help=None)
     st.page_link("pages/prediction.py", label="Predict", icon="🔮", help=None)
-    st.page_link("pages/evaluate.py", label="Evaluate", icon=None, help=None)
 
 def clean_dataframe(line):
     # Call to function to clean data
diff --git a/src/pages/clean.py b/src/pages/clean.py
index f64bd49..8fd0b97 100644
--- a/src/pages/clean.py
+++ b/src/pages/clean.py
@@ -13,7 +13,6 @@ if 'original_df' in st.session_state:
     st.write("## Missing data")
     rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
 
-    st.write("#### Replace missing values")
     replace_methods = ["mean","median","mode","knn","regression"]
     replace_method = st.radio('Choose an option:', replace_methods)
 
diff --git a/src/pages/evaluate.py b/src/pages/evaluate.py
deleted file mode 100644
index e69de29..0000000
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
index 6892c69..e2a95ab 100644
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -6,36 +6,63 @@ sys.path.append('./back/')
 import clustering_csv as cc
 import prediction as p
 
-if 'df' in st.session_state:
+def handle_column_multiselect(df, method_name):
+    selected_columns = st.multiselect(f"Select the columns you want for {method_name}:", df.columns.tolist(), placeholder="Select dataset columns")
+    return selected_columns
+
+def display_prediction_results(df, targetCol, sourceColumns, method):
+    original_col = df[targetCol]
+    predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)
+
+    new_df = pd.DataFrame()
+    new_df['Original'] = original_col
+    new_df['Predicted'] = predicted_col
+
+    st.dataframe(new_df)
 
+if 'df' in st.session_state:
     df = st.session_state.df
-    df_cols = df.columns.tolist()
 
     st.write("# 🔮 Prediction")
 
-    if st.button("K-means"):
-        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+    tab1, tab2 = st.tabs(["Clustering", "Predictions"])
+
+    with tab1:
+        st.header("Clustering")
+        selected_columns = handle_column_multiselect(df, "clustering")
+
+        tab_names = ["K-means", "DBSCAN"]
+        tab11, tab12 = st.tabs(tab_names)
 
-    if st.button("DBSCAN"):
-        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+        with tab11:
+            if st.button(f"Start {tab_names[0]}"):
+                st.pyplot(cc.launch_cluster_knn(df, selected_columns))
 
-    if st.button("Linear Regression"):
-        col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+        with tab12:
+            if st.button(f"Start {tab_names[1]}"):
+                st.pyplot(cc.launch_cluster_dbscan(df, selected_columns))
 
-    if st.button("Random Forest"):
-        col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Random Forest")
+    with tab2:
+        st.header("Predictions")
+        target_column = st.selectbox(
+            "Target column:",
+            df.columns.tolist(),
+            index=None,
+            placeholder="Select target column"
+        )
 
-    ndf = pd.DataFrame()
-    ndf['Original'] = original_col
-    ndf['Predicted'] = predicted_col
+        if target_column is not None:
+            selected_columns_p = handle_column_multiselect(df, "predictions")
+
+            tab_names = ["Linear Regression", "Random Forest"]
+            tab21, tab22 = st.tabs(tab_names)
 
-    st.dataframe(ndf)
+            with tab21:
+                if st.button(f"Start {tab_names[0]}"):
+                    display_prediction_results(df, target_column, selected_columns_p, tab_names[0])
+
+            with tab22:
+                if st.button(f"Start {tab_names[1]}"):
+                    display_prediction_results(df, target_column, selected_columns_p, tab_names[1])
 else:
     st.write("Please clean your dataset.")
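A note for reviewers on the `load_csv.py` change: `RobustScaler` with its defaults centers on the median and scales by the IQR (quantile range 25–75), so the rewritten robust path should match the removed manual computation in `csv_robust_normalize`. A minimal sketch of the equivalence (the toy series is hypothetical):

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

col = pd.Series([1.0, 2.0, 4.0, 8.0, 16.0])  # hypothetical data

# Manual robust normalization, as the removed csv_robust_normalize did it.
manual = (col - col.median()) / (col.quantile(0.75) - col.quantile(0.25))

# RobustScaler defaults: center on the median, scale by the IQR.
scaled = RobustScaler().fit_transform(col.values.reshape(-1, 1)).ravel()

print(np.allclose(manual.values, scaled))  # True
```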
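And a quick way to exercise the reworked `handle_normalization` outside Streamlit (a minimal sketch; it assumes `src/back` is on `sys.path` so `load_csv` imports directly, and the toy DataFrame is hypothetical):

```python
import pandas as pd
from load_csv import handle_normalization  # assumes src/back is importable

# Hypothetical toy frame; any all-numeric data works here.
df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0],
                   "b": [10.0, 20.0, 30.0, 40.0]})

# Each supported method normalizes every column in place and returns the frame.
for method in ("min-max", "z-score", "robust"):
    out = handle_normalization(df.copy(), method)
    print(method, out["a"].round(3).tolist())

# An unrecognized method still fails fast, now from inside the per-column loop.
try:
    handle_normalization(df.copy(), "quantile")
except ValueError as err:
    print(err)  # Unknown method
```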