Merge pull request 'streamlit' (#22) from streamlit into master

Reviewed-on: #22
10 months ago · 69aa8c58b2
parent 9172409508 9ef632dcc8
commit 69aa8c58b2
7 changed files with 72 additions and 55 deletions
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -24,7 +24,7 @@ def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
    ax.set_xlabel("Feature 1")
    ax.set_ylabel("Feature 2")
    ax.set_zlabel("Feature 3")
-    plt.show()
+    return plt.gcf()

 def calculate_cluster_statistics_kmeans(X, labels, centers):
    unique_labels = np.unique(labels)
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import numpy  as np
 import matplotlib.pyplot as plt
+from sklearn.preprocessing import RobustScaler

 def return_csv(path):
    df = pd.read_csv(path)
@@ -20,11 +21,6 @@ def csv_check(df):
        print("-"*12)
        print(df[col].unique())

-def do_for_columns(df):
-    for col_name in df:
-        df[col_name] = function(df[col_name])
-
-
 def csv_norm_min_max(df, col):
    max = df[col].max()
    min = df[col].min()
@@ -37,30 +33,25 @@ def csv_standardisation_Z(df, col):
    df[col] = (df[col] - mean_col1) / std_col1
    return df[col]

-def csv_robust_normalize(df, col):
-    # Calcul de la médiane et de l'IQR
-    median = df[col].median()
-    q1 = df[col].quantile(0.25)
-    q3 = df[col].quantile(0.75)
-    iqr = q3 - q1
+def robust_normalize_column(df, column_name):
+    # Extract the column data
+    column_data = df[column_name].values.reshape(-1, 1)
    
-    # Application de la normalisation robuste
-    normalized_column = (df[col] - median) / iqr
-    df[col] = normalized_column
-    return normalized_column
+    # Fit and transform the column data
+    scaler = RobustScaler()
+    normalized_data = scaler.fit_transform(column_data)
+    df[column_name] = normalized_data
+    
+    return normalized_data

 def handle_normalization(df, norm_method):
-    if norm_method == "min-max":
    for col_name in df:
+        if norm_method == "min-max":
            df[col_name] = csv_norm_min_max(df, col_name)
-        return df
        elif norm_method == "z-score":
-        for col_name in df:
            df[col_name] = csv_standardisation_Z(df, col_name)
-        return df
        elif norm_method == "robust":
-        for col_name in df:
-            df[col_name] = csv_robust_normalize(df, col_name)
-        return df
+            df[col_name] = robust_normalize_column(df, col_name)
        else:
            raise ValueError("Unknown method")
+    return df
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -18,7 +18,6 @@ def drop_high_null_percentage(data, threshold=0.5):
    data = data.loc[:, missing_percentage <= threshold]
    return data

-
 def replace_with_mean(data):
    return data.apply(lambda col: col.fillna(col.mean()) if col.dtype.kind in 'biufc' else col)

@@ -48,11 +47,10 @@ def impute_with_regression(data):
        model = LinearRegression()
        model.fit(X_complete, y_complete)
        y_pred = model.predict(X_missing)
-        data.loc[df[col].isnull(), col] = y_pred
+        data.loc[data[col].isnull(), col] = y_pred
        
    return data

-
 """    
    Parameters:
    - data: Pandas DataFrame with the data
--- a/src/home.py
+++ b/src/home.py
@@ -1,6 +1,6 @@
 import streamlit as st
 from io import StringIO
-from ydata_profiling import ProfileReport
+# from ydata_profiling import ProfileReport
 import pandas as pd

 def  statistics(df):
@@ -23,7 +23,6 @@ def nav_bar():
    st.page_link("pages/clean.py", label="Clean", icon="🧼", help=None)
    st.page_link("pages/visualize.py", label="Visualize", icon="👁️", help=None)
    st.page_link("pages/prediction.py", label="Predict", icon="🔮", help=None)
-    st.page_link("pages/evaluate.py", label="Evaluate", icon=None, help=None)

 def clean_dataframe(line):
    # Call to function to clean data
--- a/src/pages/clean.py
+++ b/src/pages/clean.py
@@ -13,7 +13,6 @@ if 'original_df' in st.session_state:
    st.write("## Missing data")
    rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)

-
    st.write("#### Replace missing values")
    replace_methods = ["mean","median","mode","knn","regression"]
    replace_method = st.radio('Choose an option:', replace_methods)
--- a/src/pages/evaluate.py
+++ b/src/pages/evaluate.py
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -6,36 +6,66 @@ sys.path.append('./back/')
 import clustering_csv as cc
 import prediction as p

-if 'df' in st.session_state:
+def handle_column_multiselect(df, method_name):
+    selected_columns = st.multiselect(f"Select the columns you want for {method_name}:", df.columns.tolist(), placeholder="Select dataset columns")
+    return selected_columns
+    
+def display_prediction_results(df, targetCol, sourceColumns, method):
+    original_col = df[targetCol]
+    predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)    
+    
+    new_df = pd.DataFrame()
+    new_df['Original'] = original_col
+    new_df['Predicted'] = predicted_col

+    st.dataframe(new_df)    
+
+if 'df' in st.session_state:
    df = st.session_state.df
-    df_cols = df.columns.tolist()

    st.write("# 🔮 Prediction")

-    if st.button("K-means"):
-        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+    tab1, tab2 = st.tabs(["Clustering", "Predictions"])
+
+    with tab1:
+        st.header("Clustering")
+        selected_columns = handle_column_multiselect(df, "clustering")
+            
+        
+        tab_names = ["K-means", "DBSCAN"] 
+        tab11, tab12 = st.tabs(tab_names)
+
+        with tab11:
+            if st.button(f"Start {tab_names[0]}"):
+                st.pyplot(cc.launch_cluster_knn(df, selected_columns))

-    if st.button("DBSCAN"):
-        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+        with tab12:
+            if st.button(f"Start {tab_names[1]}"):
+                st.pyplot(cc.launch_cluster_dbscan(df, selected_columns))

-    if st.button("Linear Regression"):
-        col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+    with tab2:
+        st.header("Predictions")
+        target_column = st.selectbox(
+                            "Target column:",
+                            df.columns.tolist(),
+                            index=None,
+                            placeholder="Select target column"
+                        )

-    if st.button("Random Forest"):
-        col = "Route Type"
-        df_cols.remove(col)
-        original_col = df[col]
-        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Random Forest")
+        if target_column != None:
+            selected_columns_p = handle_column_multiselect(df, "predictions")
        
-    ndf = pd.DataFrame()
-    ndf['Original'] = original_col
-    ndf['Predicted'] = predicted_col
+        tab_names = ["Linear Regression", "Random Forest"] 
+        tab21, tab22 = st.tabs(tab_names)

-    st.dataframe(ndf)
+        with tab21:
+            if st.button(f"Start {tab_names[0]}"):
+                st.write(target_column)
+                st.write(selected_columns_p)
+                display_prediction_results(df, target_column, selected_columns_p, tab_names[0])

+        with tab22:
+            if st.button(f"Start {tab_names[1]}"):
+                display_prediction_results(df, target_column, selected_columns_p, tab_names[1])
 else:
    st.write("Please clean your dataset.")