link logic to streamlit ui

pull/21/head
remrem 10 months ago
parent b7bcb629db
commit 84adfb5a96

back/clustering_csv.py

@@ -12,7 +12,7 @@ def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
     plt.title(title)
     plt.xlabel("Feature 1")
     plt.ylabel("Feature 2")
-    plt.show()
+    return plt.gcf()
 def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
     fig = plt.figure(figsize=(10, 7))
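Returning the live figure via plt.gcf() instead of calling plt.show() is what lets the Streamlit pages further down render these plots with st.pyplot. A minimal sketch of the pattern (the function and data here are illustrative, not from this commit):

import matplotlib.pyplot as plt
import streamlit as st

def make_plot(values):
    fig, ax = plt.subplots()  # build an explicit figure instead of showing it
    ax.plot(values)
    return fig

st.pyplot(make_plot([1, 2, 3]))  # Streamlit renders the returned figure object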
@@ -56,7 +56,7 @@ def calculate_cluster_statistics_dbscan(X, labels):
         })
     return stats
-def launch_cluster_knn(df,array_columns,n):
+def launch_cluster_knn(df, array_columns, n=3):
     X = df[array_columns].values
     kmeans = KMeans(n_clusters=n, random_state=42)
@@ -67,12 +67,11 @@ def launch_cluster_knn(df,array_columns,n):
     stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
+        return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
-    return stats_kmeans
-def launch_cluster_DBSCAN(df, array_columns):
+        return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
+def launch_cluster_dbscan(df, array_columns):
     X = df[array_columns].values
     dbscan = DBSCAN(eps=0.2, min_samples=5)
     labels_dbscan = dbscan.fit_predict(X)
@@ -80,9 +79,9 @@ def launch_cluster_DBSCAN(df, array_columns):
     # for stat in stats_dbscan:
    #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
+        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
+        return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
     return stats_dbscan
 def launch_cluster(df, array_columns):
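Note that with both branches now returning a figure, the trailing return stats_dbscan is unreachable, and launch_cluster_knn drops its stats the same way. If the statistics should still reach the caller, one option is to return both; a sketch, not part of this commit:

def launch_cluster_dbscan(df, array_columns):
    X = df[array_columns].values
    dbscan = DBSCAN(eps=0.2, min_samples=5)
    labels = dbscan.fit_predict(X)
    stats = calculate_cluster_statistics_dbscan(X, labels)
    if len(array_columns) == 3:
        fig = visualize_clusters_3d(X, labels, title="DBSCAN Clustering 3D")
    else:
        fig = visualize_clusters_2d(X, labels, title="DBSCAN Clustering")
    return fig, stats  # caller does st.pyplot(fig) and can still show stats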

back/load_csv.py

@@ -21,12 +21,16 @@ def csv_check(df):
     print("-"*12)
     print(df[col].unique())
-def do_for_columns(df):
-    for col_name in df:
-        df[col_name] = function(df[col_name])
 def csv_norm_min_max(df, col):
-    maValue = df[col].max
-    miValue = df[col].min
-    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
-    return df
+    max = df[col].max()
+    min = df[col].min()
+    df[col] = (df[col] - min) / (max - min)
+    return df[col]
 def csv_standardisation_Z(df, col):
     mean_col1 = df[col].mean()
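The new csv_norm_min_max fixes the old unbound-method bug (df[col].max without parentheses), but it binds the names max and min, shadowing Python's built-ins for the rest of the function. A shadow-free spelling with the same behaviour, as a sketch:

def csv_norm_min_max(df, col):
    col_min = df[col].min()
    col_max = df[col].max()
    df[col] = (df[col] - col_min) / (col_max - col_min)  # min-max scaling to [0, 1]
    return df[col]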
@@ -34,18 +38,30 @@ def csv_standardisation_Z(df,col):
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
-def csv_robust_normalize(df, column):
+def csv_robust_normalize(df, col):
     # Compute the median and the IQR
-    median = df[column].median()
-    q1 = df[column].quantile(0.25)
-    q3 = df[column].quantile(0.75)
+    median = df[col].median()
+    q1 = df[col].quantile(0.25)
+    q3 = df[col].quantile(0.75)
     iqr = q3 - q1
     # Apply the robust normalization
-    normalized_column = (df[column] - median) / iqr
-    df[column] = normalized_column
-    print (normalized_column)
+    normalized_column = (df[col] - median) / iqr
+    df[col] = normalized_column
     return normalized_column
+def handle_normalization(df, norm_method):
+    if norm_method == "min-max":
+        for col_name in df:
+            df[col_name] = csv_norm_min_max(df, col_name)
+        return df
+    elif norm_method == "z-score":
+        for col_name in df:
+            df[col_name] = csv_standardisation_Z(df, col_name)
+        return df
+    elif norm_method == "robust":
+        for col_name in df:
+            df[col_name] = csv_robust_normalize(df, col_name)
+        return df
+    else:
+        raise ValueError("Unknown method")
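handle_normalization dispatches one of the three scalers across every column, which is how the clean page below applies the user's choice in one call. A quick usage sketch (the toy frame is illustrative, assuming load_csv's handle_normalization is in scope):

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [10.0, 20.0, 40.0]})
df = handle_normalization(df, "min-max")  # each column rescaled to [0, 1]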

back/managing_missing_values.py

@@ -51,8 +51,6 @@ def impute_with_regression(data):
     - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn')
     """
 def handle_missing_values(data, method, n_neighbors=5):
-    data = drop_high_null_percentage(data)
     data = convert_categorical_to_numeric(data)
     if method == 'mean':
         return replace_with_mean(data)

back/prediction.py

@@ -2,18 +2,16 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
 def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
     predictors = df[columns]
     target = df[columnGoal]
-    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)
-    if algoOfPrediction == "Régression Linéaire":
+    if algoOfPrediction == "Linear Regression":
         model = LinearRegression()
-    elif algoOfPrediction == "Forêt Aléatoire":
+    elif algoOfPrediction == "Random Forest":
         model = RandomForestRegressor(n_estimators=100)
     else:
         raise NameError("No method name : \"" + algoOfPrediction + "\"")
-    model.fit(X_train, y_train)
-    return model.predict(X_test)
+    model.fit(predictors, target)
+    return model.predict(predictors)
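With the split removed, the model is fitted and evaluated on the same rows, so the Original/Predicted table on the prediction page shows in-sample fit rather than generalization. A held-out variant, as an illustrative helper that is not part of the commit:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

def predict_with_holdout(df, columns, column_goal):
    X_train, X_test, y_train, y_test = train_test_split(
        df[columns], df[column_goal], test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)           # fit on 80% of the rows
    return y_test, model.predict(X_test)  # truth vs prediction on unseen rows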

back/show_csv.py

@@ -2,15 +2,15 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
-def histo_col(df,colonne):
+def histo_col(df, col):
     plt.figure()
-    plt.hist(df[colonne], bins=int(df[colonne].nunique()/4), alpha=0.7, color='blue', edgecolor='black')
-    plt.title(f"Histogramme de la colonne '{colonne}'")
-    plt.xlabel(colonne)
+    plt.hist(df[col], bins=4, alpha=0.7, color='blue', edgecolor='black')
+    plt.title(f"Histogramme de la colonne '{col}'")
+    plt.xlabel(col)
     plt.ylabel("Fréquence")
     plt.grid(True)
-    plt.show()
-def plotBoxWhisker(df):
-    df.plot(kind='box', subplots=True, sharex=False, sharey=False)
-    plt.show()
+    return plt.gcf()
+def plotBoxWhisker(df, col):
+    df[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
+    return plt.gcf()
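Hard-coding bins=4 flattens every histogram to four bars regardless of the column's spread, while the old adaptive expression could pass bins=0 on low-cardinality columns. A guarded middle ground, as a sketch assuming numeric columns:

import matplotlib.pyplot as plt

def histo_col_adaptive(df, col):
    bins = max(4, df[col].nunique() // 4)  # scale with cardinality, never below 4
    plt.figure()
    plt.hist(df[col], bins=bins, alpha=0.7, color='blue', edgecolor='black')
    plt.xlabel(col)
    return plt.gcf()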

@@ -1,5 +1,6 @@
 import streamlit as st
 from io import StringIO
+from ydata_profiling import ProfileReport
 import pandas as pd
 def statistics(df):
@@ -43,6 +44,9 @@ def main():
     st.write("## Statistics")
     statistics(df)
+    profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
+    profile.to_widgets()
     if st.button("Next"):
         st.switch_page("pages/clean.py")
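ProfileReport.to_widgets() renders Jupyter widgets, so the report will not appear inside a Streamlit app. One way to embed it instead, as a sketch assuming ydata_profiling's to_html() and Streamlit's components module:

import streamlit.components.v1 as components

profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
components.html(profile.to_html(), height=800, scrolling=True)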

pages/clean.py

@@ -1,18 +1,44 @@
 import streamlit as st
+import sys
+sys.path.append('./back/')
+import managing_missing_values as mmv
+import load_csv as lc
+if 'original_df' in st.session_state:
+    df = st.session_state.original_df
     st.write("# 🧼 Data cleaning")
     st.write("## Missing data")
     rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
     st.write("#### Replace missing values")
-    replace_methods = ["Mean","Median","Mode","KNN","Regression"]
+    replace_methods = ["mean","median","mode","knn","regression"]
     replace_method = st.radio('Choose an option:', replace_methods)
     st.write("## Normalize data")
-    normalize_methods = ["Min-Max","Z-Score","Another One"]
+    normalize_methods = ["min-max","z-score","robust"]
     normalize_method = st.radio('Choose an option:', normalize_methods)
-    if st.button("Clean dataset"):
-        # TODO: Actual processing
-        st.write("TODO")
+    is_cleaned = st.button("Clean dataset")
+    if is_cleaned:
+        if rm_empty_rows_or_cols:
+            st.write("- Removing high null percentage values")
+            df = mmv.drop_high_null_percentage(df)
+            st.dataframe(df)
+        st.write("- Handle missing values with method:", replace_method)
+        df = mmv.handle_missing_values(df, replace_method)
+        st.session_state.df = df
+        st.dataframe(df)
+        st.write("- Normalize with method:", normalize_method)
+        df = lc.handle_normalization(df, normalize_method)
+        st.session_state.df = df
+        st.dataframe(df)
+        st.switch_page("pages/visualize.py")
+else:
+    st.write("Please upload your dataset.")

@@ -0,0 +1,41 @@
+import streamlit as st
+import pandas as pd
+import sys
+sys.path.append('./back/')
+import clustering_csv as cc
+import prediction as p
+
+if 'df' in st.session_state:
+    df = st.session_state.df
+    df_cols = df.columns.tolist()
+    st.write("# 🔮 Prediction")
+    if st.button("K-means"):
+        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+    if st.button("DBSCAN"):
+        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+    if st.button("Linear Regression"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Linear Regression")
+    if st.button("Random Forest"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, "Route Type", "Random Forest")
+    ndf = pd.DataFrame()
+    ndf['Original'] = original_col
+    ndf['Predicted'] = predicted_col
+    st.dataframe(ndf)
+else:
+    st.write("Please clean your dataset.")

pages/visualize.py

@@ -1,7 +1,14 @@
 import streamlit as st
 import matplotlib.pyplot as plt
+import sys
+sys.path.append('./back/')
+import show_csv as sc
-df = st.session_state.orig_df
+if 'df' in st.session_state:
+    df = st.session_state.df
     df_columns = df.columns.tolist()
     st.write("# 📊 Visualization")
@@ -11,7 +18,7 @@ hist_tabs = st.tabs(df_columns)
     for idx, tab in enumerate(hist_tabs):
         tab.write("##### "+df_columns[idx])
-        tab.bar_chart(df[df_columns[idx]])
+        tab.pyplot(sc.histo_col(df, df_columns[idx]))
     st.write("## Box & Whisker")
     baw_tabs = st.tabs(df_columns)
@@ -20,4 +27,6 @@ for idx, tab in enumerate(baw_tabs):
         tab.write("##### "+df_columns[idx])
         fig, ax = plt.subplots()
         df[df_columns[idx]].plot(kind='box')
-        st.pyplot(fig)
+        tab.pyplot(fig)
+else:
+    st.write('Please clean your dataset.')
