From 84adfb5a96a541bfa1973ba6a007101165e7cd59 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 24 Jun 2024 18:30:49 +0200
Subject: [PATCH] link logic to streamlit ui

---
 src/back/clustering_csv.py          | 17 +++++-----
 src/back/load_csv.py                | 46 +++++++++++++++++---------
 src/back/managing_missing_values.py |  2 --
 src/back/prediction.py              | 12 +++----
 src/back/show_csv.py                | 16 ++++-----
 src/home.py                         |  4 +++
 src/pages/clean.py                  | 50 ++++++++++++++++++++++-------
 src/pages/prediction.py             | 41 +++++++++++++++++++++++
 src/pages/visualize.py              | 39 +++++++++++++---------
 9 files changed, 159 insertions(+), 68 deletions(-)

diff --git a/src/back/clustering_csv.py b/src/back/clustering_csv.py
index dcb170d..fb93b4e 100644
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -12,7 +12,7 @@ def visualize_clusters_2d(X, labels, centers=None, title="Clusters"):
     plt.title(title)
     plt.xlabel("Feature 1")
     plt.ylabel("Feature 2")
-    plt.show()
+    return plt.gcf()
 
 def visualize_clusters_3d(X, labels, centers=None, title="Clusters"):
     fig = plt.figure(figsize=(10, 7))
@@ -56,7 +56,7 @@ def calculate_cluster_statistics_dbscan(X, labels):
         })
     return stats
 
-def launch_cluster_knn(df,array_columns,n):
+def launch_cluster_knn(df, array_columns, n=3):
     X = df[array_columns].values
 
     kmeans = KMeans(n_clusters=n, random_state=42)
@@ -67,12 +67,11 @@ def launch_cluster_knn(df, array_columns, n=3):
     stats_kmeans = calculate_cluster_statistics_kmeans(X, labels_kmeans, centers_kmeans)
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
+        return visualize_clusters_3d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
-    return stats_kmeans
+        return visualize_clusters_2d(X, labels_kmeans, centers_kmeans, title="K-Means Clustering")
 
-def launch_cluster_DBSCAN(df, array_columns):
+def launch_cluster_dbscan(df, array_columns):
     X = df[array_columns].values
     dbscan = DBSCAN(eps=0.2, min_samples=5)
     labels_dbscan = dbscan.fit_predict(X)
@@ -80,12 +79,11 @@ def launch_cluster_dbscan(df, array_columns):
     # for stat in stats_dbscan:
     #     print(f"Cluster {stat['cluster']}: {stat['num_points']} points, Density: {stat['density']}")
     if len(array_columns) == 3:
-        visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
+        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
     else:
-        visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
-    return stats_dbscan
+        return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
 
-def launch_cluster(df,array_columns):
+def launch_cluster(df, array_columns):
     X = df[array_columns].values
 
     kmeans = KMeans(n_clusters=4, random_state=42)
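A minimal usage sketch of the refactored clustering helpers (illustration only, not part of the patch; the toy frame and column names are hypothetical). Because the launch_* functions now return a Matplotlib figure instead of calling plt.show(), a Streamlit page can render the result directly:

    import pandas as pd
    import streamlit as st
    import clustering_csv as cc

    df = pd.DataFrame({"x": [0.1, 0.4, 0.8, 0.9], "y": [0.2, 0.5, 0.7, 0.95]})
    st.pyplot(cc.launch_cluster_knn(df, ["x", "y"], n=2))  # K-Means scatter
    st.pyplot(cc.launch_cluster_dbscan(df, ["x", "y"]))    # DBSCAN scatter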
diff --git a/src/back/load_csv.py b/src/back/load_csv.py
index 25d5ad9..5c9d1de 100644
--- a/src/back/load_csv.py
+++ b/src/back/load_csv.py
@@ -21,31 +21,47 @@ def csv_check(df):
         print("-"*12)
         print(df[col].unique())
 
+def do_for_columns(df, func):
+    # Generic helper: apply func to every column of df
+    for col_name in df:
+        df[col_name] = func(df[col_name])
+
-def csv_norm_min_max(df,col):
-    maValue = df[col].max
-    miValue = df[col].min
-    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
-    return df
-def csv_standardisation_Z(df,col):
+def csv_norm_min_max(df, col):
+    # Min-max scale the column to [0, 1]
+    max_value = df[col].max()
+    min_value = df[col].min()
+    df[col] = (df[col] - min_value) / (max_value - min_value)
+    return df[col]
+
+def csv_standardisation_Z(df, col):
     mean_col1 = df[col].mean()
     std_col1 = df[col].std()
     df[col] = (df[col] - mean_col1) / std_col1
     return df[col]
 
-def csv_robust_normalize(df, column):
+def csv_robust_normalize(df, col):
     # Compute the median and the IQR
-    median = df[column].median()
-    q1 = df[column].quantile(0.25)
-    q3 = df[column].quantile(0.75)
+    median = df[col].median()
+    q1 = df[col].quantile(0.25)
+    q3 = df[col].quantile(0.75)
     iqr = q3 - q1
     # Apply the robust normalization
-    normalized_column = (df[column] - median) / iqr
-    df[column] = normalized_column
-    print (normalized_column)
+    normalized_column = (df[col] - median) / iqr
+    df[col] = normalized_column
     return normalized_column
-
-
+
+def handle_normalization(df, norm_method):
+    # Normalize every column with the method chosen in the UI
+    if norm_method == "min-max":
+        for col_name in df:
+            df[col_name] = csv_norm_min_max(df, col_name)
+        return df
+    elif norm_method == "z-score":
+        for col_name in df:
+            df[col_name] = csv_standardisation_Z(df, col_name)
+        return df
+    elif norm_method == "robust":
+        for col_name in df:
+            df[col_name] = csv_robust_normalize(df, col_name)
+        return df
+    else:
+        raise ValueError(f"Unknown normalization method: {norm_method}")
diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index 8360cf1..9fc9e38 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -51,8 +51,6 @@ def impute_with_regression(data):
 - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn')
 """
 def handle_missing_values(data, method, n_neighbors=5):
-
-    data = drop_high_null_percentage(data)
     data = convert_categorical_to_numeric(data)
     if method == 'mean':
         return replace_with_mean(data)
diff --git a/src/back/prediction.py b/src/back/prediction.py
index 09c7556..1700d72 100644
--- a/src/back/prediction.py
+++ b/src/back/prediction.py
@@ -2,18 +2,16 @@ from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
 
-
-def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
+def getColumnsForPredictionAndPredict(df, columns, columnGoal, algoOfPrediction):
     predictors = df[columns]
     target = df[columnGoal]
-    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.2, random_state=42)
-    if algoOfPrediction == "Régression Linéaire":
+    if algoOfPrediction == "Linear Regression":
         model = LinearRegression()
-    elif algoOfPrediction == "Forêt Aléatoire":
-        model = RandomForestRegressor(n_estimators=100)
+    elif algoOfPrediction == "Random Forest":
+        model = RandomForestRegressor(n_estimators=100)
     else:
-        raise NameError("No method name : \"" + algoOfPrediction + "\"")
+        raise ValueError(f'Unknown prediction method: "{algoOfPrediction}"')
 
-    model.fit(X_train, y_train)
-    return model.predict(X_test)
\ No newline at end of file
+    # Fit and predict on the full frame so the UI can compare original vs predicted
+    model.fit(predictors, target)
+    return model.predict(predictors)
diff --git a/src/back/show_csv.py b/src/back/show_csv.py
index 93d9973..cd8a293 100644
--- a/src/back/show_csv.py
+++ b/src/back/show_csv.py
@@ -2,15 +2,15 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 
-def histo_col(df,colonne):
+def histo_col(df, col):
     plt.figure()
-    plt.hist(df[colonne], bins=int(df[colonne].nunique()/4), alpha=0.7, color='blue', edgecolor='black')
-    plt.title(f"Histogramme de la colonne '{colonne}'")
-    plt.xlabel(colonne)
-    plt.ylabel("Fréquence")
+    # Scale the bin count with the column's cardinality, clamped to a sane range
+    plt.hist(df[col], bins=min(max(df[col].nunique() // 4, 4), 50), alpha=0.7, color='blue', edgecolor='black')
+    plt.title(f"Histogram of column '{col}'")
+    plt.xlabel(col)
+    plt.ylabel("Frequency")
     plt.grid(True)
-    plt.show()
+    return plt.gcf()
 
-def plotBoxWhisker(df):
-    df.plot(kind='box', subplots=True, sharex=False, sharey=False)
-    plt.show()
+def plotBoxWhisker(df, col):
+    df[col].plot(kind='box')
+    return plt.gcf()
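For reference, a sketch of what the three modes of the new handle_normalization dispatcher in load_csv.py do to a toy numeric column (illustration only; the frame is hypothetical and assumed already imputed):

    import pandas as pd
    import load_csv as lc

    df = pd.DataFrame({"a": [1.0, 2.0, 3.0, 10.0]})
    print(lc.handle_normalization(df.copy(), "min-max"))  # rescaled to [0, 1]
    print(lc.handle_normalization(df.copy(), "z-score"))  # mean 0, std 1
    print(lc.handle_normalization(df.copy(), "robust"))   # (x - median) / IQR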
diff --git a/src/home.py b/src/home.py
index ad70562..754dc65 100644
--- a/src/home.py
+++ b/src/home.py
@@ -1,5 +1,7 @@
 import streamlit as st
 from io import StringIO
+from ydata_profiling import ProfileReport
+import streamlit.components.v1 as components
 import pandas as pd
 
 def statistics(df):
@@ -43,6 +45,10 @@ def main():
     st.write("## Statistics")
     statistics(df)
 
+    profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
+    # to_widgets() renders Jupyter widgets; embed the HTML report in the Streamlit page instead
+    components.html(profile.to_html(), height=600, scrolling=True)
+
     if st.button("Next"):
         st.switch_page("pages/clean.py")
 
diff --git a/src/pages/clean.py b/src/pages/clean.py
index b0bec84..f64bd49 100644
--- a/src/pages/clean.py
+++ b/src/pages/clean.py
@@ -1,18 +1,43 @@
 import streamlit as st
+import sys
+sys.path.append('./back/')
 
-st.write("# 🧼 Data cleaning")
+import managing_missing_values as mmv
+import load_csv as lc
 
-st.write("## Missing data")
-rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
+if 'original_df' in st.session_state:
+    df = st.session_state.original_df
 
-st.write("#### Replace missing values")
-replace_methods = ["Mean","Median","Mode","KNN","Regression"]
-replace_method = st.radio('Choose an option:', replace_methods)
+    st.write("# 🧼 Data cleaning")
 
-st.write("## Normalize data")
-normalize_methods = ["Min-Max","Z-Score","Another One"]
-normalize_method = st.radio('Choose an option:', normalize_methods)
+    st.write("## Missing data")
+    rm_empty_rows_or_cols = st.checkbox("Remove empty rows or columns", True)
 
-if st.button("Clean dataset"):
-    # TODO: Actual processing
-    st.write("TODO")
+    st.write("#### Replace missing values")
+    replace_methods = ["mean", "median", "mode", "knn", "regression"]
+    replace_method = st.radio('Choose an option:', replace_methods)
+
+    st.write("## Normalize data")
+    normalize_methods = ["min-max", "z-score", "robust"]
+    normalize_method = st.radio('Choose an option:', normalize_methods)
+
+    is_cleaned = st.button("Clean dataset")
+    if is_cleaned:
+        if rm_empty_rows_or_cols:
+            st.write("- Removing high null percentage values")
+            df = mmv.drop_high_null_percentage(df)
+            st.dataframe(df)
+
+        st.write("- Handling missing values with method:", replace_method)
+        df = mmv.handle_missing_values(df, replace_method)
+        st.session_state.df = df
+        st.dataframe(df)
+
+        st.write("- Normalizing with method:", normalize_method)
+        df = lc.handle_normalization(df, normalize_method)
+        st.session_state.df = df
+        st.dataframe(df)
+
+        st.switch_page("pages/visualize.py")
+else:
+    st.write("Please upload your dataset.")
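The pages hand data to one another through st.session_state: home.py stores the uploaded frame under original_df, clean.py reads it and saves the cleaned result under df, and the visualize and prediction pages consume df. A condensed sketch of that contract (illustration only; the three files are collapsed into one snippet and dropna() stands in for the real cleaning pipeline):

    import pandas as pd
    import streamlit as st

    # home.py, after a successful upload
    st.session_state.original_df = pd.DataFrame({"a": [1.0, None, 3.0]})
    # pages/clean.py: read the raw frame, store the cleaned result
    st.session_state.df = st.session_state.original_df.dropna()
    # pages/visualize.py and pages/prediction.py: consume the cleaned frame
    st.dataframe(st.session_state.df)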
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
index e69de29..6892c69 100644
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -0,0 +1,46 @@
+import streamlit as st
+import pandas as pd
+import sys
+sys.path.append('./back/')
+
+import clustering_csv as cc
+import prediction as p
+
+if 'df' in st.session_state:
+
+    df = st.session_state.df
+    df_cols = df.columns.tolist()
+
+    st.write("# 🔮 Prediction")
+
+    if st.button("K-means"):
+        st.pyplot(cc.launch_cluster_knn(df, ["Route Type", "Traffic Control"]))
+
+    if st.button("DBSCAN"):
+        st.pyplot(cc.launch_cluster_dbscan(df, ["Route Type", "Traffic Control"]))
+
+    # Defaults so the comparison table is only built after a model has run
+    original_col = None
+    predicted_col = None
+
+    if st.button("Linear Regression"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, col, "Linear Regression")
+
+    if st.button("Random Forest"):
+        col = "Route Type"
+        df_cols.remove(col)
+        original_col = df[col]
+        predicted_col = p.getColumnsForPredictionAndPredict(df, df_cols, col, "Random Forest")
+
+    if predicted_col is not None:
+        ndf = pd.DataFrame()
+        ndf['Original'] = original_col
+        ndf['Predicted'] = predicted_col
+
+        st.dataframe(ndf)
+
+else:
+    st.write("Please clean your dataset.")
diff --git a/src/pages/visualize.py b/src/pages/visualize.py
index 1c6536f..d15ff23 100644
--- a/src/pages/visualize.py
+++ b/src/pages/visualize.py
@@ -1,23 +1,32 @@
 import streamlit as st
 import matplotlib.pyplot as plt
 
-df = st.session_state.orig_df
-df_columns = df.columns.tolist()
+import sys
+sys.path.append('./back/')
 
-st.write("# 📊 Visualization")
+import show_csv as sc
 
-st.write("## Histograms")
-hist_tabs = st.tabs(df_columns)
+if 'df' in st.session_state:
 
-for idx, tab in enumerate(hist_tabs):
-    tab.write("##### "+df_columns[idx])
-    tab.bar_chart(df[df_columns[idx]])
+    df = st.session_state.df
+    df_columns = df.columns.tolist()
 
-st.write("## Box & Whisker")
-baw_tabs = st.tabs(df_columns)
+    st.write("# 📊 Visualization")
 
-for idx, tab in enumerate(baw_tabs):
-    tab.write("##### "+df_columns[idx])
-    fig, ax = plt.subplots()
-    df[df_columns[idx]].plot(kind='box')
-    st.pyplot(fig)
+    st.write("## Histograms")
+    hist_tabs = st.tabs(df_columns)
+
+    for idx, tab in enumerate(hist_tabs):
+        tab.write("##### " + df_columns[idx])
+        tab.pyplot(sc.histo_col(df, df_columns[idx]))
+
+    st.write("## Box & Whisker")
+    baw_tabs = st.tabs(df_columns)
+
+    for idx, tab in enumerate(baw_tabs):
+        tab.write("##### " + df_columns[idx])
+        fig, ax = plt.subplots()
+        df[df_columns[idx]].plot(kind='box', ax=ax)
+        tab.pyplot(fig)
+else:
+    st.write('Please clean your dataset.')
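One caveat worth noting: getColumnsForPredictionAndPredict now fits and predicts on the same rows, which suits the side-by-side table above but overstates accuracy. If an honest error estimate is wanted later, a held-out split along these lines would do (illustration only; the toy frame and column names are hypothetical):

    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score

    df = pd.DataFrame({"f1": range(10),
                       "f2": [v ** 2 for v in range(10)],
                       "target": [3 * v + 5 for v in range(10)]})
    X_train, X_test, y_train, y_test = train_test_split(
        df[["f1", "f2"]], df["target"], test_size=0.2, random_state=42)
    model = LinearRegression().fit(X_train, y_train)
    print(r2_score(y_test, model.predict(X_test)))  # score on unseen rows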