From d317fd3ad9822cd989b072b10f3e9035b5fb0f8a Mon Sep 17 00:00:00 2001
From: rem
Date: Tue, 25 Jun 2024 02:26:54 +0200
Subject: [PATCH] correlation_matrix vue + code reorganisation

---
 src/back/prediction.py  | 22 +++++++++++++++++++++-
 src/pages/prediction.py | 36 ++++++++++++++++--------------------
 2 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/src/back/prediction.py b/src/back/prediction.py
index 1700d72..4a679f1 100644
--- a/src/back/prediction.py
+++ b/src/back/prediction.py
@@ -1,6 +1,10 @@
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import f1_score
+from sklearn.metrics import accuracy_score
+import numpy as np
+import matplotlib.pyplot as plt
 
 def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
     predictors = df[columns]
@@ -14,4 +18,20 @@ def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
         raise NameError("No method name : \"" + algoOfPrediction + "\"")
 
     model.fit(predictors, target)
-    return model.predict(predictors)
+    prediction = model.predict(predictors)
+    return prediction
+
+def correlation_matrix(df, columns):
+    new_df = df[columns]
+    correlations = new_df.corr()
+    print(correlations)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    cax = ax.matshow(correlations, vmin=-1, vmax=1)
+    fig.colorbar(cax)
+    ticks = np.arange(0,new_df.shape[1],1)
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    ax.set_xticklabels(list(new_df))
+    ax.set_yticklabels(list(new_df))
+    return fig
diff --git a/src/pages/prediction.py b/src/pages/prediction.py
index d2b8805..f25008c 100644
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -1,6 +1,8 @@
 import streamlit as st
 import pandas as pd
 import sys
+import matplotlib.pyplot as plt
+import numpy as np
 
 sys.path.append('./back/')
 import clustering_csv as cc
@@ -10,7 +12,7 @@ def handle_column_multiselect(df, method_name):
     selected_columns = st.multiselect(f"Select the columns you want for {method_name}:", df.columns.tolist(), placeholder="Select dataset columns")
     return selected_columns
 
-def display_prediction_results(df, targetCol, sourceColumns, method):
+def df_prediction_results(df, targetCol, sourceColumns, method):
     original_col = df[targetCol]
     predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)
 
@@ -18,7 +20,7 @@ def display_prediction_results(df, targetCol, sourceColumns, method):
     new_df['Original'] = original_col
     new_df['Predicted'] = predicted_col
 
-    st.dataframe(new_df)
+    return new_df
 
 if 'df' in st.session_state:
     df = st.session_state.df
@@ -37,15 +39,14 @@ if 'df' in st.session_state:
             dimensions = 2
 
         tab_names = ["K-means", "DBSCAN"]
-        tab11, tab12 = st.tabs(tab_names)
+        cluster_tabs = st.tabs(tab_names)
 
-        with tab11:
-            if st.button(f"Start {tab_names[0]}"):
-                st.pyplot(cc.launch_cluster_knn(df, selected_columns, dimensions=dimensions))
-
-        with tab12:
-            if st.button(f"Start {tab_names[1]}"):
-                st.pyplot(cc.launch_cluster_dbscan(df, selected_columns, dimensions))
+        for idx, tab in enumerate(cluster_tabs):
+            if tab.button(f"Start {tab_names[idx]}"):
+                if tab_names[idx] == "K-means":
+                    tab.pyplot(cc.launch_cluster_knn(df, selected_columns, dimensions=dimensions))
+                else:
+                    tab.pyplot(cc.launch_cluster_dbscan(df, selected_columns, dimensions))
 
     with tab2:
         st.header("Predictions")
@@ -60,16 +61,11 @@ if 'df' in st.session_state:
         selected_columns_p = handle_column_multiselect(df, "predictions")
 
         tab_names = ["Linear Regression", "Random Forest"]
-        tab21, tab22 = st.tabs(tab_names)
-
-        with tab21:
-            if st.button(f"Start {tab_names[0]}"):
-                st.write(target_column)
-                st.write(selected_columns_p)
-                display_prediction_results(df, target_column, selected_columns_p, tab_names[0])
+        prediction_tabs = st.tabs(tab_names)
 
-        with tab22:
-            if st.button(f"Start {tab_names[1]}"):
-                display_prediction_results(df, target_column, selected_columns_p, tab_names[1])
+        for idx, tab in enumerate(prediction_tabs):
+            if tab.button(f"Start {tab_names[idx]}"):
+                tab.pyplot(p.correlation_matrix(df, selected_columns_p+[target_column]))
+                tab.dataframe(df_prediction_results(df, target_column, selected_columns_p, tab_names[idx]))
 else:
     st.write("Please clean your dataset.")