Merge pull request 'streamlit' (#24) from streamlit into master

Reviewed-on: #24
1 year ago · 40be24a556
parent 59289df22f f180558394
commit 40be24a556
3 changed files with 40 additions and 22 deletions
--- a/src/back/clustering_csv.py
+++ b/src/back/clustering_csv.py
@@ -90,7 +90,6 @@ def launch_cluster_dbscan(df, array_columns, dimensions=2):
        return visualize_clusters_3d(X, labels_dbscan, title="DBSCAN Clustering 3D")
    else:
        return visualize_clusters_2d(X, labels_dbscan, title="DBSCAN Clustering")
-    return stats_dbscan

 def launch_cluster(df, array_columns):
    X = df[array_columns].values
--- a/src/back/prediction.py
+++ b/src/back/prediction.py
@@ -1,6 +1,10 @@
 from sklearn.model_selection import train_test_split
 from sklearn.linear_model import LinearRegression
 from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import f1_score
+from sklearn.metrics import accuracy_score
+import numpy as np
+import matplotlib.pyplot as plt

 def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
    predictors = df[columns]
@@ -14,4 +18,20 @@ def getColumnsForPredictionAndPredict(df,columns, columnGoal, algoOfPrediction):
        raise NameError("No method name : \"" + algoOfPrediction + "\"")

    model.fit(predictors, target)
-    return model.predict(predictors)
+    prediction = model.predict(predictors)
+    return prediction
+
+def correlation_matrix(df, columns):
+    new_df = df[columns]
+    correlations = new_df.corr()
+    print(correlations)
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    cax = ax.matshow(correlations, vmin=-1, vmax=1)
+    fig.colorbar(cax)
+    ticks = np.arange(0,new_df.shape[1],1)
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    ax.set_xticklabels(list(new_df))
+    ax.set_yticklabels(list(new_df))
+    return fig
--- a/src/pages/prediction.py
+++ b/src/pages/prediction.py
@@ -1,6 +1,8 @@
 import streamlit as st
 import pandas as pd
 import sys
+import matplotlib.pyplot as plt
+import numpy as np
 sys.path.append('./back/')

 import clustering_csv as cc
@@ -10,15 +12,15 @@ def handle_column_multiselect(df, method_name):
    selected_columns = st.multiselect(f"Select the columns you want for {method_name}:", df.columns.tolist(), placeholder="Select dataset columns")
    return selected_columns
    
-def display_prediction_results(df, targetCol, sourceColumns, method):
+def df_prediction_results(df, targetCol, sourceColumns, method):
    original_col = df[targetCol]
-    predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)    
+    predicted_col = p.getColumnsForPredictionAndPredict(df, sourceColumns, targetCol, method)
    
    new_df = pd.DataFrame()
    new_df['Original'] = original_col
    new_df['Predicted'] = predicted_col

-    st.dataframe(new_df)    
+    return new_df

 if 'df' in st.session_state:
    df = st.session_state.df
@@ -37,15 +39,16 @@ if 'df' in st.session_state:
            dimensions = 2
        
        tab_names = ["K-means", "DBSCAN"] 
-        tab11, tab12 = st.tabs(tab_names)
+        cluster_tabs = st.tabs(tab_names)

-        with tab11:
-            if st.button(f"Start {tab_names[0]}"):
-                st.pyplot(cc.launch_cluster_knn(df, selected_columns, dimensions=dimensions))
+        for idx, tab in enumerate(cluster_tabs):
+            if tab.button(f"Start {tab_names[idx]}"):
+                if tab_names[idx] == "K-means":
+                    fig = cc.launch_cluster_knn(df, selected_columns, dimensions=dimensions)
+                else:
+                    fig = cc.launch_cluster_dbscan(df, selected_columns, dimensions)

-        with tab12:
-            if st.button(f"Start {tab_names[1]}"):
-                st.pyplot(cc.launch_cluster_dbscan(df, selected_columns, dimensions))
+                tab.pyplot(fig)

    with tab2:
        st.header("Predictions")
@@ -60,16 +63,12 @@ if 'df' in st.session_state:
            selected_columns_p = handle_column_multiselect(df, "predictions")
        
        tab_names = ["Linear Regression", "Random Forest"] 
-        tab21, tab22 = st.tabs(tab_names)
+        prediction_tabs = st.tabs(tab_names)

-        with tab21:
-            if st.button(f"Start {tab_names[0]}"):
-                st.write(target_column)
-                st.write(selected_columns_p)
-                display_prediction_results(df, target_column, selected_columns_p, tab_names[0])
-
-        with tab22:
-            if st.button(f"Start {tab_names[1]}"):
-                display_prediction_results(df, target_column, selected_columns_p, tab_names[1])
+        for idx, tab in enumerate(prediction_tabs):
+            if tab.button(f"Start {tab_names[idx]}"):
+                tab.pyplot(p.correlation_matrix(df, selected_columns_p+[target_column]))
+                tmp_df = df_prediction_results(df, target_column, selected_columns_p, tab_names[idx])
+                tab.dataframe(tmp_df)
 else:
    st.write("Please clean your dataset.")