From 7dafa78bc4e838f3f08f868cf6e9d4a277cc65d3 Mon Sep 17 00:00:00 2001 From: "hugo.pradier2" Date: Sun, 23 Jun 2024 17:44:26 +0200 Subject: [PATCH] fin separation front/back --- .gitignore | 2 +- backend/classification_strategy.py | 45 +++++++++++ backend/dbscan_strategy.py | 17 ++++ backend/kmeans_strategy.py | 21 +++++ backend/{normstrategy.py => norm_strategy.py} | 0 backend/regression_strategy.py | 18 +++++ backend/visualization_strategy.py | 16 ++++ frontend/pages/clustering_dbscan.py | 33 +++----- frontend/pages/clustering_kmeans.py | 50 ++++-------- frontend/pages/normalization.py | 2 +- frontend/pages/prediction_classification.py | 78 ++++++++----------- frontend/pages/prediction_regression.py | 33 ++++---- frontend/pages/visualization.py | 21 ++--- 13 files changed, 203 insertions(+), 133 deletions(-) create mode 100644 backend/classification_strategy.py create mode 100644 backend/dbscan_strategy.py create mode 100644 backend/kmeans_strategy.py rename backend/{normstrategy.py => norm_strategy.py} (100%) create mode 100644 backend/regression_strategy.py create mode 100644 backend/visualization_strategy.py diff --git a/.gitignore b/.gitignore index 9f7550b..99d7f3f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,2 @@ __pycache__ -.venv +*/myenv diff --git a/backend/classification_strategy.py b/backend/classification_strategy.py new file mode 100644 index 0000000..633e298 --- /dev/null +++ b/backend/classification_strategy.py @@ -0,0 +1,45 @@ +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score +from sklearn.preprocessing import LabelEncoder + +def perform_classification(data, data_name, target_name, test_size): + X = data[data_name] + y = data[target_name] + + label_encoders = {} + for column in X.select_dtypes(include=['object']).columns: + le = LabelEncoder() + X[column] = le.fit_transform(X[column]) + label_encoders[column] = le + + if y.dtype == 'object': + le = LabelEncoder() + y = le.fit_transform(y) + label_encoders[target_name] = le + else: + if y.nunique() > 10: + raise ValueError("The target variable seems to be continuous. Please select a categorical target for classification.") + + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) + + model = LogisticRegression() + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + accuracy = accuracy_score(y_test, y_pred) + + return model, label_encoders, accuracy + +def make_prediction(model, label_encoders, data_name, target_name, input_values): + X_new = [] + for feature, value in zip(data_name, input_values): + if feature in label_encoders: + value = label_encoders[feature].transform([value])[0] + X_new.append(value) + + prediction = model.predict([X_new]) + + if target_name in label_encoders: + prediction = label_encoders[target_name].inverse_transform(prediction) + + return prediction[0] diff --git a/backend/dbscan_strategy.py b/backend/dbscan_strategy.py new file mode 100644 index 0000000..9cc89a5 --- /dev/null +++ b/backend/dbscan_strategy.py @@ -0,0 +1,17 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import DBSCAN + +def perform_dbscan_clustering(data, data_name, eps, min_samples): + x = data[data_name].to_numpy() + dbscan = DBSCAN(eps=eps, min_samples=min_samples) + y_dbscan = dbscan.fit_predict(x) + + fig = plt.figure() + if len(data_name) == 2: + ax = fig.add_subplot(projection='rectilinear') + plt.scatter(x[:, 0], x[:, 1], c=y_dbscan, s=50, cmap="viridis") + else: + ax = fig.add_subplot(projection='3d') + ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y_dbscan, s=50, cmap="viridis") + return fig diff --git a/backend/kmeans_strategy.py b/backend/kmeans_strategy.py new file mode 100644 index 0000000..f77e677 --- /dev/null +++ b/backend/kmeans_strategy.py @@ -0,0 +1,21 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.cluster import KMeans + +def perform_kmeans_clustering(data, data_name, n_clusters, n_init, max_iter): + x = data[data_name].to_numpy() + kmeans = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, max_iter=max_iter, random_state=111) + y_kmeans = kmeans.fit_predict(x) + + fig = plt.figure() + if len(data_name) == 2: + ax = fig.add_subplot(projection='rectilinear') + plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap="viridis") + centers = kmeans.cluster_centers_ + plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, marker="X") + else: + ax = fig.add_subplot(projection='3d') + ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y_kmeans, s=50, cmap="viridis") + centers = kmeans.cluster_centers_ + ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c="black", s=200, marker="X") + return fig diff --git a/backend/normstrategy.py b/backend/norm_strategy.py similarity index 100% rename from backend/normstrategy.py rename to backend/norm_strategy.py diff --git a/backend/regression_strategy.py b/backend/regression_strategy.py new file mode 100644 index 0000000..52286cc --- /dev/null +++ b/backend/regression_strategy.py @@ -0,0 +1,18 @@ +from sklearn.linear_model import LinearRegression + +def perform_regression(data, data_name, target_name): + X = data[data_name] + y = data[target_name] + + if not isinstance(y.iloc[0], (int, float)): + raise ValueError("The target variable should be numeric (continuous) for regression.") + + model = LinearRegression() + model.fit(X, y) + + return model + +def make_prediction(model, feature_names, input_values): + prediction = model.predict([input_values]) + + return prediction[0] diff --git a/backend/visualization_strategy.py b/backend/visualization_strategy.py new file mode 100644 index 0000000..8fd5119 --- /dev/null +++ b/backend/visualization_strategy.py @@ -0,0 +1,16 @@ +import matplotlib.pyplot as plt +import seaborn as sns + +def plot_histogram(data, column): + fig, ax = plt.subplots() + ax.hist(data[column].dropna(), bins=20, edgecolor='k') + ax.set_title(f"Histogram of {column}") + ax.set_xlabel(column) + ax.set_ylabel("Frequency") + return fig + +def plot_boxplot(data, column): + fig, ax = plt.subplots() + sns.boxplot(data=data, x=column, ax=ax) + ax.set_title(f"Boxplot of {column}") + return fig diff --git a/frontend/pages/clustering_dbscan.py b/frontend/pages/clustering_dbscan.py index d06b10a..a8bafeb 100644 --- a/frontend/pages/clustering_dbscan.py +++ b/frontend/pages/clustering_dbscan.py @@ -1,35 +1,22 @@ import streamlit as st -import matplotlib.pyplot as plt -from sklearn.cluster import DBSCAN - -st.header("Clustering: dbscan") +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) +from dbscan_strategy import perform_dbscan_clustering +st.header("Clustering: DBSCAN") if "data" in st.session_state: data = st.session_state.data - with st.form("my_form"): + with st.form("dbscan_form"): data_name = st.multiselect("Data Name", data.select_dtypes(include="number").columns, max_selections=3) eps = st.slider("eps", min_value=0.0, max_value=1.0, value=0.5, step=0.01) min_samples = st.number_input("min_samples", step=1, min_value=1, value=5) - st.form_submit_button("launch") - - if len(data_name) >= 2 and len(data_name) <=3: - x = data[data_name].to_numpy() - - dbscan = DBSCAN(eps=eps, min_samples=min_samples) - y_dbscan = dbscan.fit_predict(x) + submitted = st.form_submit_button("Launch") - fig = plt.figure() - if len(data_name) == 2: - ax = fig.add_subplot(projection='rectilinear') - plt.scatter(x[:, 0], x[:, 1], c=y_dbscan, s=50, cmap="viridis") - else: - ax = fig.add_subplot(projection='3d') - ax.scatter(x[:, 0], x[:, 1],x[:, 2], c=y_dbscan, s=50, cmap="viridis") + if submitted and 2 <= len(data_name) <= 3: + fig = perform_dbscan_clustering(data, data_name, eps, min_samples) st.pyplot(fig) - - - else: - st.error("file not loaded") \ No newline at end of file + st.error("File not loaded") diff --git a/frontend/pages/clustering_kmeans.py b/frontend/pages/clustering_kmeans.py index c61bf40..cacf84a 100644 --- a/frontend/pages/clustering_kmeans.py +++ b/frontend/pages/clustering_kmeans.py @@ -1,44 +1,26 @@ import streamlit as st -from sklearn.cluster import KMeans -import matplotlib.pyplot as plt - -st.header("Clustering: kmeans") +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) +from kmeans_strategy import perform_kmeans_clustering +st.header("Clustering: KMeans") if "data" in st.session_state: data = st.session_state.data - with st.form("my_form"): - row1 = st.columns([1,1,1]) - n_clusters = row1[0].selectbox("Number of clusters", range(1,data.shape[0])) - data_name = row1[1].multiselect("Data Name",data.select_dtypes(include="number").columns, max_selections=3) - n_init = row1[2].number_input("n_init",step=1,min_value=1) - - row2 = st.columns([1,1]) - max_iter = row1[0].number_input("max_iter",step=1,min_value=1) - - - st.form_submit_button("launch") + with st.form("kmeans_form"): + row1 = st.columns([1, 1, 1]) + n_clusters = row1[0].selectbox("Number of clusters", range(1, data.shape[0])) + data_name = row1[1].multiselect("Data Name", data.select_dtypes(include="number").columns, max_selections=3) + n_init = row1[2].number_input("n_init", step=1, min_value=1) - if len(data_name) >= 2 and len(data_name) <=3: - x = data[data_name].to_numpy() + row2 = st.columns([1, 1]) + max_iter = row2[0].number_input("max_iter", step=1, min_value=1) + submitted = st.form_submit_button("Launch") - kmeans = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, max_iter=max_iter, random_state=111) - y_kmeans = kmeans.fit_predict(x) - - fig = plt.figure() - if len(data_name) == 2: - ax = fig.add_subplot(projection='rectilinear') - plt.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap="viridis") - centers = kmeans.cluster_centers_ - plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, marker="X") - else: - ax = fig.add_subplot(projection='3d') - - ax.scatter(x[:, 0], x[:, 1],x[:, 2], c=y_kmeans, s=50, cmap="viridis") - centers = kmeans.cluster_centers_ - ax.scatter(centers[:, 0], centers[:, 1],centers[:, 2], c="black", s=200, marker="X") + if submitted and 2 <= len(data_name) <= 3: + fig = perform_kmeans_clustering(data, data_name, n_clusters, n_init, max_iter) st.pyplot(fig) - else: - st.error("file not loaded") + st.error("File not loaded") diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py index f15ac18..28de333 100644 --- a/frontend/pages/normalization.py +++ b/frontend/pages/normalization.py @@ -2,7 +2,7 @@ import streamlit as st import sys import os sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) -from normstrategy import MVStrategy, ScalingStrategy, KNNStrategy +from norm_strategy import MVStrategy, ScalingStrategy, KNNStrategy if "data" in st.session_state: data = st.session_state.original_data diff --git a/frontend/pages/prediction_classification.py b/frontend/pages/prediction_classification.py index 5aaf52f..2e1fffc 100644 --- a/frontend/pages/prediction_classification.py +++ b/frontend/pages/prediction_classification.py @@ -1,9 +1,8 @@ import streamlit as st -from sklearn.linear_model import LogisticRegression -from sklearn.model_selection import train_test_split -from sklearn.metrics import accuracy_score -from sklearn.preprocessing import LabelEncoder -import pandas as pd +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) +from classification_strategy import perform_classification, make_prediction st.header("Prediction: Classification") @@ -12,53 +11,38 @@ if "data" in st.session_state: with st.form("classification_form"): st.subheader("Classification Parameters") - data_name = st.multiselect("Features", data.columns) - target_name = st.selectbox("Target", data.columns) - test_size = st.slider("Test Size", min_value=0.1, max_value=0.5, value=0.2, step=0.1) - st.form_submit_button('Train and Predict') - - if data_name and target_name: - X = data[data_name] - y = data[target_name] - - label_encoders = {} - for column in X.select_dtypes(include=['object']).columns: - le = LabelEncoder() - X[column] = le.fit_transform(X[column]) - label_encoders[column] = le - - if y.dtype == 'object': - le = LabelEncoder() - y = le.fit_transform(y) - label_encoders[target_name] = le - - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) - - model = LogisticRegression() - model.fit(X_train, y_train) - y_pred = model.predict(X_test) - accuracy = accuracy_score(y_test, y_pred) - + data_name = st.multiselect("Features", data.columns, key="classification_features") + target_name = st.selectbox("Target", data.columns, key="classification_target") + test_size = st.slider("Test Size", min_value=0.1, max_value=0.5, value=0.2, step=0.1, key="classification_test_size") + submitted = st.form_submit_button('Train and Predict') + + if submitted and data_name and target_name: + try: + model, label_encoders, accuracy = perform_classification(data, data_name, target_name, test_size) + st.session_state.classification_model = model + st.session_state.classification_label_encoders = label_encoders + st.session_state.classification_accuracy = accuracy + st.session_state.classification_features_selected = data_name + st.session_state.classification_target_selected = target_name + except ValueError as e: + st.error(e) + + if "classification_model" in st.session_state: st.subheader("Model Accuracy") - st.write(f"Accuracy on test data: {accuracy:.2f}") + st.write(f"Accuracy on test data: {st.session_state.classification_accuracy:.2f}") st.subheader("Enter values for prediction") - pred_values = [] - for feature in data_name: - if feature in label_encoders: - values = list(label_encoders[feature].classes_) - value = st.selectbox(f"Value for {feature}", values) - value_encoded = label_encoders[feature].transform([value])[0] - pred_values.append(value_encoded) + input_values = [] + for feature in st.session_state.classification_features_selected: + if feature in st.session_state.classification_label_encoders: + values = list(st.session_state.classification_label_encoders[feature].classes_) + value = st.selectbox(f"Value for {feature}", values, key=f"classification_input_{feature}") else: - value = st.number_input(f"Value for {feature}", value=0.0) - pred_values.append(value) - - prediction = model.predict(pd.DataFrame([pred_values], columns=data_name)) + value = st.number_input(f"Value for {feature}", value=0.0, key=f"classification_input_{feature}") + input_values.append(value) - if target_name in label_encoders: - prediction = label_encoders[target_name].inverse_transform(prediction) + prediction = make_prediction(st.session_state.classification_model, st.session_state.classification_label_encoders, st.session_state.classification_features_selected, st.session_state.classification_target_selected, input_values) - st.write("Prediction:", prediction[0]) + st.write("Prediction:", prediction) else: st.error("File not loaded") diff --git a/frontend/pages/prediction_regression.py b/frontend/pages/prediction_regression.py index 377274e..d07d9cc 100644 --- a/frontend/pages/prediction_regression.py +++ b/frontend/pages/prediction_regression.py @@ -1,6 +1,8 @@ import streamlit as st -from sklearn.linear_model import LinearRegression -import pandas as pd +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) +from regression_strategy import perform_regression, make_prediction st.header("Prediction: Regression") @@ -9,21 +11,24 @@ if "data" in st.session_state: with st.form("regression_form"): st.subheader("Linear Regression Parameters") - data_name = st.multiselect("Features", data.select_dtypes(include="number").columns) - target_name = st.selectbox("Target", data.select_dtypes(include="number").columns) - st.form_submit_button('Train and Predict') + data_name = st.multiselect("Features", data.select_dtypes(include="number").columns, key="regression_features") + target_name = st.selectbox("Target", data.select_dtypes(include="number").columns, key="regression_target") + submitted = st.form_submit_button('Train and Predict') - if data_name and target_name: - X = data[data_name] - y = data[target_name] + if submitted and data_name and target_name: + try: + model = perform_regression(data, data_name, target_name) + st.session_state.regression_model = model + st.session_state.regression_features_selected = data_name + st.session_state.regression_target_selected = target_name + except ValueError as e: + st.error(e) - model = LinearRegression() - model.fit(X, y) - + if "regression_model" in st.session_state: st.subheader("Enter values for prediction") - pred_values = [st.number_input(f"Value for {feature}", value=0.0) for feature in data_name] - prediction = model.predict(pd.DataFrame([pred_values], columns=data_name)) + input_values = [st.number_input(f"Value for {feature}", value=0.0, key=f"regression_input_{feature}") for feature in st.session_state.regression_features_selected] + prediction = make_prediction(st.session_state.regression_model, st.session_state.regression_features_selected, input_values) - st.write("Prediction:", prediction[0]) + st.write("Prediction:", prediction) else: st.error("File not loaded") diff --git a/frontend/pages/visualization.py b/frontend/pages/visualization.py index 057b0c9..4215a67 100644 --- a/frontend/pages/visualization.py +++ b/frontend/pages/visualization.py @@ -1,30 +1,25 @@ import streamlit as st -import matplotlib.pyplot as plt -import seaborn as sns +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../backend'))) +from visualization_strategy import plot_histogram, plot_boxplot st.header("Data Visualization") - if "data" in st.session_state: data = st.session_state.data st.subheader("Histogram") column_to_plot = st.selectbox("Select Column for Histogram", data.columns) if column_to_plot: - fig, ax = plt.subplots() - ax.hist(data[column_to_plot].dropna(), bins=20, edgecolor='k') - ax.set_title(f"Histogram of {column_to_plot}") - ax.set_xlabel(column_to_plot) - ax.set_ylabel("Frequency") + fig = plot_histogram(data, column_to_plot) st.pyplot(fig) - + st.subheader("Boxplot") dataNumeric = data.select_dtypes(include="number") column_to_plot = st.selectbox("Select Column for Boxplot", dataNumeric.columns) if column_to_plot: - fig, ax = plt.subplots() - sns.boxplot(data=data, x=column_to_plot, ax=ax) - ax.set_title(f"Boxplot of {column_to_plot}") + fig = plot_boxplot(data, column_to_plot) st.pyplot(fig) else: - st.error("file not loaded") \ No newline at end of file + st.error("file not loaded")