ajout prediction classification

prediction de regression terminee
debut prediction
7 changed files with 252 additions and 36 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-.venv
+__pycache__
+.venv
--- a/frontend/exploration.py
+++ b/frontend/exploration.py
@ -13,6 +13,7 @@ uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])

 if uploaded_file is not None:
    st.session_state.data = pd.read_csv(uploaded_file)
+    st.session_state.original_data = st.session_state.data
    st.success("File loaded successfully!")


--- a/frontend/normstrategy.py
+++ b/frontend/normstrategy.py
@ -0,0 +1,138 @@
+from abc import ABC, abstractmethod
+from pandas import DataFrame, Series
+from pandas.api.types import is_numeric_dtype
+from typing import Any, Union
+
+class DataFrameFunction(ABC):
+    """A command that may be applied in-place to a dataframe.
+
+    Subclasses provide the actual transformation by overriding ``apply``.
+    """
+
+    @abstractmethod
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Apply the current function to the given dataframe, in-place.
+
+        The series is described by its label and dataframe.
+
+        Args:
+            df: The dataframe to transform.
+            label: The label of the column to work on.
+            series: The values of that column.
+
+        Returns:
+            A dataframe; this base implementation returns ``df`` as given.
+        """
+        return df
+
+
+class MVStrategy(DataFrameFunction):
+    """A way to handle missing values in a dataframe."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['MVStrategy']:
+        """Get all the strategies that can be used.
+
+        Args:
+            df: The dataframe owning the column (not read by this method).
+            series: The column whose dtype decides which strategies apply.
+
+        Returns:
+            Drop and mode strategies for any column, followed by the mean,
+            median and linear regression strategies when the column is numeric.
+        """
+        choices = [DropStrategy(), ModeStrategy()]
+        # The remaining strategies compute on the values, so they need numbers.
+        if is_numeric_dtype(series):
+            choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
+        return choices
+
+
+class ScalingStrategy(DataFrameFunction):
+    """A way to rescale the values of a dataframe column."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['ScalingStrategy']:
+        """Get all the scaling strategies that can be used on a column.
+
+        Args:
+            df: The dataframe owning the column (not read by this method).
+            series: The column whose dtype and sum decide which strategies apply.
+
+        Returns:
+            The no-op strategy for any column, followed by min-max and z-score
+            for numeric columns, and unit length when the column sum is non-zero.
+        """
+        choices = [KeepStrategy()]
+        if is_numeric_dtype(series):
+            choices.extend((MinMaxStrategy(), ZScoreStrategy()))
+            # Unit length divides by the column sum, so a zero sum is excluded.
+            if series.sum() != 0:
+                choices.append(UnitLengthStrategy())
+        return choices
+
+
+class DropStrategy(MVStrategy):
+    """Missing-value strategy that removes the rows where the column is missing."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Drop, in-place, every row of ``df`` whose ``label`` value is missing.
+
+        Returns:
+            The same dataframe object, with the rows removed.
+        """
+        df.dropna(subset=label, inplace=True)
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Drop"
+
+
+class PositionStrategy(MVStrategy):
+    """Missing-value strategy that fills the gaps with a single computed value.
+
+    Subclasses choose the value by implementing ``get_value``.
+    """
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the missing values of the ``label`` column with ``get_value(series)``.
+
+        Returns:
+            The same dataframe object, with the column filled.
+        """
+        # Assign the filled column back through the dataframe: an in-place
+        # fillna on a Series extracted from the dataframe is not guaranteed to
+        # reach the dataframe (it never does under pandas Copy-on-Write).
+        df[label] = series.fillna(self.get_value(series))
+        return df
+
+    @abstractmethod
+    def get_value(self, series: Series) -> Any:
+        """Return the value used to replace the missing entries of ``series``."""
+        pass
+
+
+class MeanStrategy(PositionStrategy):
+    """Position strategy whose replacement value is the mean of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        """Return the mean of ``series``."""
+        return series.mean()
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Use mean"
+
+
+class MedianStrategy(PositionStrategy):
+    """Position strategy whose replacement value is the median of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        """Return the median of ``series``."""
+        return series.median()
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Use median"
+
+
+class ModeStrategy(PositionStrategy):
+    """Position strategy whose replacement value is a mode of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Any:
+        """Return the first entry of ``series.mode()``."""
+        # NOTE(review): the [0] lookup fails if mode() comes back empty,
+        # presumably when the column holds no non-missing value; confirm.
+        return series.mode()[0]
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Use mode"
+
+
+class LinearRegressionStrategy(MVStrategy):
+    """Missing-value strategy that fills the gaps with ``Series.interpolate``."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the missing values of the ``label`` column by interpolation.
+
+        Returns:
+            The same dataframe object, with the column interpolated.
+        """
+        # Assign the result back through the dataframe: an in-place
+        # interpolate on a Series extracted from the dataframe is not
+        # guaranteed to reach the dataframe (it never does under pandas
+        # Copy-on-Write).
+        df[label] = series.interpolate()
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Use linear regression"
+
+
+class KeepStrategy(ScalingStrategy):
+    """Scaling strategy that leaves the column as it is."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Return ``df`` without modifying it."""
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "No-op"
+
+
+class MinMaxStrategy(ScalingStrategy):
+    """Scaling strategy that maps the column linearly onto the [0, 1] range."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the ``label`` column with ``(x - min) / (max - min)``.
+
+        A constant column has a zero range; it is mapped to 0 rather than
+        being turned into NaN by a 0/0 division.
+
+        Returns:
+            The same dataframe object, with the column rescaled.
+        """
+        minimum = series.min()
+        span = series.max() - minimum
+        if span == 0:
+            # Every value equals the minimum: dividing by 1 keeps the zeros.
+            span = 1
+        df[label] = (series - minimum) / span
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Min-max"
+
+
+class ZScoreStrategy(ScalingStrategy):
+    """Scaling strategy that centers the column and divides it by its standard deviation."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the ``label`` column with ``(x - mean) / std``.
+
+        A column with a zero standard deviation is only centered (mapped to
+        0) rather than being turned into NaN by a 0/0 division.
+
+        Returns:
+            The same dataframe object, with the column standardized.
+        """
+        deviation = series.std()
+        if deviation == 0:
+            # All values are identical: dividing by 1 keeps the centered zeros.
+            deviation = 1
+        df[label] = (series - series.mean()) / deviation
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Z-Score"
+
+
+class UnitLengthStrategy(ScalingStrategy):
+    """Scaling strategy that divides every value of the column by the column sum."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the ``label`` column with ``series / series.sum()``."""
+        # ScalingStrategy.list_available only offers this strategy when the
+        # sum is non-zero, which is what keeps this division defined.
+        # NOTE(review): despite the name, this divides by the plain sum rather
+        # than by a vector norm; confirm that is the intended definition.
+        df[label] = series / series.sum()
+        return df
+
+    def __str__(self) -> str:
+        """Return the human-readable name of this strategy."""
+        return "Unit length"
--- a/frontend/pages/clustering.py
+++ b/frontend/pages/clustering.py
@ -1,35 +0,0 @@
-import streamlit as st
-from sklearn.cluster import KMeans
-import matplotlib.pyplot as plt
-
-st.header("Clustering")
-
-
-if "data" in st.session_state:
-    data = st.session_state.data
-
-    with st.form("my_form"):
-        row1 = st.columns([1,1,1])
-        n_clusters = row1[0].selectbox("Number of clusters", range(1, 10))
-        data_name = row1[1].multiselect("Data Name",data.select_dtypes(include="number").columns, max_selections=2)
-        n_init = row1[2].number_input("n_init",step=1,min_value=1)
-
-        row2 = st.columns([1,1])
-        max_iter = row1[0].number_input("max_iter",step=1,min_value=1)
-
-
-        st.form_submit_button('launch')
-
-    if len(data_name) == 2:
-        x = data[data_name].to_numpy()
-
-        kmeans = KMeans(n_clusters=n_clusters, init='random', n_init=n_init, max_iter=max_iter, random_state=111)
-        y_kmeans = kmeans.fit_predict(x)
-
-        fig, ax = plt.subplots(figsize=(12,8))
-        plt.scatter(x[:, 0], x[:, 1], s=100, c=kmeans.labels_, cmap='Set1')
-        plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=400, marker='*', color='k')
-        st.pyplot(fig)
-
-else:
-    st.error("file not loaded")
--- a/frontend/pages/clustering_dbscan.py
+++ b/frontend/pages/clustering_dbscan.py
@ -0,0 +1,35 @@
+import streamlit as st
+import matplotlib.pyplot as plt
+from sklearn.cluster import DBSCAN
+
+# Streamlit page: run DBSCAN on two or three numeric columns of the loaded
+# dataset and plot the resulting clusters in 2-D or 3-D.
+st.header("Clustering: dbscan")
+
+
+if "data" in st.session_state:
+    data = st.session_state.data
+
+    with st.form("my_form"):
+        data_name = st.multiselect("Data Name", data.select_dtypes(include="number").columns, max_selections=3)
+        # DBSCAN rejects eps <= 0, so the slider starts one step above zero.
+        eps = st.slider("eps", min_value=0.01, max_value=1.0, value=0.5, step=0.01)
+        min_samples = st.number_input("min_samples", step=1, min_value=1, value=5)
+        st.form_submit_button("launch")
+
+    # Only two or three selected columns can be plotted.
+    if 2 <= len(data_name) <= 3:
+        x = data[data_name].to_numpy()
+
+        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
+        y_dbscan = dbscan.fit_predict(x)
+
+        fig = plt.figure()
+        if len(data_name) == 2:
+            ax = fig.add_subplot(projection='rectilinear')
+            # Draw on the axes that were just created rather than on pyplot's
+            # implicit "current" axes.
+            ax.scatter(x[:, 0], x[:, 1], c=y_dbscan, s=50, cmap="viridis")
+        else:
+            ax = fig.add_subplot(projection='3d')
+            ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y_dbscan, s=50, cmap="viridis")
+        st.pyplot(fig)
+
+else:
+    st.error("file not loaded")
--- a/frontend/pages/clustering_kmeans.py
+++ b/frontend/pages/clustering_kmeans.py
@ -0,0 +1,44 @@
+import streamlit as st
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+
+# Streamlit page: run k-means on two or three numeric columns of the loaded
+# dataset and plot the clusters and their centers in 2-D or 3-D.
+st.header("Clustering: kmeans")
+
+
+if "data" in st.session_state:
+    data = st.session_state.data
+
+    with st.form("my_form"):
+        row1 = st.columns([1,1,1])
+        # Offer every cluster count from 1 up to and including the number of
+        # rows; stopping one short left a one-row dataset with no choice at all.
+        n_clusters = row1[0].selectbox("Number of clusters", range(1, data.shape[0] + 1))
+        data_name = row1[1].multiselect("Data Name",data.select_dtypes(include="number").columns, max_selections=3)
+        n_init = row1[2].number_input("n_init",step=1,min_value=1)
+
+        # max_iter belongs to the second row; it used to be stacked into
+        # row1[0], leaving row2 unused.
+        row2 = st.columns([1,1])
+        max_iter = row2[0].number_input("max_iter",step=1,min_value=1)
+
+        st.form_submit_button("launch")
+
+    # Only two or three selected columns can be plotted.
+    if 2 <= len(data_name) <= 3:
+        x = data[data_name].to_numpy()
+
+        kmeans = KMeans(n_clusters=n_clusters, init="random", n_init=n_init, max_iter=max_iter, random_state=111)
+        y_kmeans = kmeans.fit_predict(x)
+        centers = kmeans.cluster_centers_
+
+        fig = plt.figure()
+        if len(data_name) == 2:
+            ax = fig.add_subplot(projection='rectilinear')
+            # Draw on the axes that were just created rather than on pyplot's
+            # implicit "current" axes.
+            ax.scatter(x[:, 0], x[:, 1], c=y_kmeans, s=50, cmap="viridis")
+            ax.scatter(centers[:, 0], centers[:, 1], c="black", s=200, marker="X")
+        else:
+            ax = fig.add_subplot(projection='3d')
+            ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y_kmeans, s=50, cmap="viridis")
+            ax.scatter(centers[:, 0], centers[:, 1], centers[:, 2], c="black", s=200, marker="X")
+        st.pyplot(fig)
+
+else:
+    st.error("file not loaded")
--- a/frontend/pages/normalization.py
+++ b/frontend/pages/normalization.py
@ -0,0 +1,32 @@
+import streamlit as st
+from normstrategy import MVStrategy, ScalingStrategy
+
+# Streamlit page: for every column of the uploaded dataset, let the user pick
+# a missing-value strategy and a scaling strategy, apply both, and publish the
+# result as the working dataset.
+if "data" in st.session_state:
+    # Work on a copy so the pristine upload kept in original_data is never
+    # mutated and every rerun starts again from the raw values.
+    data = st.session_state.original_data.copy()
+
+    for column, series in data.items():
+        col1, col2 = st.columns(2)
+        missing_count = series.isna().sum()
+        choices = MVStrategy.list_available(data, series)
+        option = col1.selectbox(
+            f"Missing values of {column} ({missing_count})",
+            choices,
+            # Second entry of the list built by MVStrategy.list_available.
+            index=1,
+            key=f"mv-{column}",
+        )
+        # Always re-get the series to avoid reusing an invalidated series pointer
+        data = option.apply(data, column, data[column])
+
+        # The missing-value step may have changed the column (and its sum), so
+        # the scaling choices are computed from the current column, not from
+        # the series captured at the top of the iteration.
+        choices = ScalingStrategy.list_available(data, data[column])
+        option = col2.selectbox(
+            "Scaling",
+            choices,
+            key=f"scaling-{column}",
+        )
+        data = option.apply(data, column, data[column])
+
+    st.write(data)
+    st.session_state.data = data
+else:
+    st.error("file not loaded")
Author	SHA1	Message	Date
Hugo PRADIER	2d1c867bed	ajout prediction classification	10 months ago
Hugo PRADIER	a914c3f8f9	prediction de regression terminee	10 months ago
Hugo PRADIER	70641ebca4	debut prediction	10 months ago
Bastien OLLIER	e5f05a2c8a	Mise à jour de 'frontend/pages/clustering_kmeans.py'	10 months ago
Bastien OLLIER	972fde561f	Mise à jour de 'frontend/pages/clustering_dbscan.py'	10 months ago
Bastien OLLIER	694ecd0eef	Merge pull request 'Visualize clusters in 3d' (#6 ) from cluster3d into main Reviewed-on: #6 Reviewed-by: Clément FRÉVILLE <clement.freville2@etu.uca.fr>	10 months ago
Bastien OLLIER	e255c67972	Merge pull request 'Implement base missing values strategies' (#3 ) from feature/missing-values into main Reviewed-on: #3 Reviewed-by: Bastien OLLIER <bastien.ollier@noreply.codefirst.iut.uca.fr>	10 months ago
Bastien OLLIER	e48c3bfa50	add 3d plot to bdscan	10 months ago
Bastien OLLIER	52cb140746	add 3d to kmeans	10 months ago
Clément FRÉVILLE	6dcca29cbd	Rename to original_data	10 months ago
Bastien OLLIER	c1f5e55a0b	Merge pull request 'clustering' (#5 ) from clustering into main Reviewed-on: #5 Reviewed-by: Clément FRÉVILLE <clement.freville2@etu.uca.fr>	10 months ago
Bastien OLLIER	34f70b4d79	delete np	10 months ago
Bastien OLLIER	64cf65a417	max nb cluster to nb line	10 months ago
Bastien OLLIER	d4e33e7367	dbscan	10 months ago
Bastien OLLIER	72dcc8ff1c	add dbscan	10 months ago
Bastien OLLIER	9fc6d7d2d1	add dbscan	10 months ago
Clément FRÉVILLE	a325603fd9	Add scaling strategies	10 months ago
Bastien OLLIER	197939555c	debut dbscan	10 months ago
Clément FRÉVILLE	5f960df838	Support Pandas linear regression	11 months ago
Clément FRÉVILLE	63bce82b3b	Implement base MissingValues strategies	11 months ago