From 987e255dad01e15899c54a4e10d9d933b8a72787 Mon Sep 17 00:00:00 2001
From: bastien ollier <bastien.ollier@etu.uca.fr>
Date: Tue, 25 Jun 2024 08:23:37 +0200
Subject: [PATCH] delete ':' from page file names, move .streamlit config, add backend.normstrategy

---
 .../.streamlit => .streamlit}/config.toml     |   0
 backend/__init__.py                           |   1 +
 backend/normstrategy.py                       | 179 ++++++++++++++++++
 ...tering:_dbscan.py => clustering_dbscan.py} |   0
 ...tering:_kmeans.py => clustering_kmeans.py} |   0
 5 files changed, 180 insertions(+)
 rename {frontend/.streamlit => .streamlit}/config.toml (100%)
 create mode 100644 backend/__init__.py
 create mode 100644 backend/normstrategy.py
 rename frontend/pages/{clustering:_dbscan.py => clustering_dbscan.py} (100%)
 rename frontend/pages/{clustering:_kmeans.py => clustering_kmeans.py} (100%)

diff --git a/frontend/.streamlit/config.toml b/.streamlit/config.toml
similarity index 100%
rename from frontend/.streamlit/config.toml
rename to .streamlit/config.toml
diff --git a/backend/__init__.py b/backend/__init__.py
new file mode 100644
index 0000000..f68492a
--- /dev/null
+++ b/backend/__init__.py
@@ -0,0 +1 @@
+from . import normstrategy
\ No newline at end of file
diff --git a/backend/normstrategy.py b/backend/normstrategy.py
new file mode 100644
index 0000000..5a50bab
--- /dev/null
+++ b/backend/normstrategy.py
@@ -0,0 +1,179 @@
+from abc import ABC, abstractmethod
+from pandas import DataFrame, Series
+from pandas.api.types import is_numeric_dtype
+from sklearn.neighbors import KNeighborsClassifier
+from typing import Any, Union
+
+class DataFrameFunction(ABC):
+    """Base class for commands that transform a dataframe in-place."""
+
+    @abstractmethod
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Run this command on `df`, modifying it in-place, and return it.
+
+        `series` is the column of `df` identified by `label`."""
+        return df
+
+
+class MVStrategy(DataFrameFunction):
+    """A way to handle missing values in one column of a dataframe."""
+
+    @staticmethod
+    def list_available(df: DataFrame, label: str, series: Series) -> list['MVStrategy']:
+        """Get all the strategies that can be used on column `label` (`series`) of `df`."""
+        choices = [DropStrategy(), ModeStrategy()]
+        if is_numeric_dtype(series):  # NB: true for bool, which "number" below excludes
+            choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
+            others = [c for c in df.select_dtypes("number").columns if c != label]
+            if others:  # kNN needs at least one other numeric column to learn from
+                choices.append(KNNStrategy(others))
+        return choices
+
+
+class ScalingStrategy(DataFrameFunction):
+    """A way to rescale the values of a dataframe column."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['ScalingStrategy']:
+        """Get all the scaling strategies usable on `series`; `df` is not consulted."""
+        choices = [KeepStrategy()]
+        if is_numeric_dtype(series):
+            choices.extend((MinMaxStrategy(), ZScoreStrategy()))
+            if series.sum() != 0:  # unit-length scaling divides by this sum
+                choices.append(UnitLengthStrategy())
+        return choices
+
+
+class DropStrategy(MVStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Drop, in-place, the rows of `df` whose `label` value is missing."""
+        df.dropna(subset=[label], inplace=True)  # list form: a bare label needs pandas >= 1.5
+        return df
+
+    def __str__(self) -> str:
+        return "Drop"
+
+
+class PositionStrategy(MVStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Replace the missing values of `label` with `self.get_value(series)`."""
+        df[label] = series.fillna(self.get_value(series))  # assign: `series` may be a copy
+        return df
+
+    @abstractmethod
+    def get_value(self, series: Series) -> Any:
+        """Return the value that replaces the missing entries of `series`."""
+
+
+class MeanStrategy(PositionStrategy):
+    def get_value(self, series: Series) -> Union[int, float]:
+        """Return the arithmetic mean of the series (pandas skips NaN by default)."""
+        return series.mean()
+
+    def __str__(self) -> str:
+        return "Use mean"
+
+
+class MedianStrategy(PositionStrategy):
+    def get_value(self, series: Series) -> Union[int, float]:
+        """Return the median of the series (pandas skips NaN by default)."""
+        return series.median()
+
+    def __str__(self) -> str:
+        return "Use median"
+
+
+class ModeStrategy(PositionStrategy):
+    def get_value(self, series: Series) -> Any:
+        """Return the first mode of the series, or NaN when it has no non-missing value."""
+        return next(iter(series.mode()), float("nan"))
+
+    def __str__(self) -> str:
+        return "Use mode"
+
+
+class LinearRegressionStrategy(MVStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        df[label] = series.interpolate()  # assigned back: `series` may be a copy of the column
+        return df
+
+    def __str__(self) -> str:
+        return "Use linear regression"
+
+
+class KNNStrategy(MVStrategy):
+    def __init__(self, training_features: list[str]):
+        self.available_features = training_features
+        self.training_features = list(training_features)  # copy, so the two lists never alias
+        self.n_neighbors = 3
+
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Fill the missing values of `label` in-place with k-NN predictions.
+
+        Rows missing any of `self.training_features` are neither used nor filled.
+        """
+        # Keep only the rows where every training feature is present
+        usable_data = df.dropna(subset=self.training_features)
+        is_missing = usable_data[label].isnull()
+        train_data = usable_data[~is_missing]
+        test_data = usable_data[is_missing]
+        if train_data.empty or test_data.empty:
+            return df  # nothing to learn from, or nothing to fill
+
+        # Fit on the selected features only; other columns may be non-numeric or hold NaN.
+        # NOTE(review): scikit-learn classifiers reject non-integral float targets, so
+        # KNeighborsRegressor may be what is intended here -- confirm before switching.
+        model = KNeighborsClassifier(self.n_neighbors)
+        model.fit(train_data[self.training_features], train_data[label])
+        predicted = model.predict(test_data[self.training_features])
+        # Assign by index label so each prediction lands on the row it was computed for
+        df.loc[test_data.index, label] = predicted
+        return df
+
+    def count_max(self, df: DataFrame, label: str) -> int:
+        usable_data = df.dropna(subset=self.training_features)
+        return int(usable_data[label].count())  # non-missing `label` values among usable rows
+
+    def __str__(self) -> str:
+        return "kNN"
+
+
+class KeepStrategy(ScalingStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Leave the dataframe untouched and return it as-is."""
+        return df
+
+    def __str__(self) -> str:
+        return "No-op"
+
+
+class MinMaxStrategy(ScalingStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Rescale `label` to [0, 1]; a constant column maps to 0 instead of NaN."""
+        minimum = series.min()
+        span = series.max() - minimum
+        df[label] = (series - minimum) / (span if span != 0 else 1)
+        return df
+
+    def __str__(self) -> str:
+        return "Min-max"
+
+
+class ZScoreStrategy(ScalingStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Standardise `label` (sample std); a zero-spread column maps to 0, not NaN."""
+        df[label] = (series - series.mean()) / (series.std() or 1)
+        return df
+
+    def __str__(self) -> str:
+        return "Z-Score"
+
+
+class UnitLengthStrategy(ScalingStrategy):
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Divide the column by the sum of its values and store it under `label`."""
+        df[label] = series.div(series.sum())
+        return df
+
+    def __str__(self) -> str:
+        return "Unit length"
diff --git a/frontend/pages/clustering:_dbscan.py b/frontend/pages/clustering_dbscan.py
similarity index 100%
rename from frontend/pages/clustering:_dbscan.py
rename to frontend/pages/clustering_dbscan.py
diff --git a/frontend/pages/clustering:_kmeans.py b/frontend/pages/clustering_kmeans.py
similarity index 100%
rename from frontend/pages/clustering:_kmeans.py
rename to frontend/pages/clustering_kmeans.py