diff --git a/frontend/mvstrategy.py b/frontend/normstrategy.py
similarity index 54%
rename from frontend/mvstrategy.py
rename to frontend/normstrategy.py
index fb7cc6c..2896c49 100644
--- a/frontend/mvstrategy.py
+++ b/frontend/normstrategy.py
@@ -3,16 +3,20 @@ from pandas import DataFrame, Series
 from pandas.api.types import is_numeric_dtype
 from typing import Any, Union
 
-class MVStrategy(ABC):
-    """A way to handle missing values in a dataframe."""
+class DataFrameFunction(ABC):
+    """A command that may be applied in-place to a dataframe."""
 
     @abstractmethod
     def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
-        """Apply the current strategy to the given series.
+        """Apply the current function to the given dataframe, in-place.
 
         The series is described by its label and dataframe."""
         return df
 
+
+class MVStrategy(DataFrameFunction):
+    """A way to handle missing values in a dataframe."""
+
     @staticmethod
     def list_available(df: DataFrame, series: Series) -> list['MVStrategy']:
         """Get all the strategies that can be used."""
@@ -22,6 +26,20 @@ class MVStrategy(ABC):
         return choices
 
 
+class ScalingStrategy(DataFrameFunction):
+    """A way to rescale the values of a column in a dataframe."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['ScalingStrategy']:
+        """Get all the scaling strategies that can be used on the series."""
+        choices = [KeepStrategy()]
+        if is_numeric_dtype(series):
+            choices.extend((MinMaxStrategy(), ZScoreStrategy()))
+            if series.sum() != 0:
+                choices.append(UnitLengthStrategy())
+        return choices
+
+
 class DropStrategy(MVStrategy):
     #@typing.override
     def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
@@ -77,3 +95,60 @@ class LinearRegressionStrategy(MVStrategy):
 
     def __str__(self) -> str:
         return "Use linear regression"
+
+
+class KeepStrategy(ScalingStrategy):
+    """Leave the column unscaled."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        return df
+
+    def __str__(self) -> str:
+        return "No-op"
+
+
+class MinMaxStrategy(ScalingStrategy):
+    """Rescale a column linearly so that its values span [0, 1]."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        minimum = series.min()
+        value_range = series.max() - minimum
+        if value_range == 0:
+            # A constant column would divide 0 by 0; map it to zeros instead.
+            value_range = 1
+        df[label] = (series - minimum) / value_range
+        return df
+
+    def __str__(self) -> str:
+        return "Min-max"
+
+
+class ZScoreStrategy(ScalingStrategy):
+    """Center a column on its mean and divide it by its standard deviation."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        deviation = series.std()
+        if not deviation > 0:
+            # The deviation is 0 for a constant column and NaN when fewer
+            # than two values are present; dividing by 1 keeps the result usable.
+            deviation = 1
+        df[label] = (series - series.mean()) / deviation
+        return df
+
+    def __str__(self) -> str:
+        return "Z-Score"
+
+
+class UnitLengthStrategy(ScalingStrategy):
+    """Divide a column by the sum of its values."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        df[label] = series / series.sum()
+        return df
+
+    def __str__(self) -> str:
+        return "Unit length"
diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py
index 7dd5b84..ca40f91 100644
--- a/frontend/pages/normalization.py
+++ b/frontend/pages/normalization.py
@@ -1,14 +1,15 @@
 import streamlit as st
-from mvstrategy import MVStrategy
+from normstrategy import MVStrategy, ScalingStrategy
 
 if "data" in st.session_state:
     data = st.session_state.data
     st.session_state.data = data.copy()
 
     for column, series in data.items():
+        col1, col2 = st.columns(2)
         missing_count = series.isna().sum()
         choices = MVStrategy.list_available(data, series)
-        option = st.selectbox(
+        option = col1.selectbox(
             f"Missing values of {column} ({missing_count})",
             choices,
             index=1,
@@ -17,6 +18,15 @@ if "data" in st.session_state:
         # Always re-get the series to avoid reusing an invalidated series pointer
         data = option.apply(data, column, data[column])
 
+        # Offer scaling choices for the column as it is after missing-value handling
+        choices = ScalingStrategy.list_available(data, data[column])
+        option = col2.selectbox(
+            "Scaling",
+            choices,
+            key=f"scaling-{column}",
+        )
+        data = option.apply(data, column, data[column])
+
     st.write(data)
     st.session_state.working_data = data
 else: