diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/frontend/exploration.py b/frontend/exploration.py
index 123a22b..4cac622 100644
--- a/frontend/exploration.py
+++ b/frontend/exploration.py
@@ -13,6 +13,7 @@ uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
 
 if uploaded_file is not None:
     st.session_state.data = pd.read_csv(uploaded_file)
+    st.session_state.original_data = st.session_state.data.copy()
     st.success("File loaded successfully!")
 
 
diff --git a/frontend/normstrategy.py b/frontend/normstrategy.py
new file mode 100644
index 0000000..2896c49
--- /dev/null
+++ b/frontend/normstrategy.py
@@ -0,0 +1,186 @@
+"""Strategies for handling missing values and scaling dataframe columns."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Union
+
+from pandas import DataFrame, Series, isna
+from pandas.api.types import is_numeric_dtype
+
+
+class DataFrameFunction(ABC):
+    """A command that may be applied in-place to a dataframe."""
+
+    @abstractmethod
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Apply the current function to the given dataframe, in-place.
+
+        The series is described by its label and dataframe. The dataframe
+        is returned so that calls can be chained.
+        """
+        return df
+
+
+class MVStrategy(DataFrameFunction):
+    """A way to handle missing values in a dataframe."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['MVStrategy']:
+        """Get all the strategies that can be used on the given series.
+
+        Mean, median and interpolation are only offered for numeric data.
+        """
+        choices = [DropStrategy(), ModeStrategy()]
+        if is_numeric_dtype(series):
+            choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
+        return choices
+
+
+class ScalingStrategy(DataFrameFunction):
+    """A way to rescale the values of a column in a dataframe."""
+
+    @staticmethod
+    def list_available(df: DataFrame, series: Series) -> list['ScalingStrategy']:
+        """Get all the strategies that can be used on the given series."""
+        choices = [KeepStrategy()]
+        if is_numeric_dtype(series):
+            choices.extend((MinMaxStrategy(), ZScoreStrategy()))
+            # Summing is only meaningful for numeric data, and dividing by
+            # a zero sum would produce infinities.
+            if series.sum() != 0:
+                choices.append(UnitLengthStrategy())
+        return choices
+
+
+class DropStrategy(MVStrategy):
+    """Drop every row where the column has a missing value."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        df.dropna(subset=[label], inplace=True)
+        return df
+
+    def __str__(self) -> str:
+        return "Drop"
+
+
+class PositionStrategy(MVStrategy):
+    """Replace missing values with a single value computed from the column."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        value = self.get_value(series)
+        # An entirely empty column has no usable replacement value.
+        if value is not None and not isna(value):
+            # Assign through the dataframe: filling the series in-place may
+            # only touch a temporary copy and leave the dataframe unchanged.
+            df[label] = series.fillna(value)
+        return df
+
+    @abstractmethod
+    def get_value(self, series: Series) -> Any:
+        """Compute the replacement value for the missing entries."""
+
+
+class MeanStrategy(PositionStrategy):
+    """Replace missing values with the mean of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        return series.mean()
+
+    def __str__(self) -> str:
+        return "Use mean"
+
+
+class MedianStrategy(PositionStrategy):
+    """Replace missing values with the median of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        return series.median()
+
+    def __str__(self) -> str:
+        return "Use median"
+
+
+class ModeStrategy(PositionStrategy):
+    """Replace missing values with the most frequent value of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Any:
+        modes = series.mode()
+        # mode() is empty when every value is missing.
+        return None if modes.empty else modes.iloc[0]
+
+    def __str__(self) -> str:
+        return "Use mode"
+
+
+class LinearRegressionStrategy(MVStrategy):
+    """Fill missing values by linear interpolation between their neighbours."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        # Assign through the dataframe so the result is not lost on a copy.
+        df[label] = series.interpolate()
+        return df
+
+    def __str__(self) -> str:
+        return "Use linear regression"
+
+
+class KeepStrategy(ScalingStrategy):
+    """Leave the column untouched."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        return df
+
+    def __str__(self) -> str:
+        return "No-op"
+
+
+class MinMaxStrategy(ScalingStrategy):
+    """Rescale the column linearly so that it spans the range [0, 1]."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        minimum = series.min()
+        span = series.max() - minimum
+        shifted = series - minimum
+        # A constant column has a zero span: map it to 0 instead of NaN.
+        df[label] = shifted if span == 0 else shifted / span
+        return df
+
+    def __str__(self) -> str:
+        return "Min-max"
+
+
+class ZScoreStrategy(ScalingStrategy):
+    """Centre the column on its mean and divide by its standard deviation."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        deviation = series.std()
+        centered = series - series.mean()
+        # A constant column has no spread: map it to 0 instead of NaN.
+        df[label] = centered if deviation == 0 else centered / deviation
+        return df
+
+    def __str__(self) -> str:
+        return "Z-Score"
+
+
+class UnitLengthStrategy(ScalingStrategy):
+    """Divide the column by its sum so that the values add up to 1."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        total = series.sum()
+        # Dividing by a zero sum would only produce infinities and NaN.
+        if total != 0:
+            df[label] = series / total
+        return df
+
+    def __str__(self) -> str:
+        return "Unit length"
diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py
new file mode 100644
index 0000000..3500988
--- /dev/null
+++ b/frontend/pages/normalization.py
@@ -0,0 +1,34 @@
+import streamlit as st
+from normstrategy import MVStrategy, ScalingStrategy
+
+if "original_data" in st.session_state:
+    # Work on a copy so that the uploaded data stays pristine across reruns.
+    data = st.session_state.original_data.copy()
+
+    for column, series in data.items():
+        col1, col2 = st.columns(2)
+        missing_count = series.isna().sum()
+        choices = MVStrategy.list_available(data, series)
+        option = col1.selectbox(
+            f"Missing values of {column} ({missing_count})",
+            choices,
+            index=1,
+            key=f"mv-{column}",
+        )
+        # Always re-get the series to avoid reusing an invalidated series pointer
+        data = option.apply(data, column, data[column])
+
+        # The available scalings depend on the values left after the
+        # missing-value handling, so look at the updated column.
+        choices = ScalingStrategy.list_available(data, data[column])
+        option = col2.selectbox(
+            "Scaling",
+            choices,
+            key=f"scaling-{column}",
+        )
+        data = option.apply(data, column, data[column])
+
+    st.write(data)
+    st.session_state.data = data
+else:
+    st.error("file not loaded")