diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bee8a64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/frontend/exploration.py b/frontend/exploration.py
index 123a22b..43d8c72 100644
--- a/frontend/exploration.py
+++ b/frontend/exploration.py
@@ -13,6 +13,7 @@ uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"])
 
 if uploaded_file is not None:
     st.session_state.data = pd.read_csv(uploaded_file)
+    st.session_state.working_data = st.session_state.data
     st.success("File loaded successfully!")
 
 
diff --git a/frontend/mvstrategy.py b/frontend/mvstrategy.py
new file mode 100644
index 0000000..81db2f8
--- /dev/null
+++ b/frontend/mvstrategy.py
@@ -0,0 +1,97 @@
+"""Strategies for handling missing values in a dataframe column."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Union
+
+from pandas import DataFrame, Series
+from pandas.api.types import is_numeric_dtype
+
+
+class MVStrategy(ABC):
+    """A way to handle missing values in a dataframe."""
+
+    @abstractmethod
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Apply the current strategy to the given series.
+
+        The series is described by its label and dataframe. The dataframe
+        may be modified in place, so callers should use the returned one.
+        """
+        return df
+
+    @staticmethod
+    def list_available(series: Series) -> list['MVStrategy']:
+        """Get all the strategies that can be used."""
+        choices = [DropStrategy(), ModeStrategy()]
+        if is_numeric_dtype(series):
+            # Mean and median only make sense for numeric data.
+            choices.extend((MeanStrategy(), MedianStrategy()))
+        return choices
+
+
+class DropStrategy(MVStrategy):
+    """Drop every row whose value in the column is missing."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        # Pass a list: older pandas releases reject a bare label as subset.
+        df.dropna(subset=[label], inplace=True)
+        return df
+
+    def __str__(self) -> str:
+        return "Drop"
+
+
+class PositionStrategy(MVStrategy):
+    """Fill missing values with one value computed from the column."""
+
+    #@typing.override
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        value = self.get_value(series)
+        if value is None:
+            # Nothing to fill with, e.g. every value of the column is missing.
+            return df
+        # Assign the filled column back rather than filling `series` in place:
+        # `series` may be a copy of the column (it always behaves as one under
+        # pandas Copy-on-Write), so an in-place fill might never reach `df`.
+        df[label] = series.fillna(value)
+        return df
+
+    @abstractmethod
+    def get_value(self, series: Series) -> Any:
+        """Return the replacement value, or None if there is none."""
+
+
+class MeanStrategy(PositionStrategy):
+    """Fill missing values with the mean of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        return series.mean()
+
+    def __str__(self) -> str:
+        return "Use mean"
+
+
+class MedianStrategy(PositionStrategy):
+    """Fill missing values with the median of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Union[int, float]:
+        return series.median()
+
+    def __str__(self) -> str:
+        return "Use median"
+
+
+class ModeStrategy(PositionStrategy):
+    """Fill missing values with the most frequent value of the column."""
+
+    #@typing.override
+    def get_value(self, series: Series) -> Any:
+        modes = series.mode()
+        # mode() returns an empty series when every value is missing.
+        return None if modes.empty else modes.iloc[0]
+
+    def __str__(self) -> str:
+        return "Use mode"
diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py
new file mode 100644
index 0000000..4f20c7a
--- /dev/null
+++ b/frontend/pages/normalization.py
@@ -0,0 +1,25 @@
+import streamlit as st
+
+from mvstrategy import MVStrategy
+
+if "data" in st.session_state:
+    # Work on a copy so that the loaded data is never modified.
+    data = st.session_state.data.copy()
+
+    for column in list(data.columns):
+        # Always re-get the series: a previous strategy may have replaced it.
+        series = data[column]
+        missing_count = series.isna().sum()
+        choices = MVStrategy.list_available(series)
+        option = st.selectbox(
+            f"Missing values of {column} ({missing_count})",
+            choices,
+            index=1,  # "Use mode" is the default strategy
+            key=f"mv-{column}",
+        )
+        data = option.apply(data, column, series)
+
+    st.write(data)
+    st.session_state.working_data = data
+else:
+    st.error("file not loaded")
diff --git a/frontend/pages/visualization.py b/frontend/pages/visualization.py
index 057b0c9..6ca8270 100644
--- a/frontend/pages/visualization.py
+++ b/frontend/pages/visualization.py
@@ -5,8 +5,8 @@ import seaborn as sns
 
 st.header("Data Visualization")
 
-if "data" in st.session_state:
-    data = st.session_state.data
+if "working_data" in st.session_state:
+    data = st.session_state.working_data
 
     st.subheader("Histogram")
     column_to_plot = st.selectbox("Select Column for Histogram", data.columns)