From 63bce82b3b4a222fa6ff18d2ed942455ed3caa6e Mon Sep 17 00:00:00 2001 From: clfreville2 Date: Fri, 7 Jun 2024 10:50:44 +0200 Subject: [PATCH 1/4] Implement base MissingValues strategies --- .gitignore | 1 + frontend/exploration.py | 1 + frontend/mvstrategy.py | 70 +++++++++++++++++++++++++++++++++ frontend/pages/normalization.py | 23 +++++++++++ frontend/pages/visualization.py | 4 +- 5 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 .gitignore create mode 100644 frontend/mvstrategy.py create mode 100644 frontend/pages/normalization.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/frontend/exploration.py b/frontend/exploration.py index 123a22b..43d8c72 100644 --- a/frontend/exploration.py +++ b/frontend/exploration.py @@ -13,6 +13,7 @@ uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"]) if uploaded_file is not None: st.session_state.data = pd.read_csv(uploaded_file) + st.session_state.working_data = st.session_state.data st.success("File loaded successfully!") diff --git a/frontend/mvstrategy.py b/frontend/mvstrategy.py new file mode 100644 index 0000000..81db2f8 --- /dev/null +++ b/frontend/mvstrategy.py @@ -0,0 +1,70 @@ +from abc import ABC, abstractmethod +from pandas import DataFrame, Series +from pandas.api.types import is_numeric_dtype +from typing import Any, Union + +class MVStrategy(ABC): + """A way to handle missing values in a dataframe.""" + + @abstractmethod + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + """Apply the current strategy to the given series. + + The series is described by its label and dataframe.""" + return df + + @staticmethod + def list_available(series: Series) -> list['MVStrategy']: + """Get all the strategies that can be used.""" + choices = [DropStrategy(), ModeStrategy()] + if is_numeric_dtype(series): + choices.extend((MeanStrategy(), MedianStrategy())) + return choices + + +class DropStrategy(MVStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + df.dropna(subset=label, inplace=True) + return df + + def __str__(self) -> str: + return "Drop" + + +class PositionStrategy(MVStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + series.fillna(self.get_value(series), inplace=True) + return df + + @abstractmethod + def get_value(self, series: Series) -> Any: + pass + + +class MeanStrategy(PositionStrategy): + #@typing.override + def get_value(self, series: Series) -> Union[int, float]: + return series.mean() + + def __str__(self) -> str: + return "Use mean" + + +class MedianStrategy(PositionStrategy): + #@typing.override + def get_value(self, series: Series) -> Union[int, float]: + return series.median() + + def __str__(self) -> str: + return "Use median" + + +class ModeStrategy(PositionStrategy): + #@typing.override + def get_value(self, series: Series) -> Any: + return series.mode()[0] + + def __str__(self) -> str: + return "Use mode" diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py new file mode 100644 index 0000000..4f20c7a --- /dev/null +++ b/frontend/pages/normalization.py @@ -0,0 +1,23 @@ +import streamlit as st +from mvstrategy import MVStrategy + +if "data" in st.session_state: + data = st.session_state.data + st.session_state.data = data.copy() + + for column, series in data.items(): + missing_count = series.isna().sum() + choices = MVStrategy.list_available(series) + option = st.selectbox( + f"Missing values of {column} ({missing_count})", + choices, + index=1, + key=f"mv-{column}", + ) + # Always re-get the series to avoid reusing an invalidated series pointer + data = option.apply(data, column, data[column]) + + st.write(data) + st.session_state.working_data = data +else: + st.error("file not loaded") diff --git a/frontend/pages/visualization.py b/frontend/pages/visualization.py index 057b0c9..6ca8270 100644 --- a/frontend/pages/visualization.py +++ b/frontend/pages/visualization.py @@ -5,8 +5,8 @@ import seaborn as sns st.header("Data Visualization") -if "data" in st.session_state: - data = st.session_state.data +if "working_data" in st.session_state: + data = st.session_state.working_data st.subheader("Histogram") column_to_plot = st.selectbox("Select Column for Histogram", data.columns) -- 2.36.3 From 5f960df83842e866833d863b63d08c0a0348acfd Mon Sep 17 00:00:00 2001 From: clfreville2 Date: Fri, 7 Jun 2024 11:58:52 +0200 Subject: [PATCH 2/4] Support Pandas linear regression --- frontend/mvstrategy.py | 13 +++++++++++-- frontend/pages/normalization.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/frontend/mvstrategy.py b/frontend/mvstrategy.py index 81db2f8..fb7cc6c 100644 --- a/frontend/mvstrategy.py +++ b/frontend/mvstrategy.py @@ -14,11 +14,11 @@ class MVStrategy(ABC): return df @staticmethod - def list_available(series: Series) -> list['MVStrategy']: + def list_available(df: DataFrame, series: Series) -> list['MVStrategy']: """Get all the strategies that can be used.""" choices = [DropStrategy(), ModeStrategy()] if is_numeric_dtype(series): - choices.extend((MeanStrategy(), MedianStrategy())) + choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy())) return choices @@ -68,3 +68,12 @@ class ModeStrategy(PositionStrategy): def __str__(self) -> str: return "Use mode" + + +class LinearRegressionStrategy(MVStrategy): + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + series.interpolate(inplace=True) + return df + + def __str__(self) -> str: + return "Use linear regression" diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py index 4f20c7a..7dd5b84 100644 --- a/frontend/pages/normalization.py +++ b/frontend/pages/normalization.py @@ -7,7 +7,7 @@ if "data" in st.session_state: for column, series in data.items(): missing_count = series.isna().sum() - choices = MVStrategy.list_available(series) + choices = MVStrategy.list_available(data, series) option = st.selectbox( f"Missing values of {column} ({missing_count})", choices, -- 2.36.3 From a325603fd96fa3eeebde3f1d9b661019a1cfff82 Mon Sep 17 00:00:00 2001 From: clfreville2 Date: Wed, 19 Jun 2024 09:04:39 +0200 Subject: [PATCH 3/4] Add scaling strategies --- frontend/{mvstrategy.py => normstrategy.py} | 65 ++++++++++++++++++++- frontend/pages/normalization.py | 13 ++++- 2 files changed, 73 insertions(+), 5 deletions(-) rename frontend/{mvstrategy.py => normstrategy.py} (54%) diff --git a/frontend/mvstrategy.py b/frontend/normstrategy.py similarity index 54% rename from frontend/mvstrategy.py rename to frontend/normstrategy.py index fb7cc6c..2896c49 100644 --- a/frontend/mvstrategy.py +++ b/frontend/normstrategy.py @@ -3,16 +3,20 @@ from pandas import DataFrame, Series from pandas.api.types import is_numeric_dtype from typing import Any, Union -class MVStrategy(ABC): - """A way to handle missing values in a dataframe.""" +class DataFrameFunction(ABC): + """A command that may be applied in-place to a dataframe.""" @abstractmethod def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: - """Apply the current strategy to the given series. + """Apply the current function to the given dataframe, in-place. The series is described by its label and dataframe.""" return df + +class MVStrategy(DataFrameFunction): + """A way to handle missing values in a dataframe.""" + @staticmethod def list_available(df: DataFrame, series: Series) -> list['MVStrategy']: """Get all the strategies that can be used.""" @@ -22,6 +26,20 @@ class MVStrategy(ABC): return choices +class ScalingStrategy(DataFrameFunction): + """A way to handle missing values in a dataframe.""" + + @staticmethod + def list_available(df: DataFrame, series: Series) -> list['MVStrategy']: + """Get all the strategies that can be used.""" + choices = [KeepStrategy()] + if is_numeric_dtype(series): + choices.extend((MinMaxStrategy(), ZScoreStrategy())) + if series.sum() != 0: + choices.append(UnitLengthStrategy()) + return choices + + class DropStrategy(MVStrategy): #@typing.override def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: @@ -77,3 +95,44 @@ class LinearRegressionStrategy(MVStrategy): def __str__(self) -> str: return "Use linear regression" + + +class KeepStrategy(ScalingStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + return df + + def __str__(self) -> str: + return "No-op" + + +class MinMaxStrategy(ScalingStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + minimum = series.min() + maximum = series.max() + df[label] = (series - minimum) / (maximum - minimum) + return df + + def __str__(self) -> str: + return "Min-max" + + +class ZScoreStrategy(ScalingStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + df[label] = (series - series.mean()) / series.std() + return df + + def __str__(self) -> str: + return "Z-Score" + + +class UnitLengthStrategy(ScalingStrategy): + #@typing.override + def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame: + df[label] = series / series.sum() + return df + + def __str__(self) -> str: + return "Unit length" diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py index 7dd5b84..ca40f91 100644 --- a/frontend/pages/normalization.py +++ b/frontend/pages/normalization.py @@ -1,14 +1,15 @@ import streamlit as st -from mvstrategy import MVStrategy +from normstrategy import MVStrategy, ScalingStrategy if "data" in st.session_state: data = st.session_state.data st.session_state.data = data.copy() for column, series in data.items(): + col1, col2 = st.columns(2) missing_count = series.isna().sum() choices = MVStrategy.list_available(data, series) - option = st.selectbox( + option = col1.selectbox( f"Missing values of {column} ({missing_count})", choices, index=1, @@ -17,6 +18,14 @@ if "data" in st.session_state: # Always re-get the series to avoid reusing an invalidated series pointer data = option.apply(data, column, data[column]) + choices = ScalingStrategy.list_available(data, series) + option = col2.selectbox( + "Scaling", + choices, + key=f"scaling-{column}", + ) + data = option.apply(data, column, data[column]) + st.write(data) st.session_state.working_data = data else: -- 2.36.3 From 6dcca29cbd36f6aa1d51cf2ef4539d6577046384 Mon Sep 17 00:00:00 2001 From: clfreville2 Date: Wed, 19 Jun 2024 09:49:16 +0200 Subject: [PATCH 4/4] Rename to original_data --- frontend/exploration.py | 2 +- frontend/pages/normalization.py | 6 +++--- frontend/pages/visualization.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/frontend/exploration.py b/frontend/exploration.py index 43d8c72..4cac622 100644 --- a/frontend/exploration.py +++ b/frontend/exploration.py @@ -13,7 +13,7 @@ uploaded_file = st.file_uploader("Upload your CSV file", type=["csv"]) if uploaded_file is not None: st.session_state.data = pd.read_csv(uploaded_file) - st.session_state.working_data = st.session_state.data + st.session_state.original_data = st.session_state.data st.success("File loaded successfully!") diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py index ca40f91..3500988 100644 --- a/frontend/pages/normalization.py +++ b/frontend/pages/normalization.py @@ -2,8 +2,8 @@ import streamlit as st from normstrategy import MVStrategy, ScalingStrategy if "data" in st.session_state: - data = st.session_state.data - st.session_state.data = data.copy() + data = st.session_state.original_data + st.session_state.original_data = data.copy() for column, series in data.items(): col1, col2 = st.columns(2) @@ -27,6 +27,6 @@ if "data" in st.session_state: data = option.apply(data, column, data[column]) st.write(data) - st.session_state.working_data = data + st.session_state.data = data else: st.error("file not loaded") diff --git a/frontend/pages/visualization.py b/frontend/pages/visualization.py index 6ca8270..057b0c9 100644 --- a/frontend/pages/visualization.py +++ b/frontend/pages/visualization.py @@ -5,8 +5,8 @@ import seaborn as sns st.header("Data Visualization") -if "working_data" in st.session_state: - data = st.session_state.working_data +if "data" in st.session_state: + data = st.session_state.data st.subheader("Histogram") column_to_plot = st.selectbox("Select Column for Histogram", data.columns) -- 2.36.3