from abc import ABC, abstractmethod
from typing import Any, Union

from pandas import DataFrame, Series
from pandas.api.types import is_numeric_dtype


class DataFrameFunction(ABC):
    """A command that may be applied in-place to a dataframe."""

    @abstractmethod
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Apply the current function to the given dataframe, in-place.

        The series is described by its label and dataframe.

        Args:
            df: The dataframe to modify.
            label: The label of the column of ``df`` to work on.
            series: The column of ``df`` designated by ``label``.

        Returns:
            The dataframe that was passed in.
        """
        return df


class MVStrategy(DataFrameFunction):
    """A way to handle missing values in a dataframe."""

    @staticmethod
    def list_available(df: DataFrame, series: Series) -> list['MVStrategy']:
        """Get all the missing-value strategies that can be used.

        Dropping and mode replacement are always offered; mean, median and
        interpolation are only offered when the series has a numeric dtype.

        Args:
            df: The dataframe owning the series (currently not inspected).
            series: The column for which strategies are requested.

        Returns:
            A fresh list of strategy instances applicable to ``series``.
        """
        choices: list['MVStrategy'] = [DropStrategy(), ModeStrategy()]
        if is_numeric_dtype(series):
            choices.extend(
                (MeanStrategy(), MedianStrategy(), LinearRegressionStrategy())
            )
        return choices


class ScalingStrategy(DataFrameFunction):
    """A way to rescale the values of a dataframe column."""

    @staticmethod
    def list_available(
        df: DataFrame, series: Series
    ) -> list['ScalingStrategy']:
        """Get all the scaling strategies that can be used.

        The no-op strategy is always offered. Numeric series additionally
        get min-max and z-score scaling, plus unit-length scaling when the
        sum of the series is not zero (that strategy divides by the sum).

        Args:
            df: The dataframe owning the series (currently not inspected).
            series: The column for which strategies are requested.

        Returns:
            A fresh list of strategy instances applicable to ``series``.
        """
        choices: list['ScalingStrategy'] = [KeepStrategy()]
        if is_numeric_dtype(series):
            choices.extend((MinMaxStrategy(), ZScoreStrategy()))
            # Only meaningful for numeric data: summing a non-numeric
            # series would not yield a usable divisor.
            if series.sum() != 0:
                choices.append(UnitLengthStrategy())
        return choices


class DropStrategy(MVStrategy):
    """Remove every row whose value in the target column is missing."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Drop, in-place, the rows of ``df`` where column ``label`` is NA."""
        # Pass a list: a scalar ``subset`` is not accepted by every pandas
        # version, whereas a list of labels always is.
        df.dropna(subset=[label], inplace=True)
        return df

    def __str__(self) -> str:
        return "Drop"


class PositionStrategy(MVStrategy):
    """Replace missing values with a single value computed from the series."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Fill the missing values of column ``label`` with ``get_value``.

        The filled column is assigned back to ``df`` rather than filling
        ``series`` in place: mutating a series extracted from a dataframe
        does not reliably propagate to the dataframe (it never does when
        pandas Copy-on-Write is active).
        """
        df[label] = series.fillna(self.get_value(series))
        return df

    @abstractmethod
    def get_value(self, series: Series) -> Any:
        """Return the value used to replace missing entries of ``series``."""


class MeanStrategy(PositionStrategy):
    """Replace missing values with the mean of the series."""

    # @typing.override
    def get_value(self, series: Series) -> Union[int, float]:
        """Return ``series.mean()``."""
        return series.mean()

    def __str__(self) -> str:
        return "Use mean"


class MedianStrategy(PositionStrategy):
    """Replace missing values with the median of the series."""

    # @typing.override
    def get_value(self, series: Series) -> Union[int, float]:
        """Return ``series.median()``."""
        return series.median()

    def __str__(self) -> str:
        return "Use median"


class ModeStrategy(PositionStrategy):
    """Replace missing values with the mode of the series."""

    # @typing.override
    def get_value(self, series: Series) -> Any:
        """Return the first entry of ``series.mode()``."""
        return series.mode()[0]

    def __str__(self) -> str:
        return "Use mode"


class LinearRegressionStrategy(MVStrategy):
    """Replace missing values using ``Series.interpolate``.

    NOTE(review): despite the name, no regression is fitted; the pandas
    default interpolation method is used.
    """

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Interpolate the missing values of column ``label``.

        The interpolated column is assigned back to ``df`` instead of
        interpolating ``series`` in place, so that the dataframe is updated
        regardless of whether ``series`` shares memory with it.
        """
        df[label] = series.interpolate()
        return df

    def __str__(self) -> str:
        return "Use linear regression"


class KeepStrategy(ScalingStrategy):
    """Leave the column untouched."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Return ``df`` unchanged."""
        return df

    def __str__(self) -> str:
        return "No-op"


class MinMaxStrategy(ScalingStrategy):
    """Rescale the column with ``(x - min) / (max - min)``."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Replace column ``label`` with its min-max scaled values.

        When every value is identical the range is zero; the range is then
        treated as 1 so the column becomes all zeros instead of all NaN.
        """
        minimum = series.min()
        span = series.max() - minimum
        if span == 0:
            # Constant column: avoid 0 / 0, which would wipe out the data.
            span = 1
        df[label] = (series - minimum) / span
        return df

    def __str__(self) -> str:
        return "Min-max"


class ZScoreStrategy(ScalingStrategy):
    """Rescale the column with ``(x - mean) / std``."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Replace column ``label`` with its z-scores.

        When the standard deviation is exactly zero it is treated as 1, so
        the column is only centred rather than turned into NaN.
        """
        deviation = series.std()
        if deviation == 0:
            # Constant column: avoid 0 / 0, which would wipe out the data.
            deviation = 1
        df[label] = (series - series.mean()) / deviation
        return df

    def __str__(self) -> str:
        return "Z-Score"


class UnitLengthStrategy(ScalingStrategy):
    """Rescale the column by dividing every value by the column sum."""

    # @typing.override
    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
        """Replace column ``label`` with ``series / series.sum()``.

        No zero check is done here; ``ScalingStrategy.list_available`` only
        offers this strategy when the sum is not zero.
        """
        df[label] = series / series.sum()
        return df

    def __str__(self) -> str:
        return "Unit length"