diff --git a/frontend/normstrategy.py b/frontend/normstrategy.py
index 2896c49..af4dde3 100644
--- a/frontend/normstrategy.py
+++ b/frontend/normstrategy.py
@@ -1,6 +1,7 @@
 from abc import ABC, abstractmethod
 from pandas import DataFrame, Series
 from pandas.api.types import is_numeric_dtype
+from sklearn.neighbors import KNeighborsRegressor
 from typing import Any, Union
 
 class DataFrameFunction(ABC):
@@ -18,11 +19,14 @@ class MVStrategy(DataFrameFunction):
     """A way to handle missing values in a dataframe."""
 
     @staticmethod
-    def list_available(df: DataFrame, series: Series) -> list['MVStrategy']:
+    def list_available(df: DataFrame, label: str, series: Series) -> list['MVStrategy']:
         """Get all the strategies that can be used."""
         choices = [DropStrategy(), ModeStrategy()]
         if is_numeric_dtype(series):
             choices.extend((MeanStrategy(), MedianStrategy(), LinearRegressionStrategy()))
+            other_columns = df.select_dtypes(include="number").drop(label, axis=1).columns.to_list()
+            if other_columns:
+                choices.append(KNNStrategy(other_columns))
         return choices
 
 
@@ -97,6 +101,45 @@ class LinearRegressionStrategy(MVStrategy):
         return "Use linear regression"
 
 
+class KNNStrategy(MVStrategy):
+    """Fill missing values with the mean of the k nearest rows."""
+
+    def __init__(self, training_features: list[str]):
+        # Separate copies: changing the selection must not change what is offered
+        self.available_features = list(training_features)
+        self.training_features = list(training_features)
+        self.n_neighbors = 3
+
+    def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
+        """Return df with the gaps of column `label` predicted from the training columns."""
+        features = list(self.training_features)
+        if not features:
+            return df
+        # Rows with a gap in any training column can neither train nor be predicted
+        usable_data = df.dropna(subset=features)
+        missing = usable_data[label].isna()
+        train_data = usable_data[~missing]
+        test_data = usable_data[missing]
+        if train_data.empty or test_data.empty:
+            return df
+
+        # The target is numeric, so average the neighbors instead of classifying;
+        # kNN cannot look at more neighbors than there are training rows
+        n_neighbors = min(self.n_neighbors, len(train_data))
+        reg = KNeighborsRegressor(n_neighbors=n_neighbors)
+        reg.fit(train_data[features], train_data[label])
+        predicted = reg.predict(test_data[features])
+
+        # Assign through the original row labels so every prediction lands on
+        # its own row, and on a copy so the caller's dataframe stays untouched
+        result = df.copy()
+        result.loc[test_data.index, label] = predicted
+        return result
+
+    def __str__(self) -> str:
+        return "kNN"
+
+
 class KeepStrategy(ScalingStrategy):
     #@typing.override
     def apply(self, df: DataFrame, label: str, series: Series) -> DataFrame:
diff --git a/frontend/pages/normalization.py b/frontend/pages/normalization.py
index 3500988..b543f87 100644
--- a/frontend/pages/normalization.py
+++ b/frontend/pages/normalization.py
@@ -1,5 +1,5 @@
 import streamlit as st
-from normstrategy import MVStrategy, ScalingStrategy
+from normstrategy import MVStrategy, ScalingStrategy, KNNStrategy
 
 if "data" in st.session_state:
     data = st.session_state.original_data
@@ -8,13 +8,26 @@ if "data" in st.session_state:
     for column, series in data.items():
         col1, col2 = st.columns(2)
         missing_count = series.isna().sum()
-        choices = MVStrategy.list_available(data, series)
+        choices = MVStrategy.list_available(data, column, series)
         option = col1.selectbox(
             f"Missing values of {column} ({missing_count})",
             choices,
             index=1,
             key=f"mv-{column}",
         )
+        if isinstance(option, KNNStrategy):
+            option.training_features = st.multiselect(
+                "Training columns",
+                option.available_features,
+                default=option.training_features,
+                key=f"cols-{column}",
+            )
+            option.n_neighbors = st.number_input(
+                "Number of neighbors",
+                min_value=1,
+                value=option.n_neighbors,
+                key=f"neighbors-{column}",
+            )
         # Always re-get the series to avoid reusing an invalidated series pointer
         data = option.apply(data, column, data[column])
 