From da3876fa7d5aee6318944199516776c12863edd6 Mon Sep 17 00:00:00 2001
From: dorian
Date: Mon, 24 Jun 2024 12:19:13 +0200
Subject: [PATCH] Correcting impute regression

---
 src/back/managing_missing_values.py | 33 ++++++++++++++++++++++++---------
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/back/managing_missing_values.py b/src/back/managing_missing_values.py
index 8360cf1..7ee920f 100644
--- a/src/back/managing_missing_values.py
+++ b/src/back/managing_missing_values.py
@@ -33,14 +33,30 @@ def impute_with_knn(data, n_neighbors=5):
     return pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
 
 def impute_with_regression(data):
-    for column in data.columns:
-        if data[column].isnull().sum() > 0:
-            train_data = data[data[column].notna()]
-            test_data = data[data[column].isna()]
-            if not train_data.empty and not test_data.empty:
-                regressor = LinearRegression()
-                regressor.fit(train_data.drop(column, axis=1), train_data[column])
-                data.loc[data[column].isna(), column] = regressor.predict(test_data.drop(column, axis=1))
+    """Fill missing values column by column using linear regression.
+
+    Only columns without any missing value are used as predictors. A column
+    is left untouched when more than half of its rows are missing. ``data``
+    is modified in place and also returned.
+    """
+    missing_columns = data.columns[data.isnull().any()].tolist()
+
+    for col in missing_columns:
+        is_missing = data[col].isnull()
+        missing_data = data[is_missing]
+        complete_data = data[~is_missing]
+        if missing_data.empty or complete_data.empty:
+            continue
+        # Skip columns where more than half of the rows are missing.
+        if missing_data.shape[0] > 0.5 * data.shape[0]:
+            continue
+        X_complete = complete_data.drop(columns=missing_columns)
+        y_complete = complete_data[col]
+        X_missing = missing_data.drop(columns=missing_columns)
+        model = LinearRegression()
+        model.fit(X_complete, y_complete)
+        data.loc[is_missing, col] = model.predict(X_missing)
+
     return data
 
 
@@ -65,5 +81,4 @@ def handle_missing_values(data, method, n_neighbors=5):
     elif method == 'regression':
         return impute_with_regression(data)
     else:
         raise ValueError("Unknown method")
-