Mean, median, and other functions affect all data and not only one column; removing useless columns

managing_missing_values
Dorian HODIN 1 year ago
parent 267a2b8013
commit 022437dad8

@ -19,27 +19,21 @@ def drop_high_null_percentage(data, threshold=0.5):
return data return data
def replace_with_mean(data):
return data.apply(lambda col: col.fillna(col.mean()) if col.dtype.kind in 'biufc' else col)
def replace_with_mean(data, column): def replace_with_median(data):
data[column] = data[column].fillna(data[column].mean()) return data.apply(lambda col: col.fillna(col.median()) if col.dtype.kind in 'biufc' else col)
return data
def replace_with_median(data, column): def replace_with_mode(data):
data[column] = data[column].fillna(data[column].median()) return data.apply(lambda col: col.fillna(col.mode()[0]) if col.mode().size > 0 else col)
return data
def replace_with_mode(data, column): def impute_with_knn(data, n_neighbors=5):
mode_value = data[column].mode()
if not mode_value.empty:
data[column] = data[column].fillna(mode_value[0])
return data
def impute_with_knn(data, column, n_neighbors=5):
imputer = KNNImputer(n_neighbors=n_neighbors) imputer = KNNImputer(n_neighbors=n_neighbors)
data[[column]] = imputer.fit_transform(data[[column]]) return pd.DataFrame(imputer.fit_transform(data), columns=data.columns)
return data
def impute_with_regression(data, column): def impute_with_regression(data):
for column in data.columns:
if data[column].isnull().sum() > 0: if data[column].isnull().sum() > 0:
train_data = data[data[column].notna()] train_data = data[data[column].notna()]
test_data = data[data[column].isna()] test_data = data[data[column].isna()]
@ -56,28 +50,20 @@ def impute_with_regression(data, column):
- method: Method to handle missing values ('drop', 'mean', 'median', 'mode', 'knn', 'regression') - method: Method to handle missing values ('drop', 'mean', 'median', 'mode', 'knn', 'regression')
- n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn') - n_neighbors: Number of neighbors to use for KNN imputation (only used if method='knn')
""" """
def handle_missing_values(data, method, column, n_neighbors=5): def handle_missing_values(data, method, n_neighbors=5):
data = drop_high_null_percentage(data) data = drop_high_null_percentage(data)
data = convert_categorical_to_numeric(data) data = convert_categorical_to_numeric(data)
if method == 'mean': if method == 'mean':
return replace_with_mean(data, column) return replace_with_mean(data)
elif method == 'median': elif method == 'median':
return replace_with_median(data, column) return replace_with_median(data)
elif method == 'mode': elif method == 'mode':
return replace_with_mode(data, column) return replace_with_mode(data)
elif method == 'knn': elif method == 'knn':
return impute_with_knn(data, column, n_neighbors=n_neighbors) return impute_with_knn(data, n_neighbors=n_neighbors)
elif method == 'regression': elif method == 'regression':
return impute_with_regression(data, column) return impute_with_regression(data)
elif method == 'drop_high_null':
return drop_high_null_percentage(data)
else: else:
raise ValueError("Unknown method") raise ValueError("Unknown method")
data = l.return_csv('./data.csv')
cleaned_data = handle_missing_values(data, method='mode', column='Route Type')
print(cleaned_data)

Loading…
Cancel
Save