You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
1.7 KiB
68 lines
1.7 KiB
import pandas as pd
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
<<<<<<< HEAD
|
|
def return_csv(path):
    """Read the CSV located at ``path`` and return it as a pandas DataFrame.

    ``path`` is passed to ``pd.read_csv`` unchanged, with no extra options.
    """
    return pd.read_csv(path)
|
|
|
|
def csv_value(df):
    """Print a quick diagnostic summary of ``df``.

    Two things are written to standard output: the report produced by
    ``DataFrame.info()`` and the number of missing values in each column.
    Nothing is returned.
    """
    # Overall description of the frame.
    df.info()

    # Number of missing values for each column.
    missing_per_column = df.isna().sum()
    print(missing_per_column)

    # Useless values: no code is implemented for this step yet.
|
|
|
|
def csv_check(df):
    """Print the distinct values found in every column of ``df``.

    For each column, the column name is printed between two 12-dash
    separator lines, followed by the result of ``Series.unique()`` for
    that column. Nothing is returned.
    """
    separator = "-" * 12
    for column in df:
        print(separator)
        print(column)
        print(separator)
        distinct_values = df[column].unique()
        print(distinct_values)
|
|
|
|
def do_for_columns(df, function=None):
    """Replace every column of ``df`` with ``function(column)``, in place.

    The original body called a bare name ``function`` that was neither a
    parameter nor defined in the visible part of the module, so any call on
    a non-empty frame failed with ``NameError``. The transformation is now an
    explicit argument.

    Args:
        df: DataFrame whose columns are transformed in place.
        function: Callable that receives one column (a Series) and returns
            the values to store back under the same column name.

    Raises:
        TypeError: If ``function`` is missing or not callable.
    """
    if not callable(function):
        raise TypeError(
            "do_for_columns() requires a callable 'function' argument"
        )
    for col_name in df:
        df[col_name] = function(df[col_name])
|
|
|
|
|
|
def csv_norm_min_max(df, col):
    """Min-max scale column ``col`` of ``df`` in place and return it.

    Each value is mapped to ``(value - min) / (max - min)``, so a column
    with distinct values ends up in the range [0, 1].

    A column whose range is zero (all values equal) or undefined (no valid
    values) is only shifted by its minimum instead of being divided by
    zero, so a constant column becomes all zeros rather than all NaN.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to normalize.

    Returns:
        The normalized column, read back from ``df``.
    """
    # Descriptive names: the original locals shadowed the builtins min/max.
    col_min = df[col].min()
    col_max = df[col].max()
    value_range = col_max - col_min

    # Degenerate range: divide by 1 so the result is the shifted column.
    if pd.isna(value_range) or value_range == 0:
        value_range = 1.0

    df[col] = (df[col] - col_min) / value_range
    return df[col]
|
|
|
|
def csv_standardisation_Z(df, col):
    """Z-score standardize column ``col`` of ``df`` in place and return it.

    Each value is mapped to ``(value - mean) / std``, using the column's
    ``Series.mean()`` and ``Series.std()``.

    When the standard deviation is zero (constant column) or undefined
    (for example a single-row column), the column is only centered on its
    mean instead of being divided by zero/NaN, so it becomes all zeros
    rather than all NaN.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to standardize.

    Returns:
        The standardized column, read back from ``df``.
    """
    col_mean = df[col].mean()
    col_std = df[col].std()

    # Degenerate spread: divide by 1 so the result is the centered column.
    if pd.isna(col_std) or col_std == 0:
        col_std = 1.0

    df[col] = (df[col] - col_mean) / col_std
    return df[col]
|
|
|
|
def csv_robust_normalize(df, col):
    """Robust-scale column ``col`` of ``df`` in place and return the result.

    Each value is mapped to ``(value - median) / IQR``, where the IQR is the
    difference between the 0.75 and 0.25 quantiles of the column.

    When the IQR is zero (e.g. a constant column) or undefined, the column
    is only centered on its median instead of being divided by zero, which
    would otherwise produce NaN or infinite values.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col: Label of the column to normalize.

    Returns:
        The normalized values as a Series (the same values stored in
        ``df[col]``).
    """
    # Compute the median and the interquartile range (IQR).
    median = df[col].median()
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1

    # Degenerate spread: divide by 1 so the result is the centered column.
    if pd.isna(iqr) or iqr == 0:
        iqr = 1.0

    # Apply the robust normalization.
    normalized_column = (df[col] - median) / iqr
    df[col] = normalized_column
    return normalized_column
|
|
|
|
def handle_normalization(df, norm_method):
    """Normalize every column of ``df`` with the chosen method.

    Args:
        df: DataFrame to normalize; its columns are overwritten in place.
        norm_method: One of ``"min-max"``, ``"z-score"`` or ``"robust"``.

    Returns:
        The same ``df`` object, after every column has been replaced by the
        value returned from the matching per-column normalizer.

    Raises:
        ValueError: If ``norm_method`` is not one of the names above.
    """
    # The lambdas defer the helper lookup until a column is actually
    # processed, exactly as the explicit branches did.
    available = (
        ("min-max", lambda name: csv_norm_min_max(df, name)),
        ("z-score", lambda name: csv_standardisation_Z(df, name)),
        ("robust", lambda name: csv_robust_normalize(df, name)),
    )
    normalize = next(
        (func for label, func in available if norm_method == label),
        None,
    )
    if normalize is None:
        raise ValueError("Unknown method")

    for col_name in df:
        df[col_name] = normalize(col_name)
    return df
|