From 4195f98de9a6b2bb3ace31f1f82728c48d620414 Mon Sep 17 00:00:00 2001 From: luevard <99143550+saucepommefrite@users.noreply.github.com> Date: Fri, 9 Feb 2024 09:49:49 +0100 Subject: [PATCH] :sparkles: comments and clean code --- test.py | 181 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 106 insertions(+), 75 deletions(-) diff --git a/test.py b/test.py index f0f2503..8602b5d 100644 --- a/test.py +++ b/test.py @@ -5,7 +5,7 @@ import seaborn as sns import matplotlib.pyplot as plt from sklearn.tree import export_graphviz from io import StringIO -from IPython.display import Image +from IPython.display import Image from sklearn.tree import plot_tree import pydotplus from IPython.display import Image @@ -21,72 +21,94 @@ from sklearn.impute import SimpleImputer pd.options.display.max_columns = None pd.options.display.max_rows = None import sklearn -print('The scikit-learn version is {}.'.format(sklearn.__version__)) + +def displayNumberOfNaNValues(df): + # Create an empty list to store tuples of column index and number of NaN values + na = [] + # Loop through each column in the DataFrame + for index, col in enumerate(df): + # Count the number of NaN values in each column and append the index and count to 'na' + na.append((index, df[col].isna().sum())) + # Make a copy of 'na' and sort it based on the count of NaN values in descending order + na.sort(key=lambda x: x[1], reverse=True) + # Iterate through the sorted list of columns + for i in range(len(df.columns)): + # Check if the count of NaN values for the current column is not zero + if na[i][1] != 0: + # Print the column name, count of NaN values, and "NaN" + print(df.columns[na[i][0]], ":", na[i][1], "NaN") + # Calculate and print the total number of features with NaN values + print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0])) + print("Total NaN in dataframe :" , df.isna().sum().sum()) + df = pd.read_csv('archive/data.csv') -b_age = df['B_age'] # we replace B_age to put it among B features -df.drop(['B_age'], axis = 1, inplace = True) -df.insert(76, "B_age", b_age) +# Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.). +#It's up to this precise date that UFC started to implement a set of rules known as +#"Unified Rules of Mixed Martial Arts". +#Therefore, we delete all fights before this major update in UFC's rules history. -df_fe = df.copy() # We make a copy of the dataframe for the feature engineering part later -#print(df.head(5)) +# Using this old data would not be representative of current fights, especially since this +#sport has become one of the most regulated due to its mixity and complexity. limit_date = '2001-04-01' df = df[(df['date'] > limit_date)] -# print("Total NaN in dataframe :" , df.isna().sum().sum()) -# print("Total NaN in each column of the dataframe") -na = [] -for index, col in enumerate(df): - na.append((index, df[col].isna().sum())) -na_sorted = na.copy() -na_sorted.sort(key = lambda x: x[1], reverse = True) - -# for i in range(len(df.columns)): -# print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN") +# Display NaN values +displayNumberOfNaNValues(df) +# Define the list of important features to impute imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] + +# Initialize a SimpleImputer to impute missing values with median imp_median = SimpleImputer(missing_values=np.nan, strategy='median') +# Iterate over each feature to impute missing values for feature in imp_features: + # Fit and transform the feature using median imputation imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) + # Assign the imputed values back to the DataFrame df[feature] = imp_feature +# Impute missing values for 'R_Stance' using most frequent strategy imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) +# Impute missing values for 'B_Stance' using most frequent strategy imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) -df_R_stance_imputed = pd.DataFrame(imp_R_stance, columns=['R_Stance']) -df_B_stance_imputed = pd.DataFrame(imp_B_stance, columns=['B_Stance']) - -# Assign the imputed values to the original DataFrame -df['R_Stance'] = df_R_stance_imputed['R_Stance'] -df['B_Stance'] = df_B_stance_imputed['B_Stance'] +# Create DataFrames for imputed stances +df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance']) +df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance']) -print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0])) +# drop B_avg_BODY_att values in the dataframe + # List of features with NaN values to drop + #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] -na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] -df.dropna(subset = na_features, inplace = True) + # Drop rows with NaN values in specified features + #df.dropna(subset=na_features, inplace=True) -df.drop(['Referee', 'location'], axis = 1, inplace = True) - -# print(df.shape) -# print("Total NaN in dataframe :" , df.isna().sum().sum()) +# Drop columns 'Referee' and 'location' from the DataFrame +# The value of references and location has a low impact in battles, which makes it irrelevant to keep +df.drop(['Referee', 'location'], axis=1, inplace=True) +# Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) df = df[df['Winner'] != 'Draw'] df = df[df['weight_class'] != 'Catch Weight'] -# Supprimez les colonnes non numériques -df_numeric = df.select_dtypes(include=['float64', 'int64']) +# Remove column when data type is not float or int +dfWithoutString = df.select_dtypes(include=['float64', 'int64']) -# Tracez la matrice de corrélation plt.figure(figsize=(50, 40)) -corr_matrix = df_numeric.corr(method='pearson').abs() +corr_matrix = dfWithoutString.corr(method='pearson').abs() sns.heatmap(corr_matrix, annot=True) + +# Show the correlation matrix of the dataframe +# Very laggy feature + # plt.show() # i = index of the fighter's fight, 0 means the last fight, -1 means first fight @@ -99,9 +121,8 @@ def select_fight_row(df, name, i): arr = df_temp.iloc[i,:].values return arr - -# print(select_fight_row(df, 'Amanda Nunes', 0)) -# we get the last fight of Amanda Nunes +# we get the last fight of Khabib :'( +print(select_fight_row(df, 'Khabib Nurmagomedov', 0)) # get all active UFC fighters (according to the limit_date parameter) @@ -110,10 +131,11 @@ def list_fighters(df, limit_date): set_R = set(df_temp['R_fighter']) set_B = set(df_temp['B_fighter']) fighters = list(set_R.union(set_B)) + print("Number of fighter: "+str(len(fighters))) return fighters -fighters = list_fighters(df, '2017-01-01') -print(len(fighters)) +# Last year when data fight was not full and correct +fighters = list_fighters(df,'2016-01-01') def build_df(df, fighters, i): arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None] @@ -197,43 +219,52 @@ tree_estimator = model['random_forest'].estimators_[10] # plt.show() def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False): - - #We build two dataframes, one for each figther - f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy() - f1.reset_index(drop=True, inplace=True) - f1 = f1[:1] - f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy() - f2.reset_index(drop=True, inplace=True) - f2 = f2[:1] - - # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter) - # then we rename columns according to the color of the corner in the parameters using re.sub() - if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter: - result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats - result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True) #we rename it with "B_" prefix because he's in the blue_corner - else: - result1 = f1.filter(regex='^B', axis=1).copy() - if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter: - result2 = f2.filter(regex='^R', axis=1).copy() - else: - result2 = f2.filter(regex='^B', axis=1).copy() - result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True) + try: + #We build two dataframes, one for each figther + f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy() + f1.reset_index(drop=True, inplace=True) + f1 = f1[:1] + f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy() + f2.reset_index(drop=True, inplace=True) + f2 = f2[:1] + + # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter) + # then we rename columns according to the color of the corner in the parameters using re.sub() + if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter: + result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats + result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True) #we rename it with "B_" prefix because he's in the blue_corner + else: + result1 = f1.filter(regex='^B', axis=1).copy() + if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter: + result2 = f2.filter(regex='^R', axis=1).copy() + else: + result2 = f2.filter(regex='^B', axis=1).copy() + result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True) + + fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns) + fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names + fight.insert(0, 'title_bout', title_bout) # we add tittle_bout, weight class and number of rounds data to the dataframe + fight.insert(1, 'weight_class', weightclass) + fight.insert(2, 'no_of_rounds', rounds) + fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0}) - fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns) - fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names - fight.insert(0, 'title_bout', title_bout) # we add tittle_bout, weight class and number of rounds data to the dataframe - fight.insert(1, 'weight_class', weightclass) - fight.insert(2, 'no_of_rounds', rounds) - fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0}) - - pred = pipeline.predict(fight) - proba = pipeline.predict_proba(fight) - if (pred == 1.0): - print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%") - else: - print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%") - return proba + pred = pipeline.predict(fight) + proba = pipeline.predict_proba(fight) + if (pred == 1.0): + print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%") + else: + print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%") + return proba + except: + print("One of fighter doesn't exist in the dataframe") + return + predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) -predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) \ No newline at end of file +predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) +predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True) + +predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True) +predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True) +predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)