From 4195f98de9a6b2bb3ace31f1f82728c48d620414 Mon Sep 17 00:00:00 2001
From: luevard <99143550+saucepommefrite@users.noreply.github.com>
Date: Fri, 9 Feb 2024 09:49:49 +0100
Subject: [PATCH] :sparkles: comments and clean code

---
 test.py | 181 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 106 insertions(+), 75 deletions(-)

diff --git a/test.py b/test.py
index f0f2503..8602b5d 100644
--- a/test.py
+++ b/test.py
@@ -5,7 +5,7 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.tree import export_graphviz
 from io import StringIO 
-from IPython.display import Image  
+from IPython.display import Image
 from sklearn.tree import plot_tree
 import pydotplus
 from IPython.display import Image
@@ -21,72 +21,94 @@ from sklearn.impute import SimpleImputer
 pd.options.display.max_columns = None
 pd.options.display.max_rows = None
 import sklearn
-print('The scikit-learn version is {}.'.format(sklearn.__version__))
+
+def displayNumberOfNaNValues(df):
+    """Print each column of df that contains NaN values, then two summary totals.
+
+    Columns are listed from the highest NaN count to the lowest.
+    """
+    # Pair every column name with its NaN count
+    na = [(col, df[col].isna().sum()) for col in df]
+    # Sort in place by NaN count, descending (list.sort is stable, so columns
+    # with equal counts keep their DataFrame order)
+    na.sort(key=lambda x: x[1], reverse=True)
+    # Keep only the columns that actually contain NaN values
+    with_nan = [(col, count) for col, count in na if count > 0]
+    for col, count in with_nan:
+        print(col, ":", count, "NaN")
+    # Number of features with at least one NaN value
+    print('Number of features with NaN values:', len(with_nan))
+    # Total NaN count over the whole DataFrame
+    print("Total NaN in dataframe :" , df.isna().sum().sum())
+
 
 df = pd.read_csv('archive/data.csv')
 
-b_age = df['B_age']  #  we replace B_age to put it among B features 
-df.drop(['B_age'], axis = 1, inplace = True)
-df.insert(76, "B_age", b_age)
+# Before April 2001, there were almost no rules in the UFC (no judges, no time limits, no rounds, etc.).
+# From that date on, the UFC started to apply a set of rules known as the
+# "Unified Rules of Mixed Martial Arts".
+# Therefore, we discard all fights that took place before this major rule change.
 
-df_fe = df.copy() #  We make a copy of the dataframe for the feature engineering part later
-#print(df.head(5))
+# This old data would not be representative of current fights, especially since this
+# sport has become one of the most regulated, owing to its diversity and complexity.
 
 limit_date = '2001-04-01'
 df = df[(df['date'] > limit_date)]
 
-# print("Total NaN in dataframe :" , df.isna().sum().sum())
-# print("Total NaN in each column of the dataframe")
-na = []
-for index, col in enumerate(df):
-    na.append((index, df[col].isna().sum())) 
-na_sorted = na.copy()
-na_sorted.sort(key = lambda x: x[1], reverse = True) 
-
-# for i in range(len(df.columns)):
-#     print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")
+# Display NaN values
+displayNumberOfNaNValues(df)
 
+# Define the list of important features to impute
 imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
+
+# Initialize a SimpleImputer to impute missing values with median
 imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
 
+# Iterate over each feature to impute missing values
 for feature in imp_features:
+    # Fit and transform the feature using median imputation
     imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
+    # Assign the imputed values back to the DataFrame
     df[feature] = imp_feature
 
+# Impute missing values for 'R_Stance' using most frequent strategy
 imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
 imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
 
+# Impute missing values for 'B_Stance' using most frequent strategy
 imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
 imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
 
-df_R_stance_imputed = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
-df_B_stance_imputed = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
-
-# Assign the imputed values to the original DataFrame
-df['R_Stance'] = df_R_stance_imputed['R_Stance']
-df['B_Stance'] = df_B_stance_imputed['B_Stance']
+# Assign the imputed stances positionally (1-D arrays) so the filtered df index cannot misalign them
+df['R_Stance'] = imp_R_stance.ravel()
+df['B_Stance'] = imp_B_stance.ravel()
 
-print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0]))
+# Disabled: drop rows with NaN in B_avg_BODY_att / R_avg_BODY_att (code kept commented out below)
+    # List of features with NaN values to drop
+    #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
 
-na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
-df.dropna(subset = na_features, inplace = True)
+    # Drop rows with NaN values in specified features
+    #df.dropna(subset=na_features, inplace=True)
 
-df.drop(['Referee', 'location'], axis = 1, inplace = True)
-
-# print(df.shape)
-# print("Total NaN in dataframe :" , df.isna().sum().sum())
+# Drop columns 'Referee' and 'location' from the DataFrame
+# The referee and the location have little impact on a fight, so these columns are not worth keeping
+df.drop(['Referee', 'location'], axis=1, inplace=True)
 
+# Drop the 'B_draw' and 'R_draw' columns, then remove drawn fights and 'Catch Weight' fights
 df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
 df = df[df['Winner'] != 'Draw']
 df = df[df['weight_class'] != 'Catch Weight']
 
-# Supprimez les colonnes non numériques
-df_numeric = df.select_dtypes(include=['float64', 'int64'])
+# Keep only the float64 and int64 columns
+dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
 
-# Tracez la matrice de corrélation
 plt.figure(figsize=(50, 40))
-corr_matrix = df_numeric.corr(method='pearson').abs()
+corr_matrix = dfWithoutString.corr(method='pearson').abs()
 sns.heatmap(corr_matrix, annot=True)
+
+# Uncomment the line below to show the correlation matrix of the dataframe
+# (rendering it is very slow)
+
 # plt.show()
 
 #  i = index of the fighter's fight, 0 means the last fight, -1 means first fight
@@ -99,9 +121,8 @@ def select_fight_row(df, name, i):
     arr = df_temp.iloc[i,:].values
     return arr
     
-
-# print(select_fight_row(df, 'Amanda Nunes', 0))
-#  we get the last fight of Amanda Nunes
+#  we get the last fight of Khabib :'(
+print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
 
 
 # get all active UFC fighters (according to the limit_date parameter)
@@ -110,10 +131,11 @@ def list_fighters(df, limit_date):
     set_R = set(df_temp['R_fighter'])
     set_B = set(df_temp['B_fighter'])
     fighters = list(set_R.union(set_B))
+    print("Number of fighter: "+str(len(fighters)))
     return fighters
 
-fighters = list_fighters(df, '2017-01-01')
-print(len(fighters))
+# Last year in which the fight data was not complete and correct
+fighters = list_fighters(df,'2016-01-01')
 
 def build_df(df, fighters, i):      
     arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@@ -197,43 +219,52 @@ tree_estimator = model['random_forest'].estimators_[10]
 # plt.show()
 
 def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False): 
-    
-    #We build two dataframes, one for each figther 
-    f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
-    f1.reset_index(drop=True, inplace=True)
-    f1 = f1[:1]
-    f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy()
-    f2.reset_index(drop=True, inplace=True)
-    f2 = f2[:1]
-    
-    # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter)
-    # then we rename columns according to the color of  the corner in the parameters using re.sub()
-    if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter:
-        result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats
-        result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True)  #we rename it with "B_" prefix because he's in the blue_corner
-    else: 
-        result1 = f1.filter(regex='^B', axis=1).copy()
-    if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter:
-        result2 = f2.filter(regex='^R', axis=1).copy()
-    else:
-        result2 = f2.filter(regex='^B', axis=1).copy()
-        result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True)
+    """Print the predicted winner of blue_fighter vs red_fighter and return pipeline.predict_proba's output.
+    Uses the first row of df involving each fighter; returns None when a fighter has no row in df."""
+    try:
+        # We build two dataframes, one for each fighter
+        f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
+        f1.reset_index(drop=True, inplace=True)
+        f1 = f1[:1]
+        f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy()
+        f2.reset_index(drop=True, inplace=True)
+        f2 = f2[:1]
+        # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter)
+        # then we rename columns according to the color of the corner in the parameters using re.sub()
+        if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter:
+            result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats
+            result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True)  #we rename it with "B_" prefix because he's in the blue_corner
+        else:
+            result1 = f1.filter(regex='^B', axis=1).copy()
+        if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter:
+            result2 = f2.filter(regex='^R', axis=1).copy()
+        else:
+            result2 = f2.filter(regex='^B', axis=1).copy()
+            result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True)
+        fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns)
+        fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names
+        fight.insert(0, 'title_bout', title_bout) # we add title_bout, weight class and number of rounds data to the dataframe
+        fight.insert(1, 'weight_class', weightclass)
+        fight.insert(2, 'no_of_rounds', rounds)
+        fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})
         
-    fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns)
-    fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names
-    fight.insert(0, 'title_bout', title_bout) # we add tittle_bout, weight class and number of rounds data to the dataframe
-    fight.insert(1, 'weight_class', weightclass)
-    fight.insert(2, 'no_of_rounds', rounds)
-    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})
-    
-    pred = pipeline.predict(fight)
-    proba = pipeline.predict_proba(fight)
-    if (pred == 1.0): 
-        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
-    else:
-        print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
-    return proba
+        pred = pipeline.predict(fight)
+        proba = pipeline.predict_proba(fight)
+        if (pred == 1.0):
+            print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
+        else:
+            print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
+        return proba
+    except (KeyError, IndexError):  # f1/f2 is empty (no row 0) when a fighter has no row in df
+        print("One of fighter doesn't exist in the dataframe")
+        return None
+
 
 predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) 
 predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
-predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
\ No newline at end of file
+predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
+predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
+
+predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
+predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
+predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)