comments and clean code

master
luevard 1 year ago
parent ed2d749ed6
commit 4195f98de9

@@ -21,72 +21,94 @@ from sklearn.impute import SimpleImputer
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
def displayNumberOfNaNValues(df):
    # Create an empty list to store tuples of (column index, number of NaN values)
    na = []
    # Loop through each column in the DataFrame
    for index, col in enumerate(df):
        # Count the NaN values in the column and append (index, count) to 'na'
        na.append((index, df[col].isna().sum()))
    # Sort 'na' by the count of NaN values in descending order
    na.sort(key=lambda x: x[1], reverse=True)
    # Iterate through the sorted list of columns
    for i in range(len(df.columns)):
        # Only report columns that actually contain NaN values
        if na[i][1] != 0:
            # Print the column name and its NaN count
            print(df.columns[na[i][0]], ":", na[i][1], "NaN")
    # Print the total number of features with NaN values
    print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
    print("Total NaN in dataframe :", df.isna().sum().sum())
df = pd.read_csv('archive/data.csv')
b_age = df['B_age']  # move B_age so that it sits among the other B_ features
df.drop(['B_age'], axis=1, inplace=True)
df.insert(76, "B_age", b_age)
df_fe = df.copy()  # Keep a copy of the dataframe for the feature engineering part later
# print(df.head(5))
# Before April 2001, there were almost no rules in the UFC (no judges, no time limits, no rounds, etc.).
# It is at that precise date that the UFC started to enforce the set of rules known as the
# "Unified Rules of Mixed Martial Arts".
# We therefore drop all fights that took place before this major update in the UFC's rules history:
# this old data would not be representative of current fights, especially since the sport has
# become one of the most regulated due to its diversity and complexity.
limit_date = '2001-04-01'
df = df[df['date'] > limit_date]
# print("Total NaN in dataframe :" , df.isna().sum().sum())
# print("Total NaN in each column of the dataframe")
na = []
for index, col in enumerate(df):
na.append((index, df[col].isna().sum()))
na_sorted = na.copy()
na_sorted.sort(key = lambda x: x[1], reverse = True)
# for i in range(len(df.columns)):
# print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")
# Display NaN values
displayNumberOfNaNValues(df)
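# Aside (not part of the original script): the same per-column NaN report can be
# reproduced with pandas built-ins, which is a convenient cross-check of the helper above.
nan_counts = df.isna().sum().sort_values(ascending=False)
print(nan_counts[nan_counts > 0])
print('Number of features with NaN values:', int((nan_counts > 0).sum()))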
# Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
# Initialize a SimpleImputer to impute missing values with the median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features:
    # Fit and transform the feature using median imputation
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Assign the imputed values back to the DataFrame
    df[feature] = imp_feature
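# Note (sketch, not in the original code): SimpleImputer accepts 2-D input, so the
# loop above could be replaced by a single call over all the selected columns:
# df[imp_features] = imp_median.fit_transform(df[imp_features])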
# Impute missing values for 'R_Stance' using the most frequent value
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1, 1))
# Impute missing values for 'B_Stance' using the most frequent value
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1, 1))
# Assign the imputed values back to the original DataFrame.
# The flattened arrays are assigned positionally; building a fresh DataFrame first
# would misalign with df's non-contiguous index after the date filter above.
df['R_Stance'] = imp_R_stance.ravel()
df['B_Stance'] = imp_B_stance.ravel()
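# Equivalent pandas-only alternative (sketch, not in the original code): filling each
# stance column with its mode gives the same "most frequent" behaviour without sklearn:
# df['R_Stance'] = df['R_Stance'].fillna(df['R_Stance'].mode()[0])
# df['B_Stance'] = df['B_Stance'].fillna(df['B_Stance'].mode()[0])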
print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0]))
# Drop the rows that still have NaN values in these two features
na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
df.dropna(subset=na_features, inplace=True)
# Drop the 'Referee' and 'location' columns from the DataFrame:
# the referee and the location have little impact on the outcome of a fight, so they are not worth keeping
df.drop(['Referee', 'location'], axis=1, inplace=True)
# print(df.shape)
# print("Total NaN in dataframe :", df.isna().sum().sum())
# Drop the 'B_draw' and 'R_draw' columns, the fights that ended in a draw, and the 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']
# Keep only the numeric columns (drop every column whose dtype is not float or int)
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
# Plot the correlation matrix
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
# Show the correlation matrix of the dataframe
# (rendering the full heatmap is very slow, so plt.show() is left commented out)
# plt.show()
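# Because the full heatmap is too slow and too dense to read, a programmatic summary can
# be more practical (sketch, not in the original code; assumes numpy is imported as np,
# as implied by the use of np.nan above): list the feature pairs whose absolute Pearson
# correlation exceeds a chosen threshold, e.g. 0.9.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
high_corr_pairs = upper.stack().loc[lambda s: s > 0.9].sort_values(ascending=False)
print(high_corr_pairs.head(20))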
# i = index of the fighter's fight: 0 means the most recent fight, -1 means the first fight
@@ -99,9 +121,8 @@ def select_fight_row(df, name, i):
    arr = df_temp.iloc[i, :].values
    return arr
# we get the last fight of Amanda Nunes
# print(select_fight_row(df, 'Amanda Nunes', 0))
# we get the last fight of Khabib :'(
print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
# get all active UFC fighters (according to the limit_date parameter)
@@ -110,10 +131,11 @@ def list_fighters(df, limit_date):
    set_R = set(df_temp['R_fighter'])
    set_B = set(df_temp['B_fighter'])
    fighters = list(set_R.union(set_B))
    print("Number of fighters: " + str(len(fighters)))
    return fighters
# fighters = list_fighters(df, '2017-01-01')
# print(len(fighters))
# Last year for which the fight data was not complete and correct
fighters = list_fighters(df, '2016-01-01')
def build_df(df, fighters, i):
    arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@@ -197,7 +219,7 @@ tree_estimator = model['random_forest'].estimators_[10]
# plt.show()
def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    try:
        # We build two dataframes, one for each fighter
        f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
        f1.reset_index(drop=True, inplace=True)
@@ -233,7 +255,16 @@ def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_
        else:
            print("The predicted winner is", blue_fighter, 'with a probability of', round(proba[0][0] * 100, 2), "%")
        return proba
    except Exception:
        print("One of the fighters doesn't exist in the dataframe")
        return
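# Sketch (not part of the original code): rather than relying on the broad except above,
# a fighter's presence in the data could be checked explicitly before calling predict.
def fighter_exists(df, name):
    # True if the name appears in either corner of at least one recorded fight
    return bool(((df['R_fighter'] == name) | (df['B_fighter'] == name)).any())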
predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
predict(df, model, 'Charles Oliveira', 'Conor McGregor', 'Lightweight', 5, True)
predict(df, model, 'Charles Oliveira', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
