comments and clean code

master
luevard 1 year ago
parent ed2d749ed6
commit 4195f98de9

@ -5,7 +5,7 @@ import seaborn as sns
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz from sklearn.tree import export_graphviz
from io import StringIO from io import StringIO
from IPython.display import Image from IPython.display import Image
from sklearn.tree import plot_tree from sklearn.tree import plot_tree
import pydotplus import pydotplus
from IPython.display import Image from IPython.display import Image
@ -21,72 +21,94 @@ from sklearn.impute import SimpleImputer
pd.options.display.max_columns = None pd.options.display.max_columns = None
pd.options.display.max_rows = None pd.options.display.max_rows = None
import sklearn import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))
def displayNumberOfNaNValues(df):
# Create an empty list to store tuples of column index and number of NaN values
na = []
# Loop through each column in the DataFrame
for index, col in enumerate(df):
# Count the number of NaN values in each column and append the index and count to 'na'
na.append((index, df[col].isna().sum()))
# Make a copy of 'na' and sort it based on the count of NaN values in descending order
na.sort(key=lambda x: x[1], reverse=True)
# Iterate through the sorted list of columns
for i in range(len(df.columns)):
# Check if the count of NaN values for the current column is not zero
if na[i][1] != 0:
# Print the column name, count of NaN values, and "NaN"
print(df.columns[na[i][0]], ":", na[i][1], "NaN")
# Calculate and print the total number of features with NaN values
print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
print("Total NaN in dataframe :" , df.isna().sum().sum())
df = pd.read_csv('archive/data.csv') df = pd.read_csv('archive/data.csv')
b_age = df['B_age'] # we replace B_age to put it among B features # Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.).
df.drop(['B_age'], axis = 1, inplace = True) #It was only from this date that the UFC started to implement a set of rules known as
df.insert(76, "B_age", b_age) #"Unified Rules of Mixed Martial Arts".
#Therefore, we delete all fights before this major update in UFC's rules history.
df_fe = df.copy() # We make a copy of the dataframe for the feature engineering part later # Using this old data would not be representative of current fights, especially since this
#print(df.head(5)) #sport has become one of the most regulated due to its mixity and complexity.
limit_date = '2001-04-01' limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)] df = df[(df['date'] > limit_date)]
# print("Total NaN in dataframe :" , df.isna().sum().sum()) # Display NaN values
# print("Total NaN in each column of the dataframe") displayNumberOfNaNValues(df)
na = []
for index, col in enumerate(df):
na.append((index, df[col].isna().sum()))
na_sorted = na.copy()
na_sorted.sort(key = lambda x: x[1], reverse = True)
# for i in range(len(df.columns)):
# print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")
# Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
# Initialize a SimpleImputer to impute missing values with median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median') imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features: for feature in imp_features:
# Fit and transform the feature using median imputation
imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
# Assign the imputed values back to the DataFrame
df[feature] = imp_feature df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
# Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
df_R_stance_imputed = pd.DataFrame(imp_R_stance, columns=['R_Stance']) # Create DataFrames for imputed stances
df_B_stance_imputed = pd.DataFrame(imp_B_stance, columns=['B_Stance']) df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
# Assign the imputed values to the original DataFrame
df['R_Stance'] = df_R_stance_imputed['R_Stance']
df['B_Stance'] = df_B_stance_imputed['B_Stance']
print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0])) # drop B_avg_BODY_att values in the dataframe
# List of features with NaN values to drop
#na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] # Drop rows with NaN values in specified features
df.dropna(subset = na_features, inplace = True) #df.dropna(subset=na_features, inplace=True)
df.drop(['Referee', 'location'], axis = 1, inplace = True) # Drop columns 'Referee' and 'location' from the DataFrame
# The referee and the location have little impact on the outcome of a fight, so these columns are not worth keeping
# print(df.shape) df.drop(['Referee', 'location'], axis=1, inplace=True)
# print("Total NaN in dataframe :" , df.isna().sum().sum())
# Drop the 'B_draw' and 'R_draw' columns, then remove fights that ended in a 'Draw' and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw'] df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight'] df = df[df['weight_class'] != 'Catch Weight']
# Supprimez les colonnes non numériques # Remove column when data type is not float or int
df_numeric = df.select_dtypes(include=['float64', 'int64']) dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
# Tracez la matrice de corrélation
plt.figure(figsize=(50, 40)) plt.figure(figsize=(50, 40))
corr_matrix = df_numeric.corr(method='pearson').abs() corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True) sns.heatmap(corr_matrix, annot=True)
# Show the correlation matrix of the dataframe
# Rendering this figure is very slow, so the plt.show() call is left commented out
# plt.show() # plt.show()
# i = index of the fighter's fight, 0 means the last fight, -1 means first fight # i = index of the fighter's fight, 0 means the last fight, -1 means first fight
@ -99,9 +121,8 @@ def select_fight_row(df, name, i):
arr = df_temp.iloc[i,:].values arr = df_temp.iloc[i,:].values
return arr return arr
# we get the last fight of Khabib :'(
# print(select_fight_row(df, 'Amanda Nunes', 0)) print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
# we get the last fight of Amanda Nunes
# get all active UFC fighters (according to the limit_date parameter) # get all active UFC fighters (according to the limit_date parameter)
@ -110,10 +131,11 @@ def list_fighters(df, limit_date):
set_R = set(df_temp['R_fighter']) set_R = set(df_temp['R_fighter'])
set_B = set(df_temp['B_fighter']) set_B = set(df_temp['B_fighter'])
fighters = list(set_R.union(set_B)) fighters = list(set_R.union(set_B))
print("Number of fighter: "+str(len(fighters)))
return fighters return fighters
fighters = list_fighters(df, '2017-01-01') # Last year in which the fight data was not complete and correct
print(len(fighters)) fighters = list_fighters(df,'2016-01-01')
def build_df(df, fighters, i): def build_df(df, fighters, i):
arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None] arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@ -197,43 +219,52 @@ tree_estimator = model['random_forest'].estimators_[10]
# plt.show() # plt.show()
def _corner_stats(df, fighter, corner):
    """Return one fighter's stats from their first matching fight, under ``corner``.

    Args:
        df: fights DataFrame with 'R_fighter' / 'B_fighter' columns and
            per-corner statistics in columns starting with 'R' or 'B'.
        fighter: fighter name to look up in either corner.
        corner: 'R' or 'B', the prefix the returned columns must carry.

    Returns:
        A one-row DataFrame holding only that fighter's columns, renamed to
        the requested corner prefix, or None when the fighter appears in no
        row of ``df``.
    """
    fights = df[(df['R_fighter'] == fighter) | (df['B_fighter'] == fighter)]
    if fights.empty:
        return None
    # Keep the first matching row only.
    # NOTE(review): this is the fighter's most recent fight only if df is
    # sorted newest-first - confirm against the data file.
    first = fights.iloc[:1].reset_index(drop=True)
    # Corner the fighter occupied in that row; only that side's columns are kept.
    fought_as = 'R' if first.loc[0, 'R_fighter'] == fighter else 'B'
    stats = first.filter(regex='^' + fought_as, axis=1).copy()
    if fought_as != corner:
        # Re-label the columns so they match the corner requested by the caller.
        stats.rename(columns=lambda name: re.sub('^' + fought_as, corner, name), inplace=True)
    return stats


def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    """Predict the winner of a bout between two fighters and print the result.

    A single-row feature frame is built from each fighter's first matching
    row in ``df`` (blue corner columns first, then red corner columns), the
    bout metadata is inserted in front, and the frame is passed to
    ``pipeline``.

    Args:
        df: fights DataFrame (see ``_corner_stats`` for the expected columns).
        pipeline: fitted estimator exposing ``predict`` and ``predict_proba``.
        blue_fighter: name of the fighter placed in the blue corner.
        red_fighter: name of the fighter placed in the red corner.
        weightclass: value stored in the 'weight_class' feature.
        rounds: value stored in the 'no_of_rounds' feature.
        title_bout: whether the bout is a title bout (encoded as 1 / 0).

    Returns:
        The array returned by ``pipeline.predict_proba`` for the bout, or
        None when one of the fighters is not present in ``df``.

    Raises:
        Any exception raised while building the features or by ``pipeline``
        propagates to the caller; only an unknown fighter is reported with a
        message and a None return value.
    """
    # We build two dataframes, one for each fighter
    blue_stats = _corner_stats(df, blue_fighter, 'B')
    red_stats = _corner_stats(df, red_fighter, 'R')
    if blue_stats is None or red_stats is None:
        print("One of fighter doesn't exist in the dataframe")
        return None

    # we concatenate the blue and red fighter dataframes (in columns)
    fight = pd.concat([blue_stats, red_stats], axis=1)
    # we remove fighter names
    fight.drop(['R_fighter', 'B_fighter'], axis=1, inplace=True)
    # we add title_bout, weight class and number of rounds data to the dataframe
    fight.insert(0, 'title_bout', title_bout)
    fight.insert(1, 'weight_class', weightclass)
    fight.insert(2, 'no_of_rounds', rounds)
    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})

    pred = pipeline.predict(fight)
    proba = pipeline.predict_proba(fight)
    # A prediction of 1.0 is reported as a red corner win, anything else as blue.
    if pred[0] == 1.0:
        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
    else:
        print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
    return proba
predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)

Loading…
Cancel
Save