import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer

pd.options.display.max_columns = None
pd.options.display.max_rows = None


def displayNumberOfNaNValues(df):
    # Collect (column index, NaN count) tuples for every column
    na = []
    for index, col in enumerate(df):
        na.append((index, df[col].isna().sum()))
    # Sort the list by NaN count, in descending order
    na.sort(key=lambda x: x[1], reverse=True)
    # Print every column that contains at least one NaN value
    for i in range(len(df.columns)):
        if na[i][1] != 0:
            print(df.columns[na[i][0]], ":", na[i][1], "NaN")
    # Print the number of features with NaN values and the overall NaN count
    print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
    print("Total NaN in dataframe:", df.isna().sum().sum())
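
# A quick, illustrative sanity check of the helper above (toy data, not the
# UFC dataset); the expected output is shown in the trailing comments:
#demo_df = pd.DataFrame({'a': [1, np.nan, 3],
#                        'b': [np.nan, np.nan, 6],
#                        'c': [7, 8, 9]})
#displayNumberOfNaNValues(demo_df)
# b : 2 NaN
# a : 1 NaN
# Number of features with NaN values: 2
# Total NaN in dataframe: 3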
# Before April 2001, there were almost no rules in the UFC (no judges, no time
# limits, no rounds, etc.). It was on that precise date that the UFC adopted the
# set of rules known as the "Unified Rules of Mixed Martial Arts". We therefore
# delete all fights that took place before this major change in the UFC's rules
# history. The old data would not be representative of current fights, especially
# since the sport has become one of the most regulated, given its diversity and
# complexity.
#limit_date = '2001-04-01'
#df = df[(df['date'] > limit_date)]

# Display NaN values
#displayNumberOfNaNValues(df)

# Define the list of important features to impute
#imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']

# Initialize a SimpleImputer to impute missing values with the median
#imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# Impute each feature's missing values with its median
#for feature in imp_features:
#    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
#    df[feature] = imp_feature

# Impute missing values for 'R_Stance' and 'B_Stance' using the most frequent value
#imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1, 1))
#imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1, 1))

# Assign the imputed stances back to the DataFrame
#df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
#df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])

# Drop rows with NaN values in 'B_avg_BODY_att' and 'R_avg_BODY_att'
#na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
#df.dropna(subset=na_features, inplace=True)

# Drop the 'Referee' and 'location' columns: referees and locations have little
# impact on the outcome of a fight, so they are not worth keeping
#df.drop(['Referee', 'location'], axis=1, inplace=True)

# Drop the 'B_draw' and 'R_draw' columns, 'Draw' fights and 'Catch Weight' fights
#df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
#df = df[df['Winner'] != 'Draw']
#df = df[df['weight_class'] != 'Catch Weight']

# Keep only numeric columns (float or int)
#dfWithoutString = df.select_dtypes(include=['float64', 'int64'])

# Show the correlation matrix of the dataframe (very laggy to render)
#plt.figure(figsize=(50, 40))
#corr_matrix = dfWithoutString.corr(method='pearson').abs()
#sns.heatmap(corr_matrix, annot=True)
#plt.show()
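
# Since rendering the full annotated heatmap is slow, a lighter alternative
# (a sketch reusing the corr_matrix computed above) is to print only the
# strongest correlations, keeping the upper triangle so each pair appears once:
#top_corr = (corr_matrix
#            .where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
#            .stack()
#            .sort_values(ascending=False))
#print(top_corr.head(20))  # the 20 most correlated feature pairs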
# i = index of the fighter's fight: 0 means the most recent fight, -1 the first one
def select_fight_row(df, name, i):
    # Filter df on the fighter's name
    df_temp = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
    # As we created a new temporary dataframe, we have to reset the indexes
    df_temp.reset_index(drop=True, inplace=True)
    # Get the index of the oldest fight
    idx = max(df_temp.index)
    # If we are looking for a fight that doesn't exist, return nothing
    if i > idx:
        return
    arr = df_temp.iloc[i, :].values
    return arr

# We get the last fight of Khabib :'(
#print(select_fight_row(df, 'Khabib Nurmagomedov', 0))

# Get all active UFC fighters (according to the limit_date parameter)
def list_fighters(df, limit_date):
    # Keep only the fights that occurred after the specified limit date
    df_temp = df[df['date'] > limit_date]
    # Collect the unique fighters from the red and blue corners
    set_R = set(df_temp['R_fighter'])
    set_B = set(df_temp['B_fighter'])
    fighters = list(set_R.union(set_B))
    # print("Number of fighters: " + str(len(fighters)))
    return fighters

# Before this date, the fight data was not complete and correct
#fighters = list_fighters(df, '2015-01-01')

def build_df(df, fighters, i):
    arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters))
           if select_fight_row(df, fighters[f], i) is not None]
    cols = [col for col in df]
    df_fights = pd.DataFrame(data=arr, columns=cols)
    df_fights.drop_duplicates(inplace=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return df_fights

def build_df_all_but_last(df, fighters):
    cols = [col for col in df]
    df_fights = pd.DataFrame(columns=cols)
    for f in range(len(fighters)):
        i = 0
        while True:
            fight_row = select_fight_row(df, fighters[f], i)
            if fight_row is None:
                # No more fights for this fighter: drop the most recently
                # appended row (this fighter's oldest fight)
                if not df_fights.empty:
                    df_fights = df_fights.iloc[:-1]
                break
            fight_row = list(fight_row)
            dfTemp = pd.DataFrame(data=[fight_row], columns=cols)
            # Drop all-NaN columns before concatenating to avoid dtype issues
            df_fights = df_fights.dropna(axis=1, how='all')
            df_fights = pd.concat([df_fights, dfTemp], ignore_index=True)
            i = i + 1
    df_fights.drop_duplicates(inplace=True)
    # Remove fights involving an 'Open Stance' fighter
    df_fights = df_fights[~df_fights.apply(lambda row: 'Open Stance' in row.values, axis=1)].reset_index(drop=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return df_fights

#df_train = build_df_all_but_last(df, fighters)
#df_test = build_df(df, fighters, 0)

#preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')

# LabelEncoder sorts classes alphabetically, so if the winner is from the Red
# corner the Winner label is encoded as 1, otherwise as 0 (Blue corner)
#label_encoder = LabelEncoder()
#y_train = label_encoder.fit_transform(df_train['Winner'])
#y_test = label_encoder.transform(df_test['Winner'])

#X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)

# Random Forest composed of 100 decision trees. The parameters were tuned using
# cross-validation and GridSearch paired together
#random_forest = RandomForestClassifier(n_estimators=100,
#                                       criterion='entropy',
#                                       max_depth=10,
#                                       min_samples_split=2,
#                                       min_samples_leaf=1,
#                                       random_state=0)
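
# A minimal sketch of how these hyper-parameters could be tuned with GridSearchCV
# (the grid below is illustrative, not the one actually used for this model):
#from sklearn.model_selection import GridSearchCV
#param_grid = {'random_forest__n_estimators': [100, 200],
#              'random_forest__max_depth': [5, 10, 20],
#              'random_forest__criterion': ['gini', 'entropy']}
#grid_search = GridSearchCV(Pipeline([('encoding', preprocessor),
#                                     ('random_forest', RandomForestClassifier(random_state=0))]),
#                           param_grid, cv=5, scoring='accuracy')
#grid_search.fit(X_train, y_train)
#print(grid_search.best_params_)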
#model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
#model.fit(X_train, y_train)

# We use 5-fold cross-validation to get a more reliable accuracy estimate (less variance)
#accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
#print('Accuracy mean : ', accuracies.mean())
#print('Accuracy standard deviation : ', accuracies.std())

#y_pred = model.predict(X_test)
#print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')

#target_names = ["Blue", "Red"]
#print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# Plot the confusion matrix (rows are actual labels, columns are predicted ones)
# cm = confusion_matrix(y_test, y_pred)
# ax = plt.subplot()
# sns.heatmap(cm, annot=True, ax=ax, fmt="d")
# ax.set_xlabel('Predicted')
# ax.set_ylabel('Actual')
# ax.set_title("Confusion Matrix")
# ax.xaxis.set_ticklabels(['Blue', 'Red'])
# ax.yaxis.set_ticklabels(['Blue', 'Red'])
# plt.show()

# Display the n most important features of the model
#feature_names = [col for col in X_train]
#feature_importances = model['random_forest'].feature_importances_
#indices = np.argsort(feature_importances)[::-1]
#n = 30  # maximum number of feature importances displayed
#idx = indices[0:n]
#std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
#    print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))

# plt.figure(figsize=(30, 8))
# plt.title("Feature importances")
# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
# plt.xticks(range(n), [feature_names[id] for id in idx], rotation=45)
# plt.xlim([-1, n])
# plt.show()

# Select one tree of the model
#tree_estimator = model['random_forest'].estimators_[10]

# Plot the tree and save the image as a PNG file
# plt.figure(figsize=(1, 1))
# plot_tree(tree_estimator, feature_names=df_train.columns, filled=True, rounded=True, fontsize=10)
# plt.savefig('tree.png', dpi=600)
# plt.show()
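
# Note on predict_proba ordering (assumes the label encoding above, Blue=0 / Red=1):
# the columns returned by predict_proba follow pipeline.classes_, so column 0 is
# P(Blue wins) and column 1 is P(Red wins). The predict() helper below relies on this.
#print(model.classes_)                   # -> [0 1]
#print(model.predict_proba(X_test[:1]))  # -> [[P(Blue), P(Red)]]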
def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    try:
        # We build two dataframes, one for each fighter, keeping only his last fight
        f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
        f1.reset_index(drop=True, inplace=True)
        f1 = f1[:1]
        f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy()
        f2.reset_index(drop=True, inplace=True)
        f2 = f2[:1]

        # Depending on the corner the fighter occupied in his last fight, we keep
        # only his own statistics (not the opponent's), then rename the columns to
        # match the corner he occupies in this prediction, using re.sub()
        if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter:
            # He was in the red corner: keep the red-corner stats and rename them
            # with the "B_" prefix because he is now in the blue corner
            result1 = f1.filter(regex='^R', axis=1).copy()
            result1.rename(columns=lambda x: re.sub('^R', 'B', x), inplace=True)
        else:
            result1 = f1.filter(regex='^B', axis=1).copy()
        if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter:
            result2 = f2.filter(regex='^R', axis=1).copy()
        else:
            result2 = f2.filter(regex='^B', axis=1).copy()
            result2.rename(columns=lambda x: re.sub('^B', 'R', x), inplace=True)

        # Concatenate the red and blue fighter dataframes (column-wise) and remove the fighter names
        fight = pd.concat([result1, result2], axis=1)
        fight.drop(['R_fighter', 'B_fighter'], axis=1, inplace=True)

        # Add the title_bout, weight class and number of rounds data to the dataframe
        fight.insert(0, 'title_bout', title_bout)
        fight.insert(1, 'weight_class', weightclass)
        fight.insert(2, 'no_of_rounds', rounds)
        fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})

        pred = pipeline.predict(fight)
        proba = pipeline.predict_proba(fight)
        if pred[0] == 1:
            print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
        else:
            print("The predicted winner is", blue_fighter, 'with a probability of', round(proba[0][0] * 100, 2), "%")
        return proba
    except (KeyError, IndexError):
        print("One of the fighters doesn't exist in the dataframe")
        return

#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#predict(df, model, 'Charles Oliveira', 'Conor McGregor', 'Lightweight', 5, True)
#predict(df, model, 'Charles Oliveira', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)