import re

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.tree import plot_tree

pd.options.display.max_columns = None
pd.options.display.max_rows = None

def displayNumberOfNaNValues(df):
    # Collect (column index, NaN count) pairs for every column
    na = []
    for index, col in enumerate(df):
        na.append((index, df[col].isna().sum()))
    # Sort the pairs by NaN count, in descending order
    na.sort(key=lambda x: x[1], reverse=True)
    # Print every column that contains at least one NaN
    for i in range(len(df.columns)):
        if na[i][1] != 0:
            print(df.columns[na[i][0]], ":", na[i][1], "NaN")
    # Print the number of affected features and the grand total of NaN values
    print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
    print("Total NaN in dataframe:", df.isna().sum().sum())

df = pd.read_csv('archive/data.csv')
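# As an aside, the per-column report produced by displayNumberOfNaNValues can also
# be computed with vectorized pandas. A minimal, equivalent sketch (kept commented
# out so the report is only printed once, by the function call below):
# nan_counts = df.isna().sum().sort_values(ascending=False)
# print(nan_counts[nan_counts > 0])
# print('Number of features with NaN values:', (nan_counts > 0).sum())
# print('Total NaN in dataframe:', nan_counts.sum())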
# Before April 2001, the UFC had almost no rules (no judges, no time limits, no rounds, etc.).
# On that date, the UFC adopted the set of rules known as the
# "Unified Rules of Mixed Martial Arts", so we drop every fight that predates this
# major update in the UFC's rules history. The old data would not be representative
# of current fights, especially since the sport has since become one of the most
# regulated, given its mix of disciplines and its complexity.
limit_date = '2001-04-01'
df = df[df['date'] > limit_date]

# Display NaN values
displayNumberOfNaNValues(df)

# Important numerical features to impute with the median
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age',
                'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
for feature in imp_features:
    # Fit and transform the feature using median imputation
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Assign the imputed values back to the DataFrame
    df[feature] = imp_feature

# Impute missing stances with the most frequent value.
# We assign the imputed arrays back directly (ravelled to 1-D): wrapping them in a new
# DataFrame, as the original code did, would misalign its default 0..n index with the
# filtered df's index and silently reintroduce NaN values.
imp_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df['R_Stance'] = imp_most_frequent.fit_transform(df['R_Stance'].values.reshape(-1, 1)).ravel()
df['B_Stance'] = imp_most_frequent.fit_transform(df['B_Stance'].values.reshape(-1, 1)).ravel()

# Alternative kept for reference: drop the rows with NaN values instead of imputing
# na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
# df.dropna(subset=na_features, inplace=True)

# Drop the 'Referee' and 'location' columns: they have little influence on the
# outcome of a fight, so they are not worth keeping
df.drop(['Referee', 'location'], axis=1, inplace=True)

# Drop the draw-count columns, drawn fights and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']

# Keep only float/int columns for the correlation matrix
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
# Show the correlation matrix of the dataframe (very slow to render; a text-based
# alternative is sketched right below)
# plt.show()
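# For a quicker scan than the full heatmap, this minimal sketch lists the feature
# pairs whose absolute Pearson correlation exceeds a threshold (the 0.9 cut-off is
# an arbitrary choice):
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
high_corr = upper.stack().loc[lambda s: s > 0.9].sort_values(ascending=False)
print(high_corr.head(20))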
# i = index of the fighter's fight: 0 means the last fight, -1 means the first fight
def select_fight_row(df, name, i):
    # Filter df on the fighter's name (either corner)
    df_temp = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
    # As we created a new temporary dataframe, we have to reset its indexes
    df_temp.reset_index(drop=True, inplace=True)
    # Get the index of the oldest fight
    idx = max(df_temp.index)
    # If we are looking for a fight that doesn't exist, return nothing
    if i > idx:
        return
    arr = df_temp.iloc[i, :].values
    return arr

# We get the last fight of Khabib :'(
print(select_fight_row(df, 'Khabib Nurmagomedov', 0))

# Get all active UFC fighters (according to the limit_date parameter)
def list_fighters(df, limit_date):
    df_temp = df[df['date'] > limit_date]
    set_R = set(df_temp['R_fighter'])
    set_B = set(df_temp['B_fighter'])
    fighters = list(set_R.union(set_B))
    print("Number of fighters: " + str(len(fighters)))
    return fighters

# Fight data from before 2016 is incomplete and unreliable, so we only keep
# fighters who have been active since then
fighters = list_fighters(df, '2016-01-01')

def build_df(df, fighters, i):
    # One row per fighter: his i-th most recent fight (skipping fighters without one)
    arr = [row for row in (select_fight_row(df, name, i) for name in fighters)
           if row is not None]
    cols = [col for col in df]
    df_fights = pd.DataFrame(data=arr, columns=cols)
    df_fights.drop_duplicates(inplace=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return df_fights

# Train on each fighter's most recent fight, test on the previous one
df_train = build_df(df, fighters, 0)
df_test = build_df(df, fighters, 1)
# print(df_train.head(5))

preprocessor = make_column_transformer(
    (OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']),
    remainder='passthrough')

# If the winner is from the Red corner, the Winner label is encoded as 1,
# otherwise as 0 (Blue corner)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])

X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)

# Random forest of 100 decision trees; the hyperparameters were tuned with
# cross-validation and grid search paired together (see the sketch further below)
random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       max_depth=10,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       random_state=0)

model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)

# We use 5-fold cross-validation to get a more precise accuracy estimate (less variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())

y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')

target_names = ["Blue", "Red"]
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# cm = confusion_matrix(y_test, y_pred)
# ax = plt.subplot()
# sns.heatmap(cm, annot=True, ax=ax, fmt="d")
# ax.set_xlabel('Predicted')  # heatmap columns are the predicted labels
# ax.set_ylabel('Actual')     # heatmap rows are the true labels
# ax.set_title("Confusion Matrix")
# ax.xaxis.set_ticklabels(['Blue', 'Red'])
# ax.yaxis.set_ticklabels(['Blue', 'Red'])
# plt.show()
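# The hyperparameters above are stated to come from a grid search paired with
# cross-validation, but the exact grid is not shown in this notebook. The sketch
# below is an illustrative assumption of how that search might look (kept commented
# out because it refits the whole pipeline for every parameter combination):
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'random_forest__n_estimators': [100, 200, 300],
#     'random_forest__max_depth': [5, 10, 15],
#     'random_forest__min_samples_leaf': [1, 2, 5],
# }
# grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
# grid_search.fit(X_train, y_train)
# print(grid_search.best_params_, grid_search.best_score_)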
feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30  # maximum number of feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)

# for f in range(n):
#     print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))

# plt.figure(figsize=(30, 8))
# plt.title("Feature importances")
# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
# plt.xticks(range(n), [feature_names[id] for id in idx], rotation=45)
# plt.xlim([-1, n])
# plt.show()

# Select one tree from the model
tree_estimator = model['random_forest'].estimators_[10]

# Plot the tree and save the image as a PNG
# plt.figure(figsize=(1, 1))
# plot_tree(tree_estimator, feature_names=df_train.columns, filled=True, rounded=True, fontsize=10)
# plt.savefig('tree.png', dpi=600)
# plt.show()

def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    try:
        # We build two single-row dataframes, one per fighter, from their most recent fight
        f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
        f1.reset_index(drop=True, inplace=True)
        f1 = f1[:1]
        f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy()
        f2.reset_index(drop=True, inplace=True)
        f2 = f2[:1]

        # Depending on the corner each fighter occupied in his last fight, we keep only
        # his own statistics (not his opponent's), then rename the columns with re.sub()
        # to match the corner he occupies in this matchup
        if f1.loc[0, 'R_fighter'] == blue_fighter:
            # He was in the red corner, so rename the "R_" prefix to "B_"
            result1 = f1.filter(regex='^R', axis=1).copy()
            result1.rename(columns=lambda x: re.sub('^R', 'B', x), inplace=True)
        else:
            result1 = f1.filter(regex='^B', axis=1).copy()
        if f2.loc[0, 'R_fighter'] == red_fighter:
            result2 = f2.filter(regex='^R', axis=1).copy()
        else:
            # He was in the blue corner, so rename the "B_" prefix to "R_"
            result2 = f2.filter(regex='^B', axis=1).copy()
            result2.rename(columns=lambda x: re.sub('^B', 'R', x), inplace=True)

        # Concatenate the blue and red fighter stats (column-wise) and drop the names
        fight = pd.concat([result1, result2], axis=1)
        fight.drop(['R_fighter', 'B_fighter'], axis=1, inplace=True)
        # Add the title_bout, weight class and number-of-rounds data to the dataframe
        fight.insert(0, 'title_bout', title_bout)
        fight.insert(1, 'weight_class', weightclass)
        fight.insert(2, 'no_of_rounds', rounds)
        fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})

        pred = pipeline.predict(fight)
        proba = pipeline.predict_proba(fight)
        if pred[0] == 1:
            print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
        else:
            print("The predicted winner is", blue_fighter, 'with a probability of', round(proba[0][0] * 100, 2), "%")
        return proba
    except (IndexError, KeyError):
        print("One of the fighters doesn't exist in the dataframe")
        return

predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
predict(df, model, 'Charles Oliveira', 'Conor McGregor', 'Lightweight', 5, True)
predict(df, model, 'Charles Oliveira', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
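# A minimal sketch for reusing the trained pipeline without retraining; the file
# name 'ufc_model.joblib' is an arbitrary choice:
# import joblib
# joblib.dump(model, 'ufc_model.joblib')
# loaded_model = joblib.load('ufc_model.joblib')
# predict(df, loaded_model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)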