You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

153 lines
7.2 KiB

import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import sklearn
def displayNumberOfNaNValues(df):
    """Print a per-column summary of the NaN values found in *df*.

    Columns containing at least one NaN are printed one per line as
    ``<column> : <count> NaN``, ordered by descending count (columns with
    the same count keep their original left-to-right order).  Two summary
    lines follow: the number of columns that contain NaN values and the
    total number of NaN cells in the frame.

    Args:
        df: pandas DataFrame to inspect.  Duplicate column labels are
            supported because columns are addressed by position.

    Returns:
        None.  The report is written to standard output.
    """
    # One NaN count per column, in column order.  Counting once on the whole
    # frame (instead of df[col] per column) keeps every count a scalar even
    # when column labels are duplicated.
    nan_counts = df.isna().sum()

    # sorted() is stable, also with reverse=True, so ties keep column order.
    ranked = sorted(enumerate(nan_counts), key=lambda pair: pair[1], reverse=True)
    with_nan = [(position, count) for position, count in ranked if count != 0]

    for position, count in with_nan:
        print(df.columns[position], ":", count, "NaN")

    print('Number of features with NaN values:', len(with_nan))
    print("Total NaN in dataframe :", nan_counts.sum())
# i = index of the fighter's fight, 0 means the last fight, -1 means first fight
def select_fight_row(df, name, i):
    """Return the values of the *i*-th fight row involving fighter *name*.

    The fights of *name* are the rows of *df* where the fighter appears in
    either the ``R_fighter`` or the ``B_fighter`` column, kept in the order
    in which they appear in *df*.

    Args:
        df: DataFrame with at least ``R_fighter`` and ``B_fighter`` columns.
        name: Fighter name to look up.
        i: Position of the wanted fight among the fighter's rows.  Negative
            values count from the end (``-1`` is the fighter's last row).

    Returns:
        A NumPy array holding the selected row's values, or ``None`` when
        the fighter has no fight at position *i* (including the case where
        the fighter does not appear in *df* at all).
    """
    fights = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
    n_fights = len(fights)

    # Positional bounds check; with zero fights the range is empty, so an
    # unknown fighter yields None instead of raising.
    if not -n_fights <= i < n_fights:
        return None

    # iloc is positional, so no index reset is needed on the filtered frame.
    return fights.iloc[i, :].values
# get all active UFC fighters (according to the limit_date parameter)
def list_fighters(df, limit_date):
    """Return the unique fighters who fought strictly after *limit_date*.

    Args:
        df: DataFrame with ``date``, ``R_fighter`` and ``B_fighter`` columns.
        limit_date: Lower bound (exclusive); it is compared against the
            ``date`` column with ``>``.

    Returns:
        A list of unique fighter names taken from both corners of the
        retained fights.  Names are listed in order of first appearance
        (all red-corner names first, then blue-corner names not already
        seen), so the result is identical from one run to the next.
    """
    recent = df[df['date'] > limit_date]

    # A set union would give the same members, but its iteration order for
    # strings changes between interpreter runs; de-duplicating the stacked
    # columns keeps the order reproducible.
    names = pd.concat([recent['R_fighter'], recent['B_fighter']], ignore_index=True)
    return names.drop_duplicates().tolist()
def build_df(df, fighters, i):
    """Build a frame holding the *i*-th fight row of every listed fighter.

    Args:
        df: Source DataFrame; must contain the ``R_fighter``, ``B_fighter``,
            ``date`` and ``title_bout`` columns.
        fighters: Iterable of fighter names.
        i: Fight position forwarded to ``select_fight_row``.

    Returns:
        A new DataFrame with one row per distinct selected fight, the
        ``title_bout`` column mapped from True/False to 1/0 (any other value
        becomes NaN) and the ``R_fighter``, ``B_fighter`` and ``date``
        columns removed.
    """
    rows = []
    for fighter in fighters:
        # Look the row up once per fighter; fighters without an i-th fight
        # are skipped.
        row = select_fight_row(df, fighter, i)
        if row is not None:
            rows.append(row)

    df_fights = pd.DataFrame(data=rows, columns=list(df.columns))

    # Two fighters who faced each other can select the same fight row.
    df_fights = df_fights.drop_duplicates()
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    return df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1)
def build_df_all_but_last(df, fighters):
    """Collect every fight row of each fighter except the final one found.

    For each fighter, rows are fetched with ``select_fight_row`` at
    positions 0, 1, 2, ... and appended to an accumulator frame until the
    lookup returns ``None``; the row appended last for that fighter is then
    removed again.  After all fighters are processed the frame is
    de-duplicated, rows containing the value ``'Open Stance'`` in any column
    are discarded, ``title_bout`` is mapped from True/False to 1/0 and the
    ``R_fighter``, ``B_fighter`` and ``date`` columns are dropped.

    Args:
        df: Source DataFrame; must contain the ``R_fighter``, ``B_fighter``,
            ``date`` and ``title_bout`` columns.
        fighters: Iterable of fighter names.

    Returns:
        The resulting DataFrame with a fresh 0..n-1 index.

    Side effects:
        Prints the number of columns of *df*.
    """
    cols = list(df.columns)
    print(len(cols))
    df_fights = pd.DataFrame(columns=cols)

    for fighter in fighters:
        i = 0
        while True:
            fight_row = select_fight_row(df, fighter, i)
            if fight_row is None:
                # No more fights for this fighter: discard the row appended
                # last.  The `i > 0` guard makes sure the discarded row
                # really belongs to this fighter and not to a previous one.
                if i > 0 and not df_fights.empty:
                    df_fights = df_fights.iloc[:-1]
                break

            new_row = pd.DataFrame(data=[list(fight_row)], columns=cols)
            # All-NaN columns are removed from the accumulator before each
            # concat (presumably to avoid pandas' warning about concatenating
            # empty / all-NA columns -- TODO confirm).  The concat restores
            # them from `new_row`; NOTE(review): a restored column may end up
            # after the other columns, so do not rely on column position.
            df_fights = df_fights.dropna(axis=1, how='all')
            df_fights = pd.concat([df_fights, new_row], ignore_index=True)
            i += 1

    df_fights = df_fights.drop_duplicates()

    # Remove every row that contains the value 'Open Stance' in any column.
    has_open_stance = df_fights.apply(lambda row: 'Open Stance' in row.values, axis=1)
    df_fights = df_fights[~has_open_stance].reset_index(drop=True)

    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    return df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1)
def _corner_stats(df, fighter, corner):
    """Return *fighter*'s stats from their first row in *df*, prefixed for *corner*.

    The first row of *df* in which the fighter appears (either corner) is
    used.  Only the columns of the corner the fighter occupied in that row
    are kept, and their leading letter is rewritten to *corner* ('R' or 'B')
    when it differs.

    Raises:
        KeyError: if the fighter does not appear in *df*.
    """
    fights = df[(df['R_fighter'] == fighter) | (df['B_fighter'] == fighter)]
    if fights.empty:
        raise KeyError(f"no fight found for fighter {fighter!r}")

    # Single-row frame with index 0, so both fighters align on concat.
    first = fights.reset_index(drop=True).iloc[:1]
    source = 'R' if first.loc[0, 'R_fighter'] == fighter else 'B'

    # NOTE: '^R' / '^B' match every column whose name starts with that
    # letter, not only the 'R_' / 'B_' prefixed ones.
    stats = first.filter(regex='^' + source, axis=1).copy()
    if source != corner:
        stats = stats.rename(columns=lambda c: re.sub('^' + source, corner, c))
    return stats


def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    """Predict the winner of a bout between two fighters and print the result.

    A single-row feature frame is assembled from each fighter's statistics
    in the first row of *df* where they appear, prefixed ``B_`` for
    *blue_fighter* and ``R_`` for *red_fighter*, preceded by the
    ``title_bout``, ``weight_class`` and ``no_of_rounds`` columns.

    Args:
        df: DataFrame with ``R_fighter`` / ``B_fighter`` columns plus the
            per-corner statistic columns.
        pipeline: Fitted estimator exposing ``predict`` and ``predict_proba``.
        blue_fighter: Name of the fighter placed in the blue corner.
        red_fighter: Name of the fighter placed in the red corner.
        weightclass: Value stored in the ``weight_class`` column.
        rounds: Value stored in the ``no_of_rounds`` column.
        title_bout: Whether the bout is a title bout (mapped to 1/0).

    Returns:
        The output of ``pipeline.predict_proba`` for the assembled row.

    Raises:
        KeyError: if either fighter does not appear in *df*.
    """
    result1 = _corner_stats(df, blue_fighter, 'B')
    result2 = _corner_stats(df, red_fighter, 'R')
    print("OK 1")
    print("OK 2")

    # Blue columns first, then red; fighter names are not model features.
    fight = pd.concat([result1, result2], axis=1)
    fight = fight.drop(['R_fighter', 'B_fighter'], axis=1)
    fight.insert(0, 'title_bout', title_bout)
    fight.insert(1, 'weight_class', weightclass)
    fight.insert(2, 'no_of_rounds', rounds)
    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})
    print("OK 3")

    pred = pipeline.predict(fight)
    proba = pipeline.predict_proba(fight)
    print("OK 4")

    # A prediction of 1.0 is reported as a red-corner win using the second
    # probability column; anything else as a blue-corner win using the first.
    # NOTE(review): assumes the pipeline's classes are ordered (blue, red).
    if pred[0] == 1.0:
        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
    else:
        print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
    return proba
#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
#predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)