# MMIX/test.py

import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
# Remove pandas' display limits: with None, printed DataFrames are not truncated
# to a maximum number of columns or rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
import sklearn

def displayNumberOfNaNValues(df):
    """Print a NaN report for ``df``.

    One line is printed per column that contains at least one NaN, ordered
    from the most to the fewest missing values (ties keep column order),
    followed by the number of affected columns and the total NaN count of
    the whole DataFrame.  Nothing is returned.
    """
    # (column position, NaN count) for every column, in column order
    nan_counts = [
        (position, df[column].isna().sum())
        for position, column in enumerate(df)
    ]
    # Highest counts first; sorted() is stable so equal counts stay in column order
    ranked = sorted(nan_counts, key=lambda pair: pair[1], reverse=True)

    # Report only the columns that actually have missing values
    for position, count in ranked:
        if count != 0:
            print(df.columns[position], ":", count, "NaN")

    affected = sum(1 for _, count in ranked if count > 0)
    print('Number of features with NaN values:', affected)
    print("Total NaN in dataframe :", df.isna().sum().sum())


#  i = index of the fighter's fight, 0 means the last fight, -1 means first fight
def select_fight_row(df, name, i):
    """Return the values of the ``i``-th fight row involving ``name``.

    The rows of ``df`` where ``name`` appears in 'R_fighter' or 'B_fighter'
    are kept in their original order and ``i`` is a position within them
    (negative positions count from the end).  Which fight position 0 refers
    to therefore depends on how ``df`` is sorted.

    Args:
        df: DataFrame with at least 'R_fighter' and 'B_fighter' columns.
        name: fighter name to look for.
        i: position of the wanted fight among the fighter's rows.

    Returns:
        A NumPy array with the row's values, or ``None`` when the fighter
        has no row at that position (including fighters absent from ``df``).
    """
    involved = (df['R_fighter'] == name) | (df['B_fighter'] == name)
    fights = df[involved]
    n_fights = len(fights)
    # Out-of-range positions (either direction) and unknown fighters yield
    # None rather than an exception; with n_fights == 0 no position is valid.
    if not -n_fights <= i < n_fights:
        return None
    # iloc is positional, so the filtered frame's index need not be reset
    return fights.iloc[i, :].values

# get all active UFC fighters (according to the limit_date parameter)
def list_fighters(df, limit_date):
    """Return the unique fighter names appearing in fights dated after ``limit_date``.

    Only rows whose 'date' is strictly greater than ``limit_date`` are
    considered; names are gathered from both the 'R_fighter' and
    'B_fighter' columns.  The result is a list built from a set, so its
    order is not meaningful.
    """
    recent_fights = df.loc[df['date'] > limit_date]
    red_corner = recent_fights['R_fighter']
    blue_corner = recent_fights['B_fighter']
    # Set union removes fighters that show up in both corners
    return list(set(red_corner) | set(blue_corner))

def build_df(df, fighters, i):
    """Build a DataFrame holding the ``i``-th fight row of each fighter.

    ``select_fight_row`` is queried once per fighter; fighters for which it
    returns ``None`` are skipped.  Duplicate rows are then removed,
    'title_bout' is mapped from True/False to 1/0 and the 'R_fighter',
    'B_fighter' and 'date' columns are dropped.

    Args:
        df: source DataFrame; must contain 'R_fighter', 'B_fighter', 'date'
            and 'title_bout' columns.
        fighters: iterable of fighter names.
        i: fight position forwarded to ``select_fight_row``.

    Returns:
        A new DataFrame with one row per distinct selected fight.
    """
    rows = []
    for fighter in fighters:
        # Look each fighter up once; every lookup filters the whole frame
        row = select_fight_row(df, fighter, i)
        if row is not None:
            rows.append(row)

    df_fights = pd.DataFrame(data=rows, columns=list(df.columns))
    # The same fight can be selected via both of its fighters
    df_fights.drop_duplicates(inplace=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return df_fights

def build_df_all_but_last(df, fighters):
    """Build a DataFrame of every listed fighter's fights, minus one row per fighter.

    For each name in ``fighters`` the rows returned by ``select_fight_row``
    for i = 0, 1, 2, ... are appended one at a time until it returns
    ``None``; at that point the last row of the accumulated frame is removed
    (if the frame is not empty) and the next fighter is processed.
    Afterwards duplicate rows and rows containing the value 'Open Stance'
    are dropped, 'title_bout' is mapped from True/False to 1/0, and the
    'R_fighter', 'B_fighter' and 'date' columns are removed.

    Args:
        df: source DataFrame; must contain 'R_fighter', 'B_fighter', 'date'
            and 'title_bout' columns.
        fighters: indexable sequence of fighter names.

    Returns:
        The assembled DataFrame.
    """
    cols = [col for col in df]
    # Prints the number of columns of df
    print(len(cols))
    df_fights=pd.DataFrame(columns=cols)
    for f in range(len(fighters)):
        i=0
        while True:
            fight_row = select_fight_row(df, fighters[f], i)
            if fight_row is None:
                # No row at position i: discard the last accumulated row
                # and move on to the next fighter
                if not df_fights.empty:
                    df_fights = df_fights.iloc[:-1]
                break
            fight_row = list(fight_row)
            dfTemp = pd.DataFrame(data=[fight_row], columns=cols)
            # Drop columns that are entirely NaN in the rows gathered so far.
            # NOTE(review): presumably this avoids pandas' concat warning about
            # empty/all-NA columns — confirm. Columns dropped here are brought
            # back by dfTemp in the concat below, which may change column order.
            df_fights = df_fights.dropna(axis=1, how='all')
            df_fights = pd.concat([df_fights, dfTemp], ignore_index=True)
            i=i+1
    df_fights.drop_duplicates(inplace=True)
    # Remove every row in which any cell equals 'Open Stance'
    df_fights = df_fights[~df_fights.apply(lambda row: 'Open Stance' in row.values, axis=1)].reset_index(drop=True)
    # True -> 1, False -> 0
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)

    return df_fights

def _latest_stats_for_corner(df, fighter, corner):
    """Return a one-row frame of ``fighter``'s stats, prefixed for ``corner`` ('R' or 'B')."""
    fights = df[(df['R_fighter'] == fighter) | (df['B_fighter'] == fighter)]
    if fights.empty:
        # KeyError is what the unguarded lookup raised before; keep the type
        # but say which fighter is missing.
        raise KeyError(f"No fight found in df for fighter {fighter!r}")
    # The first matching row is used as the fighter's reference fight
    latest = fights.iloc[[0]].reset_index(drop=True)

    # Keep only the columns of the corner the fighter occupied in that fight
    previous_corner = 'R' if latest.loc[0, 'R_fighter'] == fighter else 'B'
    stats = latest.filter(regex='^' + previous_corner, axis=1).copy()
    if previous_corner != corner:
        # Re-label the columns for the corner requested for the new fight
        stats = stats.rename(
            columns=lambda col: re.sub('^' + previous_corner, corner, col)
        )
    return stats


def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    """Predict the winner of a fight between two fighters and print the result.

    For each fighter, the first row of ``df`` in which they appear supplies
    their statistics (NOTE(review): this presumably expects ``df`` to be
    sorted most-recent fight first — confirm).  The columns of the corner
    they occupied in that row are renamed to the corner given here, the two
    one-row frames are joined side by side, the fighter names are removed and
    'title_bout', 'weight_class' and 'no_of_rounds' are inserted as the first
    three columns before the frame is passed to ``pipeline``.

    Args:
        df: DataFrame with 'R_fighter', 'B_fighter' and the R*/B* stat columns.
        pipeline: fitted estimator exposing ``predict`` and ``predict_proba``.
        blue_fighter: name of the fighter placed in the blue corner.
        red_fighter: name of the fighter placed in the red corner.
        weightclass: value stored in the 'weight_class' column.
        rounds: value stored in the 'no_of_rounds' column.
        title_bout: whether the fight is a title bout (encoded as 1/0).

    Returns:
        The array returned by ``pipeline.predict_proba`` for the fight.

    Raises:
        KeyError: if either fighter does not appear in ``df``.
    """
    blue_stats = _latest_stats_for_corner(df, blue_fighter, 'B')
    red_stats = _latest_stats_for_corner(df, red_fighter, 'R')

    # Both frames have the single index 0, so they align into one row
    fight = pd.concat([blue_stats, red_stats], axis=1)
    fight.drop(['R_fighter', 'B_fighter'], axis=1, inplace=True)
    fight.insert(0, 'title_bout', title_bout)
    fight.insert(1, 'weight_class', weightclass)
    fight.insert(2, 'no_of_rounds', rounds)
    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})

    pred = pipeline.predict(fight)
    proba = pipeline.predict_proba(fight)

    # A prediction of 1 is reported as a red win using probability column 1;
    # anything else as a blue win using probability column 0.
    if pred[0] == 1.0:
        winner, win_proba = red_fighter, proba[0][1]
    else:
        winner, win_proba = blue_fighter, proba[0][0]
    print("The predicted winner is", winner, 'with a probability of', round(win_proba * 100, 2), "%")
    return proba


#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
#predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)