parent 2f3b9f8d98
commit 4cc1e720f4
@@ -1,174 +0,0 @@
from flask import Flask, render_template, request
import pandas as pd
from test import *  # make sure test.py (which defines the helper functions and predict) is available

from sklearn.linear_model import LogisticRegression

app = Flask(__name__)

colonnes = ['B_fighter', 'R_fighter', 'title_bout',
            'B_avg_BODY_landed', 'B_avg_HEAD_landed', 'B_avg_TD_att', 'B_avg_TOTAL_STR_landed',
            'B_avg_opp_BODY_att', 'B_avg_opp_HEAD_landed', 'B_avg_opp_LEG_landed',
            'B_avg_opp_SIG_STR_att', 'B_avg_opp_TOTAL_STR_att',
            'R_avg_BODY_landed', 'R_avg_HEAD_landed', 'R_avg_TD_att', 'R_avg_TOTAL_STR_landed',
            'R_avg_opp_BODY_att', 'R_avg_opp_HEAD_landed', 'R_avg_opp_LEG_landed',
            'R_avg_opp_SIG_STR_att', 'R_avg_opp_TOTAL_STR_att',
            'B_age', 'R_age', 'date', 'Winner', 'weight_class', 'B_Stance', 'R_Stance']

# Load the DataFrame only once to save resources
df = pd.read_csv('archive/data.csv')  # make sure to specify the correct path to your data file

# Before April 2001, there were almost no rules in the UFC (no judges, no time limits, no rounds, etc.).
# It was at this precise date that the UFC started to implement the set of rules known as the
# "Unified Rules of Mixed Martial Arts".
# Therefore, we delete all fights before this major update in the UFC's rule history.

# Using this old data would not be representative of current fights, especially since the
# sport has become one of the most regulated, given how varied and complex it is.
limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)]

# Display NaN values
displayNumberOfNaNValues(df)

# Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')

# Iterate over each feature to impute missing values
for feature in imp_features:
    # Fit and transform the feature using median imputation
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Assign the imputed values back to the DataFrame
    df[feature] = imp_feature

# Impute missing values for 'R_Stance' using the most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1, 1))

# Impute missing values for 'B_Stance' using the most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1, 1))

# Assign the imputed stances back to the DataFrame (as arrays, so the values are placed
# positionally instead of being aligned on the index, which was not reset after the date filter)
df['R_Stance'] = imp_R_stance
df['B_Stance'] = imp_B_stance


df.drop(['Referee', 'location'], axis=1, inplace=True)

# Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' results and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']

# Keep only the numeric (float/int) columns
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])

plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
## Show the correlation matrix of the dataframe
## Rendering this large heatmap is very slow, so the display is left commented out
# plt.show()


# 2015 was the last year in which the fight data was incomplete, so we only keep fighters active after 2015-01-01
fighters = list_fighters(df, '2015-01-01')

df = df[colonnes]

# Get every fight of every fighter
df_train = build_df_all_but_last(df, fighters)

# Get the last fight of every fighter to test the model
df_test = build_df(df, fighters, 0)

# Create a column transformer that ordinally encodes the specified categorical columns
# while leaving the other columns unchanged
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')

# Use LabelEncoder to encode the 'Winner' column into numerical labels for both the
# training and testing datasets, then separate the features from the target variable
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)

# Random forest composed of 100 decision trees. The parameters were tuned with
# grid search combined with cross-validation
random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       max_depth=10,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       random_state=0)


# Baseline: a single decision tree, wrapped in the same encoding pipeline so the
# categorical columns are handled before fitting
from sklearn.tree import DecisionTreeClassifier
Arbre_decision = DecisionTreeClassifier(random_state=0, max_depth=20)
clf = Pipeline([('encoding', preprocessor), ('tree', Arbre_decision)])
clf.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
ypredit = clf.predict(X_test)
tree_accuracy = accuracy_score(y_test, ypredit)

# Train the random forest pipeline
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)

print('Decision tree testing accuracy : ', tree_accuracy, '\n')


# We use 5-fold cross-validation to get a more reliable accuracy estimate (reduced variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())

# Test
y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')

# Class labels for the report
target_names = ["Blue", "Red"]
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# Feature names
feature_names = [col for col in X_train]
# Importance of every feature, taken from the trained random forest
feature_importances = model['random_forest'].feature_importances_
# Sort importances in descending order
indices = np.argsort(feature_importances)[::-1]
n = 30  # maximum number of feature importances displayed
idx = indices[0:n]
# Standard deviation of each feature's importance across the trees of the forest
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
# Select one tree from the forest (e.g. for later visualization)
tree_estimator = model['random_forest'].estimators_[10]
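
# The variables above (feature_names, feature_importances, idx, std) are computed but never
# used in this file. Below is a minimal, hypothetical sketch of turning them into a bar chart;
# it assumes the order of X_train's columns matches the importances, which is only approximate
# here because the ColumnTransformer moves the ordinally encoded columns to the front.
plt.figure(figsize=(15, 8))
plt.title('Top ' + str(n) + ' feature importances')
plt.bar(range(len(idx)), feature_importances[idx], yerr=std[idx], align='center')
plt.xticks(range(len(idx)), [feature_names[i] for i in idx], rotation=90)
plt.tight_layout()
# plt.show()  # left commented out, like the correlation heatmap above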


@app.route('/')
def index():
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def make_prediction():
    blue_fighter = request.form['blue_fighter']
    red_fighter = request.form['red_fighter']
    weightclass = request.form['weightclass']
    rounds = int(request.form['rounds'])
    title_bout = True if request.form['title_bout'] == 'True' else False

    prediction_proba = predict(df, model, blue_fighter, red_fighter, weightclass, rounds, title_bout)

    # Format the result for display in the browser
    result = ""
    if prediction_proba is not None:
        result = f"The predicted probability of {blue_fighter} winning is {round(prediction_proba[0][0] * 100, 2)}% and the predicted probability of {red_fighter} winning is {round(prediction_proba[0][1] * 100, 2)}%"

    return render_template('result.html', result=result)


if __name__ == '__main__':
    app.run(debug=True)
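
# Hypothetical usage sketch (not part of the original app, kept commented out like the example
# predict() calls in test.py): once the server is running, the /predict endpoint can be exercised
# by posting the same form fields that index.html submits. The fighter names are taken from the
# commented examples in test.py; the requests library and the default Flask port 5000 are assumed.
# import requests
# form_data = {
#     'blue_fighter': 'Leon Edwards',
#     'red_fighter': 'Kamaru Usman',
#     'weightclass': 'Welterweight',
#     'rounds': '5',
#     'title_bout': 'True',
# }
# response = requests.post('http://127.0.0.1:5000/predict', data=form_data)
# print(response.text)  # result.html containing the predicted win probabilities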
@@ -1,32 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>UFC Fight Prediction</title>
</head>
<body>
    <h1>UFC Fight Prediction</h1>
    <form action="/predict" method="post">
        <label for="blue_fighter">Blue Fighter:</label>
        <input type="text" id="blue_fighter" name="blue_fighter"><br><br>

        <label for="red_fighter">Red Fighter:</label>
        <input type="text" id="red_fighter" name="red_fighter"><br><br>

        <label for="weightclass">Weight Class:</label>
        <input type="text" id="weightclass" name="weightclass"><br><br>

        <label for="rounds">Number of Rounds:</label>
        <input type="number" id="rounds" name="rounds" min="1" max="5" value="3"><br><br>

        <label for="title_bout">Title Bout:</label>
        <select id="title_bout" name="title_bout">
            <option value="True">Yes</option>
            <option value="False" selected>No</option>
        </select><br><br>

        <input type="submit" value="Predict">
    </form>
</body>
</html>
@@ -1,13 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Prediction Result</title>
</head>
<body>
    <h2>Prediction Result</h2>
    <p>{{ result }}</p>
    <p><a href="/">Make Another Prediction</a></p>
</body>
</html>
@@ -1,152 +0,0 @@
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz
from sklearn.tree import plot_tree
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
import sklearn

pd.options.display.max_columns = None
pd.options.display.max_rows = None

def displayNumberOfNaNValues(df):
    # Create an empty list to store tuples of (column index, number of NaN values)
    na = []
    # Loop through each column in the DataFrame
    for index, col in enumerate(df):
        # Count the number of NaN values in each column and append the index and count to 'na'
        na.append((index, df[col].isna().sum()))
    # Sort the list based on the count of NaN values, in descending order
    na.sort(key=lambda x: x[1], reverse=True)
    # Iterate through the sorted list of columns
    for i in range(len(df.columns)):
        # Check if the count of NaN values for the current column is not zero
        if na[i][1] != 0:
            # Print the column name and its count of NaN values
            print(df.columns[na[i][0]], ":", na[i][1], "NaN")
    # Calculate and print the total number of features with NaN values
    print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
    print("Total NaN in dataframe :", df.isna().sum().sum())


# i = index of the fighter's fight, 0 means the last (most recent) fight, -1 means the first fight
def select_fight_row(df, name, i):
    df_temp = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]  # filter df on the fighter's name
    df_temp.reset_index(drop=True, inplace=True)  # as we created a new temporary dataframe, we have to reset the indexes
    idx = max(df_temp.index)  # get the index of the oldest fight
    if i > idx:  # if we are looking for a fight that doesn't exist, we return nothing
        return
    arr = df_temp.iloc[i, :].values
    return arr

# get all active UFC fighters (according to the limit_date parameter)
def list_fighters(df, limit_date):
    # Filter the DataFrame to include only fights occurring after the specified limit date
    df_temp = df[df['date'] > limit_date]
    # Create a set of all fighters from the red corner ('R_fighter') in the filtered DataFrame
    set_R = set(df_temp['R_fighter'])
    # Create a set of all fighters from the blue corner ('B_fighter') in the filtered DataFrame
    set_B = set(df_temp['B_fighter'])
    # Combine the sets of fighters from the red and blue corners to get all unique fighters
    fighters = list(set_R.union(set_B))
    # Print the number of unique fighters included in the list
    # print("Number of fighters: " + str(len(fighters)))
    # Return the list of unique fighters
    return fighters


def build_df(df, fighters, i):
    arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
    cols = [col for col in df]
    df_fights = pd.DataFrame(data=arr, columns=cols)
    df_fights.drop_duplicates(inplace=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
    return df_fights


def build_df_all_but_last(df, fighters):
    cols = [col for col in df]
    print(len(cols))
    df_fights = pd.DataFrame(columns=cols)
    for f in range(len(fighters)):
        i = 0
        while True:
            fight_row = select_fight_row(df, fighters[f], i)
            if fight_row is None:
                if not df_fights.empty:
                    df_fights = df_fights.iloc[:-1]
                break
            fight_row = list(fight_row)
            dfTemp = pd.DataFrame(data=[fight_row], columns=cols)
            df_fights = df_fights.dropna(axis=1, how='all')  # drop columns that are entirely NaN before concatenating
            df_fights = pd.concat([df_fights, dfTemp], ignore_index=True)
            i = i + 1
    df_fights.drop_duplicates(inplace=True)
    df_fights = df_fights[~df_fights.apply(lambda row: 'Open Stance' in row.values, axis=1)].reset_index(drop=True)
    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)

    return df_fights


def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
    # We build two dataframes, one for each fighter
    f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy()
    f1.reset_index(drop=True, inplace=True)
    f1 = f1[:1]
    f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy()
    f2.reset_index(drop=True, inplace=True)
    f2 = f2[:1]

    print("OK 1")

    # If the fighter was in the red/blue corner on his last fight, we filter the columns to only keep his statistics (and not the other fighter's),
    # then we rename the columns according to the corner colour given in the parameters, using re.sub()
    if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter:
        result1 = f1.filter(regex='^R', axis=1).copy()  # here we keep the red corner stats
        result1.rename(columns=lambda x: re.sub('^R', 'B', x), inplace=True)  # we rename them with the "B_" prefix because he is in the blue corner
    else:
        result1 = f1.filter(regex='^B', axis=1).copy()
    if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter:
        result2 = f2.filter(regex='^R', axis=1).copy()
    else:
        result2 = f2.filter(regex='^B', axis=1).copy()
        result2.rename(columns=lambda x: re.sub('^B', 'R', x), inplace=True)

    print("OK 2")

    fight = pd.concat([result1, result2], axis=1)  # we concatenate the red and blue fighter dataframes (in columns)
    fight.drop(['R_fighter', 'B_fighter'], axis=1, inplace=True)  # we remove the fighter names
    fight.insert(0, 'title_bout', title_bout)  # we add title_bout, weight class and number of rounds to the dataframe
    fight.insert(1, 'weight_class', weightclass)
    fight.insert(2, 'no_of_rounds', rounds)
    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})

    print("OK 3")

    pred = pipeline.predict(fight)
    proba = pipeline.predict_proba(fight)

    print("OK 4")

    if (pred == 1.0):
        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
    else:
        print("The predicted winner is", blue_fighter, 'with a probability of', round(proba[0][0] * 100, 2), "%")
    return proba


# predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
# predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
# predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
# predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
# predict(df, model, 'Charles Oliveira', 'Conor McGregor', 'Lightweight', 5, True)
# predict(df, model, 'Charles Oliveira', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
# predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)