luevard 1 year ago
commit cad4dc579f

@ -0,0 +1,131 @@
from flask import Flask, render_template, request
import pandas as pd
from test import *  # Make sure the imported module (test.py) provides your predict function

app = Flask(__name__)

# Load the DataFrame once to save resources
df = pd.read_csv('archive/data.csv')  # Make sure this path points to your data file
limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)]
displayNumberOfNaNValues(df)
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features:
    # Fit and transform the feature using median imputation
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Assign the imputed values back to the DataFrame
    df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
# Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
# Assign the imputed stances back to the DataFrame
# (ravel + direct assignment: wrapping the arrays in a fresh DataFrame would carry a
# new RangeIndex that no longer aligns with the date-filtered df and would inject NaNs)
df['R_Stance'] = imp_R_stance.ravel()
df['B_Stance'] = imp_B_stance.ravel()
df.drop(['Referee', 'location'], axis=1, inplace=True)
# Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']
# Keep only columns whose data type is float or int
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
fighters = list_fighters(df, '2015-01-01')
df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters, 0)
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
# Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       max_depth=10,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       random_state=0)
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)
# We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean:', accuracies.mean())
print('Accuracy standard deviation:', accuracies.std())
y_pred = model.predict(X_test)
print('Testing accuracy:', accuracy_score(y_test, y_pred), '\n')
target_names = ["Blue", "Red"]
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))
feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30 # maximum feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
# plt.figure(figsize=(30, 8))
# plt.title("Feature importances")
# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45)
# plt.xlim([-1, n])
# plt.show()
# Select one tree from the model
tree_estimator = model['random_forest'].estimators_[10]
@app.route('/')
def index():
    return render_template('index.html')
@app.route('/predict', methods=['POST'])
def make_prediction():
    blue_fighter = request.form['blue_fighter']
    red_fighter = request.form['red_fighter']
    weightclass = request.form['weightclass']
    rounds = int(request.form['rounds'])
    title_bout = request.form['title_bout'] == 'True'
    prediction_proba = predict(df, model, blue_fighter, red_fighter, weightclass, rounds, title_bout)
    # Format the result for display in the browser
    result = ""
    if prediction_proba is not None:
        result = (f"The predicted probability of {blue_fighter} winning is "
                  f"{round(prediction_proba[0][0] * 100, 2)}% and the predicted probability of "
                  f"{red_fighter} winning is {round(prediction_proba[0][1] * 100, 2)}%")
    return render_template('result.html', result=result)
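
# Sanity check (sketch): LabelEncoder assigns classes alphabetically, so 'Blue' -> 0
# and 'Red' -> 1, and predict_proba columns follow model.classes_. That is why
# prediction_proba[0][0] above is read as the blue corner's win probability.
proba_check = model.predict_proba(X_test.iloc[[0]])  # shape (1, 2): [[P(Blue), P(Red)]]
print(dict(zip(label_encoder.classes_, proba_check[0])))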
if __name__ == '__main__':
    app.run(debug=True)

@ -0,0 +1,32 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>UFC Fight Prediction</title>
</head>
<body>
    <h1>UFC Fight Prediction</h1>
    <form action="/predict" method="post">
        <label for="blue_fighter">Blue Fighter:</label>
        <input type="text" id="blue_fighter" name="blue_fighter"><br><br>
        <label for="red_fighter">Red Fighter:</label>
        <input type="text" id="red_fighter" name="red_fighter"><br><br>
        <label for="weightclass">Weight Class:</label>
        <input type="text" id="weightclass" name="weightclass"><br><br>
        <label for="rounds">Number of Rounds:</label>
        <input type="number" id="rounds" name="rounds" min="1" max="5" value="3"><br><br>
        <label for="title_bout">Title Bout:</label>
        <select id="title_bout" name="title_bout">
            <option value="True">Yes</option>
            <option value="False" selected>No</option>
        </select><br><br>
        <input type="submit" value="Predict">
    </form>
</body>
</html>

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Prediction Result</title>
</head>
<body>
    <h2>Prediction Result</h2>
    <p>{{ result }}</p>
    <p><a href="/">Make Another Prediction</a></p>
</body>
</html>
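
To smoke-test the form end to end without a browser, something like the following should work (a sketch: it assumes the dev server from app.py is running on Flask's default http://127.0.0.1:5000, and reuses fighter names from the predict examples further down):

import requests

# Post the same fields that the index.html form submits
resp = requests.post('http://127.0.0.1:5000/predict', data={
    'blue_fighter': 'Conor McGregor',
    'red_fighter': 'Khabib Nurmagomedov',
    'weightclass': 'Lightweight',
    'rounds': '5',
    'title_bout': 'True',
})
print(resp.status_code)  # 200 on success
print(resp.text)         # the rendered result.html containing the prediction sentence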

@ -37,9 +37,6 @@ def displayNumberOfNaNValues(df):
print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
print("Total NaN in dataframe :" , df.isna().sum().sum())
df = pd.read_csv('archive/data.csv')
# Before April 2001, there were almost no rules in the UFC (no judges, no time limits, no rounds, etc.).
# It was on this precise date that the UFC started to enforce the set of rules known as
# the "Unified Rules of Mixed Martial Arts".
@ -48,63 +45,63 @@ df = pd.read_csv('archive/data.csv')
# Using this old data would not be representative of current fights, especially since the
# sport has become one of the most regulated due to its mixed and complex nature.
limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)]
#limit_date = '2001-04-01'
#df = df[(df['date'] > limit_date)]
# Display NaN values
displayNumberOfNaNValues(df)
#displayNumberOfNaNValues(df)
# Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
#imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
# Initialize a SimpleImputer to impute missing values with median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
#imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features:
#for feature in imp_features:
# Fit and transform the feature using median imputation
imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
#imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
# Assign the imputed values back to the DataFrame
df[feature] = imp_feature
#df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
# Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
# Create DataFrames for imputed stances
df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
# drop B_avg_BODY_att values in the dataframe
# List of features with NaN values to drop
#na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
# Drop rows with NaN values in specified features
#df.dropna(subset=na_features, inplace=True)
# Drop columns 'Referee' and 'location' from the DataFrame
# Referee and location have little impact on fight outcomes, so they are not worth keeping
df.drop(['Referee', 'location'], axis=1, inplace=True)
# Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']
# Keep only columns whose data type is float or int
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
# Show the correlation matrix of the dataframe
# (rendering this heatmap is very slow)
#imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
#
## Impute missing values for 'B_Stance' using most frequent strategy
#imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
#
## Create DataFrames for imputed stances
#df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
#df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
#
## drop B_avg_BODY_att values in the dataframe
# # List of features with NaN values to drop
# #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
#
# # Drop rows with NaN values in specified features
# #df.dropna(subset=na_features, inplace=True)
#
## Drop columns 'Referee' and 'location' from the DataFrame
## Referee and location have little impact on fight outcomes, so they are not worth keeping
#df.drop(['Referee', 'location'], axis=1, inplace=True)
#
## Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
#df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
#df = df[df['Winner'] != 'Draw']
#df = df[df['weight_class'] != 'Catch Weight']
#
## Keep only columns whose data type is float or int
#dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
#
#plt.figure(figsize=(50, 40))
#corr_matrix = dfWithoutString.corr(method='pearson').abs()
#sns.heatmap(corr_matrix, annot=True)
#
## Show the correlation matrix of the dataframe
## (rendering this heatmap is very slow)
#
# plt.show()
# i = index of the fighter's fight, 0 means the last fight, -1 means first fight
@ -118,7 +115,7 @@ def select_fight_row(df, name, i):
return arr
# we get the last fight of Khabib :'(
print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
#print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
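
# For reference, a hypothetical sketch of select_fight_row, inferred from its signature,
# the comment above, and the `is not None` checks in build_df below. The column names
# R_fighter/B_fighter and the newest-first ordering of data.csv are assumptions; the
# real implementation in test.py may differ, so the sketch is left commented out.
#def select_fight_row(df, name, i):
#    rows = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
#    if len(rows) == 0 or i >= len(rows):
#        return None  # fighter unknown, or no i-th fight
#    return rows.iloc[i].values  # i = 0 is the last fight, i = -1 the first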
# get all active UFC fighters (according to the limit_date parameter)
@ -137,7 +134,7 @@ def list_fighters(df, limit_date):
return fighters
# Fight data before 2015 was incomplete and unreliable, so 2015-01-01 is used as the cutoff
fighters = list_fighters(df,'2015-01-01')
#fighters = list_fighters(df,'2015-01-01')
def build_df(df, fighters, i):
arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@ -171,43 +168,43 @@ def build_df_all_but_last(df, fighters):
return df_fights
df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters,0)
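
# Sketch of the split these two lines produce (hypothetical body; the real definition
# is above): build_df_all_but_last keeps every fight except each fighter's most recent
# one, while build_df(df, fighters, 0) collects exactly those held-out last fights, so
# the test set is each active fighter's latest bout. Left commented out as a sketch.
#def build_df_all_but_last(df, fighters):
#    arr = []
#    for name in fighters:
#        i = 1  # skip i = 0, the last fight, reserved for the test set
#        while (row := select_fight_row(df, name, i)) is not None:
#            arr.append(row)
#            i += 1
#    return pd.DataFrame(arr, columns=df.columns)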
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
# Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
random_forest = RandomForestClassifier(n_estimators=100,
criterion='entropy',
max_depth=10,
min_samples_split=2,
min_samples_leaf=1,
random_state=0)
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)
# We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())
y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
target_names = ["Blue","Red"]
print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
# cm = confusion_matrix(y_test, y_pred)
# ax = plt.subplot()
#
#df_train = build_df_all_but_last(df, fighters)
#df_test = build_df(df, fighters,0)
#
#preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
#
## If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
#label_encoder = LabelEncoder()
#y_train = label_encoder.fit_transform(df_train['Winner'])
#y_test = label_encoder.transform(df_test['Winner'])
#
#X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
#
## Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
#random_forest = RandomForestClassifier(n_estimators=100,
# criterion='entropy',
# max_depth=10,
# min_samples_split=2,
# min_samples_leaf=1,
# random_state=0)
#
#model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
#model.fit(X_train, y_train)
#
## We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
#accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
#print('Accuracy mean : ', accuracies.mean())
#print('Accuracy standard deviation : ', accuracies.std())
#
#y_pred = model.predict(X_test)
#print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
#
#target_names = ["Blue","Red"]
#print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
#
## cm = confusion_matrix(y_test, y_pred)
## ax = plt.subplot()
# sns.heatmap(cm, annot = True, ax = ax, fmt = "d")
# ax.set_xlabel('Actual')
# ax.set_ylabel('Predicted')
@ -216,12 +213,12 @@ print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_na
# ax.yaxis.set_ticklabels(['Blue', 'Red'])
# plt.show()
feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30 # maximum feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#feature_names = [col for col in X_train]
#feature_importances = model['random_forest'].feature_importances_
#indices = np.argsort(feature_importances)[::-1]
#n = 30 # maximum feature importances displayed
#idx = indices[0:n]
#std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
@ -234,7 +231,7 @@ std = np.std([tree.feature_importances_ for tree in model['random_forest'].estim
# plt.show()
# Select one tree from the model
tree_estimator = model['random_forest'].estimators_[10]
#tree_estimator = model['random_forest'].estimators_[10]
# Plot the tree
# plt.figure(figsize=(1, 1))
@ -284,11 +281,11 @@ def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_
return
predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#
#predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
#predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
