Add Web interface and starting refactor code

master
Lucas EVARD 1 year ago
parent 0100b87fea
commit a560f1416d

@ -0,0 +1,131 @@
from flask import Flask, render_template, request
import pandas as pd
from test import * # Assurez-vous d'avoir un fichier predict.py avec votre fonction predict
app = Flask(__name__)

# ---------------------------------------------------------------------------
# Data loading and cleaning (runs once, at import time, so that every request
# reuses the same DataFrame and trained model).
# ---------------------------------------------------------------------------

# Make sure this path points at the actual data file.
df = pd.read_csv('archive/data.csv')

# Keep only the fights that took place after the cut-off date.
# NOTE(review): 'date' is compared against a plain string, so this presumably
# relies on the column holding ISO-formatted (YYYY-MM-DD) strings -- confirm.
limit_date = '2001-04-01'
# .copy() gives us an independent frame, so the column assignments below do not
# write into a slice of the original DataFrame.
df = df[(df['date'] > limit_date)].copy()
displayNumberOfNaNValues(df)

# Numeric features whose missing values are replaced by the column median.
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
for feature in imp_features:
    # The imputer works on 2-D input, hence the reshape to a single column.
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Flatten back to 1-D and assign positionally.
    df[feature] = imp_feature.ravel()

# Missing stances are replaced by the most frequent stance of each corner.
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1, 1))
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1, 1))

# Assign the imputed arrays positionally. Wrapping them in a new DataFrame
# (as was done before) gives them a fresh 0..n-1 index, and pandas then aligns
# on index labels: after the row filter above, any label mismatch would put
# values on the wrong rows or reintroduce NaN.
df['R_Stance'] = imp_R_stance.ravel()
df['B_Stance'] = imp_B_stance.ravel()

# Columns that are not used as model features.
df.drop(['Referee', 'location'], axis=1, inplace=True)
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
# Drawn fights and 'Catch Weight' bouts are excluded from the dataset.
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']

# Correlation heat map over the numeric columns only.
# NOTE(review): the figure is built but never shown or saved in this module;
# it costs start-up time for the web app -- consider removing it here.
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)

# ---------------------------------------------------------------------------
# Train / test split and model training.
# ---------------------------------------------------------------------------

fighters = list_fighters(df, '2015-01-01')
df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters, 0)

# Categorical columns are ordinal-encoded; every other column passes through.
preprocessor = make_column_transformer(
    (OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']),
    remainder='passthrough',
)

# LabelEncoder sorts its classes, so 'Blue' is encoded as 0 and 'Red' as 1.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)

# Random forest of 100 trees; the hyper-parameters come from an earlier
# cross-validated grid search.
random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       max_depth=10,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       random_state=0)
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)

# 5-fold cross-validation gives a less noisy accuracy estimate.
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())

y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
target_names = ["Blue", "Red"]
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))

# Feature importances, sorted from most to least important.
feature_names = list(X_train.columns)
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30  # maximum feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
#    print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
# plt.figure(figsize=(30, 8))
# plt.title("Feature importances")
# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45)
# plt.xlim([-1, n])
# plt.show()

# Pick a single tree out of the trained forest.
tree_estimator = model['random_forest'].estimators_[10]
@app.route('/')
def index():
    """Serve the landing page that contains the prediction form."""
    landing_page = render_template('index.html')
    return landing_page
@app.route('/predict', methods=['POST'])
def make_prediction():
    """Handle the prediction form and render the outcome page.

    Reads ``blue_fighter``, ``red_fighter``, ``weightclass``, ``rounds`` and
    ``title_bout`` from the posted form, passes them to ``predict`` together
    with the module-level ``df`` and ``model``, and renders ``result.html``
    with a human-readable summary.

    Returns:
        The rendered ``result.html`` page. When ``rounds`` is not an integer
        the page carries an error message and the response status is 400.
    """
    # Surrounding whitespace typed into the form is dropped before the names
    # are handed to predict().
    blue_fighter = request.form['blue_fighter'].strip()
    red_fighter = request.form['red_fighter'].strip()
    weightclass = request.form['weightclass'].strip()

    # A non-numeric value would otherwise surface as an unhandled ValueError
    # (HTTP 500); report it to the user instead.
    try:
        rounds = int(request.form['rounds'])
    except ValueError:
        return render_template('result.html', result="The number of rounds must be a whole number."), 400

    # The form's <select> posts the literal strings 'True' / 'False'.
    title_bout = request.form['title_bout'] == 'True'

    prediction_proba = predict(df, model, blue_fighter, red_fighter, weightclass, rounds, title_bout)

    # Format the result for display in the browser.
    if prediction_proba is None:
        # predict() gave nothing back; say so rather than showing a blank page.
        result = "No prediction could be computed for this matchup."
    else:
        # Column 0 is reported as the blue corner, column 1 as the red corner.
        blue_pct = round(prediction_proba[0][0] * 100, 2)
        red_pct = round(prediction_proba[0][1] * 100, 2)
        result = f"The predicted probability of {blue_fighter} winning is {blue_pct}% and the predicted probability of {red_fighter} winning is {red_pct}%"
    return render_template('result.html', result=result)
if __name__ == '__main__':
    import os

    # Debug mode enables the interactive debugger, which lets anyone who can
    # reach the server execute arbitrary code. Keep it off unless it is
    # explicitly requested through the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').strip().lower() in ('1', 'true')
    app.run(debug=debug_enabled)

@ -0,0 +1,32 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>UFC Fight Prediction</title>
</head>
<body>
    <h1>UFC Fight Prediction</h1>
    <!-- Posts to /predict. The "name" attributes below are the form keys the
         server reads, so they must stay in sync with the view function.
         Every field is marked required so an empty form cannot be submitted. -->
    <form action="/predict" method="post">
        <label for="blue_fighter">Blue Fighter:</label>
        <input type="text" id="blue_fighter" name="blue_fighter" required><br><br>
        <label for="red_fighter">Red Fighter:</label>
        <input type="text" id="red_fighter" name="red_fighter" required><br><br>
        <label for="weightclass">Weight Class:</label>
        <input type="text" id="weightclass" name="weightclass" required><br><br>
        <label for="rounds">Number of Rounds:</label>
        <input type="number" id="rounds" name="rounds" min="1" max="5" value="3" required><br><br>
        <label for="title_bout">Title Bout:</label>
        <!-- The option values are the literal strings the server compares against. -->
        <select id="title_bout" name="title_bout">
            <option value="True">Yes</option>
            <option value="False" selected>No</option>
        </select><br><br>
        <input type="submit" value="Predict">
    </form>
</body>
</html>

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
<!-- Result page: shows the text produced by the /predict view. -->
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Prediction Result</title>
</head>
<body>
<h2>Prediction Result</h2>
<!-- "result" is the template variable passed in by the view; it may be an empty string. -->
<p>{{ result }}</p>
<!-- Link back to the form at the site root. -->
<p><a href="/">Make Another Prediction</a></p>
</body>
</html>

@ -37,9 +37,6 @@ def displayNumberOfNaNValues(df):
print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0])) print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
print("Total NaN in dataframe :" , df.isna().sum().sum()) print("Total NaN in dataframe :" , df.isna().sum().sum())
df = pd.read_csv('archive/data.csv')
# Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.). # Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.).
# It was from this date onward that the UFC started to implement a set of rules known as # It was from this date onward that the UFC started to implement a set of rules known as
#"Unified Rules of Mixed Martial Arts". #"Unified Rules of Mixed Martial Arts".
@ -48,63 +45,63 @@ df = pd.read_csv('archive/data.csv')
# Using this old data would not be representative of current fights, especially since this # Using this old data would not be representative of current fights, especially since this
#sport has become one of the most regulated due to its mixity and complexity. #sport has become one of the most regulated due to its mixity and complexity.
limit_date = '2001-04-01' #limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)] #df = df[(df['date'] > limit_date)]
# Display NaN values # Display NaN values
displayNumberOfNaNValues(df) #displayNumberOfNaNValues(df)
# Define the list of important features to impute # Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] #imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
# Initialize a SimpleImputer to impute missing values with median # Initialize a SimpleImputer to impute missing values with median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median') #imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values # Iterate over each feature to impute missing values
for feature in imp_features: #for feature in imp_features:
# Fit and transform the feature using median imputation # Fit and transform the feature using median imputation
imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) #imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
# Assign the imputed values back to the DataFrame # Assign the imputed values back to the DataFrame
df[feature] = imp_feature #df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy # Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') #imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) #imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
#
# Impute missing values for 'B_Stance' using most frequent strategy ## Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') #imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) #imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
#
# Create DataFrames for imputed stances ## Create DataFrames for imputed stances
df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance']) #df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance']) #df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
#
# drop B_avg_BODY_att values in the dataframe ## drop B_avg_BODY_att values in the dataframe
# List of features with NaN values to drop # # List of features with NaN values to drop
#na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] # #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
#
# Drop rows with NaN values in specified features # # Drop rows with NaN values in specified features
#df.dropna(subset=na_features, inplace=True) # #df.dropna(subset=na_features, inplace=True)
#
# Drop columns 'Referee' and 'location' from the DataFrame ## Drop columns 'Referee' and 'location' from the DataFrame
# The referee and the location have little impact on the outcome of a fight, so these columns are not worth keeping ## The referee and the location have little impact on the outcome of a fight, so these columns are not worth keeping
df.drop(['Referee', 'location'], axis=1, inplace=True) #df.drop(['Referee', 'location'], axis=1, inplace=True)
#
# Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight ## Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) #df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw'] #df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight'] #df = df[df['weight_class'] != 'Catch Weight']
#
# Remove column when data type is not float or int ## Remove column when data type is not float or int
dfWithoutString = df.select_dtypes(include=['float64', 'int64']) #dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
#
plt.figure(figsize=(50, 40)) #plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs() #corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True) #sns.heatmap(corr_matrix, annot=True)
#
# Show the correlation matrix of the dataframe ## Show the correlation matrix of the dataframe
# Very laggy feature ## Very laggy feature
#
# plt.show() # plt.show()
# i = index of the fighter's fight, 0 means the last fight, -1 means first fight # i = index of the fighter's fight, 0 means the last fight, -1 means first fight
@ -118,7 +115,7 @@ def select_fight_row(df, name, i):
return arr return arr
# we get the last fight of Khabib :'( # we get the last fight of Khabib :'(
print(select_fight_row(df, 'Khabib Nurmagomedov', 0)) #print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
# get all active UFC fighters (according to the limit_date parameter) # get all active UFC fighters (according to the limit_date parameter)
@ -137,7 +134,7 @@ def list_fighters(df, limit_date):
return fighters return fighters
# Last year when data fight was not full and correct # Last year when data fight was not full and correct
fighters = list_fighters(df,'2015-01-01') #fighters = list_fighters(df,'2015-01-01')
def build_df(df, fighters, i): def build_df(df, fighters, i):
arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None] arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@ -171,43 +168,43 @@ def build_df_all_but_last(df, fighters):
return df_fights return df_fights
#
df_train = build_df_all_but_last(df, fighters) #df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters,0) #df_test = build_df(df, fighters,0)
#
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough') #preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
#
# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner) ## If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
label_encoder = LabelEncoder() #label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner']) #y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner']) #y_test = label_encoder.transform(df_test['Winner'])
#
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1) #X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
#
# Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together ## Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together
random_forest = RandomForestClassifier(n_estimators=100, #random_forest = RandomForestClassifier(n_estimators=100,
criterion='entropy', # criterion='entropy',
max_depth=10, # max_depth=10,
min_samples_split=2, # min_samples_split=2,
min_samples_leaf=1, # min_samples_leaf=1,
random_state=0) # random_state=0)
#
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)]) #model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train) #model.fit(X_train, y_train)
#
# We use cross-validation with 5-folds to have a more precise accuracy (reduce variation) ## We use cross-validation with 5-folds to have a more precise accuracy (reduce variation)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5) #accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean()) #print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std()) #print('Accuracy standard deviation : ', accuracies.std())
#
y_pred = model.predict(X_test) #y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n') #print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
#
target_names = ["Blue","Red"] #target_names = ["Blue","Red"]
print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names)) #print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
#
# cm = confusion_matrix(y_test, y_pred) ## cm = confusion_matrix(y_test, y_pred)
# ax = plt.subplot() ## ax = plt.subplot()
# sns.heatmap(cm, annot = True, ax = ax, fmt = "d") # sns.heatmap(cm, annot = True, ax = ax, fmt = "d")
# ax.set_xlabel('Actual') # ax.set_xlabel('Actual')
# ax.set_ylabel('Predicted') # ax.set_ylabel('Predicted')
@ -216,12 +213,12 @@ print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_na
# ax.yaxis.set_ticklabels(['Blue', 'Red']) # ax.yaxis.set_ticklabels(['Blue', 'Red'])
# plt.show() # plt.show()
feature_names = [col for col in X_train] #feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_ #feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1] #indices = np.argsort(feature_importances)[::-1]
n = 30 # maximum feature importances displayed #n = 30 # maximum feature importances displayed
idx = indices[0:n] #idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0) #std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n): #for f in range(n):
# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]])) # print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
@ -234,7 +231,7 @@ std = np.std([tree.feature_importances_ for tree in model['random_forest'].estim
# plt.show() # plt.show()
# Sélectionnez un arbre de votre modèle # Sélectionnez un arbre de votre modèle
tree_estimator = model['random_forest'].estimators_[10] #tree_estimator = model['random_forest'].estimators_[10]
# Tracez l'arbre # Tracez l'arbre
# plt.figure(figsize=(1, 1)) # plt.figure(figsize=(1, 1))
@ -284,11 +281,11 @@ def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_
return return
predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) #predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) #predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) #predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True) #predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#
predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True) #predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True) #predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True) #predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)

Loading…
Cancel
Save