diff --git a/server.py b/server.py new file mode 100644 index 0000000..ea3440d --- /dev/null +++ b/server.py @@ -0,0 +1,131 @@ +from flask import Flask, render_template, request +import pandas as pd +from test import * # Assurez-vous d'avoir un fichier predict.py avec votre fonction predict + +app = Flask(__name__) + +# Charger le DataFrame une seule fois pour économiser des ressources +df = pd.read_csv('archive/data.csv') # Assurez-vous de spécifier le bon chemin vers votre fichier de données + +limit_date = '2001-04-01' +df = df[(df['date'] > limit_date)] + +displayNumberOfNaNValues(df) + +imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] +imp_median = SimpleImputer(missing_values=np.nan, strategy='median') + +# Iterate over each feature to impute missing values +for feature in imp_features: + # Fit and transform the feature using median imputation + imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) + # Assign the imputed values back to the DataFrame + df[feature] = imp_feature + +# Impute missing values for 'R_Stance' using most frequent strategy +imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) + +# Impute missing values for 'B_Stance' using most frequent strategy +imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) + +# Create DataFrames for imputed stances +df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance']) +df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance']) + + +df.drop(['Referee', 'location'], axis=1, inplace=True) + +# Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight +df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) +df = df[df['Winner'] != 'Draw'] +df = df[df['weight_class'] != 'Catch Weight'] + +# Remove column when data type is not float or int +dfWithoutString = df.select_dtypes(include=['float64', 'int64']) + +plt.figure(figsize=(50, 40)) +corr_matrix = dfWithoutString.corr(method='pearson').abs() +sns.heatmap(corr_matrix, annot=True) + +fighters = list_fighters(df,'2015-01-01') + +df_train = build_df_all_but_last(df, fighters) +df_test = build_df(df, fighters,0) + +preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough') + + +label_encoder = LabelEncoder() +y_train = label_encoder.fit_transform(df_train['Winner']) +y_test = label_encoder.transform(df_test['Winner']) + +X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1) + +# Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together +random_forest = RandomForestClassifier(n_estimators=100, + criterion='entropy', + max_depth=10, + min_samples_split=2, + min_samples_leaf=1, + random_state=0) + +model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)]) +model.fit(X_train, y_train) + +# We use cross-validation with 5-folds to have a more precise accuracy (reduce variation) +accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5) +print('Accuracy mean : ', accuracies.mean()) +print('Accuracy standard deviation : ', accuracies.std()) + +y_pred = model.predict(X_test) +print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n') + +target_names = ["Blue","Red"] +print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names)) + +feature_names = [col for col in X_train] +feature_importances = model['random_forest'].feature_importances_ +indices = np.argsort(feature_importances)[::-1] +n = 30 # maximum feature importances displayed +idx = indices[0:n] +std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0) + +#for f in range(n): +# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]])) + +# plt.figure(figsize=(30, 8)) +# plt.title("Feature importances") +# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center") +# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45) +# plt.xlim([-1, n]) +# plt.show() + +# Sélectionnez un arbre de votre modèle +tree_estimator = model['random_forest'].estimators_[10] + + +@app.route('/') +def index(): + return render_template('index.html') + +@app.route('/predict', methods=['POST']) +def make_prediction(): + blue_fighter = request.form['blue_fighter'] + red_fighter = request.form['red_fighter'] + weightclass = request.form['weightclass'] + rounds = int(request.form['rounds']) + title_bout = True if request.form['title_bout'] == 'True' else False + + prediction_proba = predict(df, model, blue_fighter, red_fighter, weightclass, rounds, title_bout) + + # Formatage du résultat pour l'afficher dans le navigateur + result = "" + if prediction_proba is not None: + result = f"The predicted probability of {blue_fighter} winning is {round(prediction_proba[0][0] * 100, 2)}% and the predicted probability of {red_fighter} winning is {round(prediction_proba[0][1] * 100, 2)}%" + + return render_template('result.html', result=result) + +if __name__ == '__main__': + app.run(debug=True) diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..74e6714 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,32 @@ + + + + + + UFC Fight Prediction + + +

UFC Fight Prediction

+
+ +

+ + +

+ + +

+ + +

+ + +

+ + +
+ + diff --git a/templates/result.html b/templates/result.html new file mode 100644 index 0000000..ca7eba4 --- /dev/null +++ b/templates/result.html @@ -0,0 +1,13 @@ + + + + + + Prediction Result + + +

Prediction Result

+

{{ result }}

+

Make Another Prediction

+ + diff --git a/test.py b/test.py index 0e4e554..76acc5b 100644 --- a/test.py +++ b/test.py @@ -37,9 +37,6 @@ def displayNumberOfNaNValues(df): print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0])) print("Total NaN in dataframe :" , df.isna().sum().sum()) - -df = pd.read_csv('archive/data.csv') - # Before April 2001, there were almost no rules in UFC (no judges, no time limits, no rounds, etc.). #It's up to this precise date that UFC started to implement a set of rules known as #"Unified Rules of Mixed Martial Arts". @@ -48,63 +45,63 @@ df = pd.read_csv('archive/data.csv') # Using this old data would not be representative of current fights, especially since this #sport has become one of the most regulated due to its mixity and complexity. -limit_date = '2001-04-01' -df = df[(df['date'] > limit_date)] + #limit_date = '2001-04-01' + #df = df[(df['date'] > limit_date)] # Display NaN values -displayNumberOfNaNValues(df) + #displayNumberOfNaNValues(df) # Define the list of important features to impute -imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] +#imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] # Initialize a SimpleImputer to impute missing values with median -imp_median = SimpleImputer(missing_values=np.nan, strategy='median') +#imp_median = SimpleImputer(missing_values=np.nan, strategy='median') # Iterate over each feature to impute missing values -for feature in imp_features: +#for feature in imp_features: # Fit and transform the feature using median imputation - imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) + #imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) # Assign the imputed values back to the DataFrame - df[feature] = imp_feature + #df[feature] = imp_feature # Impute missing values for 'R_Stance' using most frequent strategy -imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') -imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) - -# Impute missing values for 'B_Stance' using most frequent strategy -imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') -imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) - -# Create DataFrames for imputed stances -df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance']) -df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance']) - -# drop B_avg_BODY_att values in the dataframe - # List of features with NaN values to drop - #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] - - # Drop rows with NaN values in specified features - #df.dropna(subset=na_features, inplace=True) - -# Drop columns 'Referee' and 'location' from the DataFrame -# The value of references and location has a low impact in battles, which makes it irrelevant to keep -df.drop(['Referee', 'location'], axis=1, inplace=True) - -# Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight -df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) -df = df[df['Winner'] != 'Draw'] -df = df[df['weight_class'] != 'Catch Weight'] - -# Remove column when data type is not float or int -dfWithoutString = df.select_dtypes(include=['float64', 'int64']) - -plt.figure(figsize=(50, 40)) -corr_matrix = dfWithoutString.corr(method='pearson').abs() -sns.heatmap(corr_matrix, annot=True) - -# Show the correlation matrix of the dataframe -# Very laggy feature - +#imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +#imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) +# +## Impute missing values for 'B_Stance' using most frequent strategy +#imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +#imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) +# +## Create DataFrames for imputed stances +#df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance']) +#df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance']) +# +## drop B_avg_BODY_att values in the dataframe +# # List of features with NaN values to drop +# #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] +# +# # Drop rows with NaN values in specified features +# #df.dropna(subset=na_features, inplace=True) +# +## Drop columns 'Referee' and 'location' from the DataFrame +## The value of references and location has a low impact in battles, which makes it irrelevant to keep +#df.drop(['Referee', 'location'], axis=1, inplace=True) +# +## Drop column 'B_draw' and 'R_draw' and 'Draw' fight and 'Catch Weight' fight +#df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) +#df = df[df['Winner'] != 'Draw'] +#df = df[df['weight_class'] != 'Catch Weight'] +# +## Remove column when data type is not float or int +#dfWithoutString = df.select_dtypes(include=['float64', 'int64']) +# +#plt.figure(figsize=(50, 40)) +#corr_matrix = dfWithoutString.corr(method='pearson').abs() +#sns.heatmap(corr_matrix, annot=True) +# +## Show the correlation matrix of the dataframe +## Very laggy feature +# # plt.show() # i = index of the fighter's fight, 0 means the last fight, -1 means first fight @@ -118,7 +115,7 @@ def select_fight_row(df, name, i): return arr # we get the last fight of Khabib :'( -print(select_fight_row(df, 'Khabib Nurmagomedov', 0)) +#print(select_fight_row(df, 'Khabib Nurmagomedov', 0)) # get all active UFC fighters (according to the limit_date parameter) @@ -137,7 +134,7 @@ def list_fighters(df, limit_date): return fighters # Last year when data fight was not full and correct -fighters = list_fighters(df,'2015-01-01') +#fighters = list_fighters(df,'2015-01-01') def build_df(df, fighters, i): arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None] @@ -171,43 +168,43 @@ def build_df_all_but_last(df, fighters): return df_fights - -df_train = build_df_all_but_last(df, fighters) -df_test = build_df(df, fighters,0) - -preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough') - -# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner) -label_encoder = LabelEncoder() -y_train = label_encoder.fit_transform(df_train['Winner']) -y_test = label_encoder.transform(df_test['Winner']) - -X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1) - -# Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together -random_forest = RandomForestClassifier(n_estimators=100, - criterion='entropy', - max_depth=10, - min_samples_split=2, - min_samples_leaf=1, - random_state=0) - -model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)]) -model.fit(X_train, y_train) - -# We use cross-validation with 5-folds to have a more precise accuracy (reduce variation) -accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5) -print('Accuracy mean : ', accuracies.mean()) -print('Accuracy standard deviation : ', accuracies.std()) - -y_pred = model.predict(X_test) -print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n') - -target_names = ["Blue","Red"] -print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names)) - -# cm = confusion_matrix(y_test, y_pred) -# ax = plt.subplot() +# +#df_train = build_df_all_but_last(df, fighters) +#df_test = build_df(df, fighters,0) +# +#preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough') +# +## If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner) +#label_encoder = LabelEncoder() +#y_train = label_encoder.fit_transform(df_train['Winner']) +#y_test = label_encoder.transform(df_test['Winner']) +# +#X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1) +# +## Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together +#random_forest = RandomForestClassifier(n_estimators=100, +# criterion='entropy', +# max_depth=10, +# min_samples_split=2, +# min_samples_leaf=1, +# random_state=0) +# +#model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)]) +#model.fit(X_train, y_train) +# +## We use cross-validation with 5-folds to have a more precise accuracy (reduce variation) +#accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5) +#print('Accuracy mean : ', accuracies.mean()) +#print('Accuracy standard deviation : ', accuracies.std()) +# +#y_pred = model.predict(X_test) +#print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n') +# +#target_names = ["Blue","Red"] +#print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names)) +# +## cm = confusion_matrix(y_test, y_pred) +## ax = plt.subplot() # sns.heatmap(cm, annot = True, ax = ax, fmt = "d") # ax.set_xlabel('Actual') # ax.set_ylabel('Predicted') @@ -216,12 +213,12 @@ print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_na # ax.yaxis.set_ticklabels(['Blue', 'Red']) # plt.show() -feature_names = [col for col in X_train] -feature_importances = model['random_forest'].feature_importances_ -indices = np.argsort(feature_importances)[::-1] -n = 30 # maximum feature importances displayed -idx = indices[0:n] -std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0) +#feature_names = [col for col in X_train] +#feature_importances = model['random_forest'].feature_importances_ +#indices = np.argsort(feature_importances)[::-1] +#n = 30 # maximum feature importances displayed +#idx = indices[0:n] +#std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0) #for f in range(n): # print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]])) @@ -234,7 +231,7 @@ std = np.std([tree.feature_importances_ for tree in model['random_forest'].estim # plt.show() # Sélectionnez un arbre de votre modèle -tree_estimator = model['random_forest'].estimators_[10] +#tree_estimator = model['random_forest'].estimators_[10] # Tracez l'arbre # plt.figure(figsize=(1, 1)) @@ -284,11 +281,11 @@ def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_ return -predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) -predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) -predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) -predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True) - -predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True) -predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True) -predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True) +#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) +#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) +#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) +#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True) +# +#predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True) +#predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True) +#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)