luevard 1 year ago
commit cad4dc579f

@ -0,0 +1,131 @@
from flask import Flask, render_template, request
import pandas as pd
from test import *  # Make sure the imported module (test.py) provides your predict function

app = Flask(__name__)

# Load the DataFrame once to save resources
df = pd.read_csv('archive/data.csv')  # Make sure this path points to your data file
limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)]
displayNumberOfNaNValues(df)
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features:
    # Fit and transform the feature using median imputation
    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1, 1))
    # Assign the imputed values back to the DataFrame
    df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
# Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
# Assign the imputed stances back to the DataFrame
# (ravel + direct assignment: wrapping the arrays in a fresh DataFrame would carry a
# new RangeIndex that no longer aligns with the date-filtered df and would inject NaNs)
df['R_Stance'] = imp_R_stance.ravel()
df['B_Stance'] = imp_B_stance.ravel()
df.drop(['Referee', 'location'], axis=1, inplace=True)
# Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']
# Keep only columns whose data type is float or int
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
fighters = list_fighters(df, '2015-01-01')
df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters, 0)
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
# Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
random_forest = RandomForestClassifier(n_estimators=100,
                                       criterion='entropy',
                                       max_depth=10,
                                       min_samples_split=2,
                                       min_samples_leaf=1,
                                       random_state=0)
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)
# We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean:', accuracies.mean())
print('Accuracy standard deviation:', accuracies.std())
y_pred = model.predict(X_test)
print('Testing accuracy:', accuracy_score(y_test, y_pred), '\n')
target_names = ["Blue", "Red"]
print(classification_report(y_test, y_pred, labels=[0, 1], target_names=target_names))
feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30 # maximum feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
# plt.figure(figsize=(30, 8))
# plt.title("Feature importances")
# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45)
# plt.xlim([-1, n])
# plt.show()
# Select one tree from the model
tree_estimator = model['random_forest'].estimators_[10]
@app.route('/')
def index():
    return render_template('index.html')
@app.route('/predict', methods=['POST'])
def make_prediction():
    blue_fighter = request.form['blue_fighter']
    red_fighter = request.form['red_fighter']
    weightclass = request.form['weightclass']
    rounds = int(request.form['rounds'])
    title_bout = request.form['title_bout'] == 'True'
    prediction_proba = predict(df, model, blue_fighter, red_fighter, weightclass, rounds, title_bout)
    # Format the result for display in the browser
    result = ""
    if prediction_proba is not None:
        result = (f"The predicted probability of {blue_fighter} winning is "
                  f"{round(prediction_proba[0][0] * 100, 2)}% and the predicted probability of "
                  f"{red_fighter} winning is {round(prediction_proba[0][1] * 100, 2)}%")
    return render_template('result.html', result=result)
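
# Sanity check (sketch): LabelEncoder assigns classes alphabetically, so 'Blue' -> 0
# and 'Red' -> 1, and predict_proba columns follow model.classes_. That is why
# prediction_proba[0][0] above is read as the blue corner's win probability.
proba_check = model.predict_proba(X_test.iloc[[0]])  # shape (1, 2): [[P(Blue), P(Red)]]
print(dict(zip(label_encoder.classes_, proba_check[0])))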
if __name__ == '__main__':
    app.run(debug=True)

@ -0,0 +1,32 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>UFC Fight Prediction</title>
</head>
<body>
    <h1>UFC Fight Prediction</h1>
    <form action="/predict" method="post">
        <label for="blue_fighter">Blue Fighter:</label>
        <input type="text" id="blue_fighter" name="blue_fighter"><br><br>
        <label for="red_fighter">Red Fighter:</label>
        <input type="text" id="red_fighter" name="red_fighter"><br><br>
        <label for="weightclass">Weight Class:</label>
        <input type="text" id="weightclass" name="weightclass"><br><br>
        <label for="rounds">Number of Rounds:</label>
        <input type="number" id="rounds" name="rounds" min="1" max="5" value="3"><br><br>
        <label for="title_bout">Title Bout:</label>
        <select id="title_bout" name="title_bout">
            <option value="True">Yes</option>
            <option value="False" selected>No</option>
        </select><br><br>
        <input type="submit" value="Predict">
    </form>
</body>
</html>

@ -0,0 +1,13 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Prediction Result</title>
</head>
<body>
    <h2>Prediction Result</h2>
    <p>{{ result }}</p>
    <p><a href="/">Make Another Prediction</a></p>
</body>
</html>
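
To smoke-test the form end to end without a browser, something like the following should work (a sketch: it assumes the dev server from app.py is running on Flask's default http://127.0.0.1:5000, and reuses fighter names from the predict examples further down):

import requests

# Post the same fields that the index.html form submits
resp = requests.post('http://127.0.0.1:5000/predict', data={
    'blue_fighter': 'Conor McGregor',
    'red_fighter': 'Khabib Nurmagomedov',
    'weightclass': 'Lightweight',
    'rounds': '5',
    'title_bout': 'True',
})
print(resp.status_code)  # 200 on success
print(resp.text)         # the rendered result.html containing the prediction sentence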

@ -37,9 +37,6 @@ def displayNumberOfNaNValues(df):
print('Number of features with NaN values:', len([x[1] for x in na if x[1] > 0]))
print("Total NaN in dataframe :" , df.isna().sum().sum())
df = pd.read_csv('archive/data.csv')
# Before April 2001, there were almost no rules in the UFC (no judges, no time limits, no rounds, etc.).
# It was on this precise date that the UFC started to enforce the set of rules known as
# the "Unified Rules of Mixed Martial Arts".
@ -48,63 +45,63 @@ df = pd.read_csv('archive/data.csv')
# Using this old data would not be representative of current fights, especially since the
# sport has become one of the most regulated due to its mixed and complex nature.
limit_date = '2001-04-01'
df = df[(df['date'] > limit_date)]
#limit_date = '2001-04-01'
#df = df[(df['date'] > limit_date)]
# Display NaN values
displayNumberOfNaNValues(df)
#displayNumberOfNaNValues(df)
# Define the list of important features to impute
imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
#imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
# Initialize a SimpleImputer to impute missing values with median
imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
#imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
# Iterate over each feature to impute missing values
for feature in imp_features:
#for feature in imp_features:
# Fit and transform the feature using median imputation
imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
#imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
# Assign the imputed values back to the DataFrame
df[feature] = imp_feature
#df[feature] = imp_feature
# Impute missing values for 'R_Stance' using most frequent strategy
imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
# Impute missing values for 'B_Stance' using most frequent strategy
imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
# Create DataFrames for imputed stances
df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
# drop B_avg_BODY_att values in the dataframe
# List of features with NaN values to drop
#na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
# Drop rows with NaN values in specified features
#df.dropna(subset=na_features, inplace=True)
# Drop columns 'Referee' and 'location' from the DataFrame
# Referee and location have little impact on fight outcomes, so they are not worth keeping
df.drop(['Referee', 'location'], axis=1, inplace=True)
# Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
df = df[df['Winner'] != 'Draw']
df = df[df['weight_class'] != 'Catch Weight']
# Keep only columns whose data type is float or int
dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
plt.figure(figsize=(50, 40))
corr_matrix = dfWithoutString.corr(method='pearson').abs()
sns.heatmap(corr_matrix, annot=True)
# Show the correlation matrix of the dataframe
# (rendering this heatmap is very slow)
#imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
#
## Impute missing values for 'B_Stance' using most frequent strategy
#imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
#imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
#
## Create DataFrames for imputed stances
#df['R_Stance'] = pd.DataFrame(imp_R_stance, columns=['R_Stance'])
#df['B_Stance'] = pd.DataFrame(imp_B_stance, columns=['B_Stance'])
#
## drop B_avg_BODY_att values in the dataframe
# # List of features with NaN values to drop
# #na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
#
# # Drop rows with NaN values in specified features
# #df.dropna(subset=na_features, inplace=True)
#
## Drop columns 'Referee' and 'location' from the DataFrame
## Referee and location have little impact on fight outcomes, so they are not worth keeping
#df.drop(['Referee', 'location'], axis=1, inplace=True)
#
## Drop the 'B_draw' and 'R_draw' columns, and remove 'Draw' and 'Catch Weight' fights
#df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
#df = df[df['Winner'] != 'Draw']
#df = df[df['weight_class'] != 'Catch Weight']
#
## Keep only columns whose data type is float or int
#dfWithoutString = df.select_dtypes(include=['float64', 'int64'])
#
#plt.figure(figsize=(50, 40))
#corr_matrix = dfWithoutString.corr(method='pearson').abs()
#sns.heatmap(corr_matrix, annot=True)
#
## Show the correlation matrix of the dataframe
## (rendering this heatmap is very slow)
#
# plt.show()
# i = index of the fighter's fight, 0 means the last fight, -1 means first fight
@ -118,7 +115,7 @@ def select_fight_row(df, name, i):
return arr
# we get the last fight of Khabib :'(
print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
#print(select_fight_row(df, 'Khabib Nurmagomedov', 0))
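
# For reference, a hypothetical sketch of select_fight_row, inferred from its signature,
# the comment above, and the `is not None` checks in build_df below. The column names
# R_fighter/B_fighter and the newest-first ordering of data.csv are assumptions; the
# real implementation in test.py may differ, so the sketch is left commented out.
#def select_fight_row(df, name, i):
#    rows = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
#    if len(rows) == 0 or i >= len(rows):
#        return None  # fighter unknown, or no i-th fight
#    return rows.iloc[i].values  # i = 0 is the last fight, i = -1 the first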
# get all active UFC fighters (according to the limit_date parameter)
@ -137,7 +134,7 @@ def list_fighters(df, limit_date):
return fighters
# Fight data before 2015 was incomplete and unreliable, so 2015-01-01 is used as the cutoff
fighters = list_fighters(df,'2015-01-01')
#fighters = list_fighters(df,'2015-01-01')
def build_df(df, fighters, i):
arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None]
@ -171,43 +168,43 @@ def build_df_all_but_last(df, fighters):
return df_fights
df_train = build_df_all_but_last(df, fighters)
df_test = build_df(df, fighters,0)
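
# Sketch of the split these two lines produce (hypothetical body; the real definition
# is above): build_df_all_but_last keeps every fight except each fighter's most recent
# one, while build_df(df, fighters, 0) collects exactly those held-out last fights, so
# the test set is each active fighter's latest bout. Left commented out as a sketch.
#def build_df_all_but_last(df, fighters):
#    arr = []
#    for name in fighters:
#        i = 1  # skip i = 0, the last fight, reserved for the test set
#        while (row := select_fight_row(df, name, i)) is not None:
#            arr.append(row)
#            i += 1
#    return pd.DataFrame(arr, columns=df.columns)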
preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['Winner'])
y_test = label_encoder.transform(df_test['Winner'])
X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
# Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
random_forest = RandomForestClassifier(n_estimators=100,
criterion='entropy',
max_depth=10,
min_samples_split=2,
min_samples_leaf=1,
random_state=0)
model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
model.fit(X_train, y_train)
# We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
print('Accuracy mean : ', accuracies.mean())
print('Accuracy standard deviation : ', accuracies.std())
y_pred = model.predict(X_test)
print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
target_names = ["Blue","Red"]
print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
# cm = confusion_matrix(y_test, y_pred)
# ax = plt.subplot()
#
#df_train = build_df_all_but_last(df, fighters)
#df_test = build_df(df, fighters,0)
#
#preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
#
## If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
#label_encoder = LabelEncoder()
#y_train = label_encoder.fit_transform(df_train['Winner'])
#y_test = label_encoder.transform(df_test['Winner'])
#
#X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
#
## Random forest of 100 decision trees; hyperparameters were tuned with grid search paired with cross-validation
#random_forest = RandomForestClassifier(n_estimators=100,
# criterion='entropy',
# max_depth=10,
# min_samples_split=2,
# min_samples_leaf=1,
# random_state=0)
#
#model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
#model.fit(X_train, y_train)
#
## We use 5-fold cross-validation for a more reliable accuracy estimate (lower variance)
#accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
#print('Accuracy mean : ', accuracies.mean())
#print('Accuracy standard deviation : ', accuracies.std())
#
#y_pred = model.predict(X_test)
#print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
#
#target_names = ["Blue","Red"]
#print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
#
## cm = confusion_matrix(y_test, y_pred)
## ax = plt.subplot()
# sns.heatmap(cm, annot = True, ax = ax, fmt = "d")
# ax.set_xlabel('Actual')
# ax.set_ylabel('Predicted')
@ -216,12 +213,12 @@ print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_na
# ax.yaxis.set_ticklabels(['Blue', 'Red'])
# plt.show()
feature_names = [col for col in X_train]
feature_importances = model['random_forest'].feature_importances_
indices = np.argsort(feature_importances)[::-1]
n = 30 # maximum feature importances displayed
idx = indices[0:n]
std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#feature_names = [col for col in X_train]
#feature_importances = model['random_forest'].feature_importances_
#indices = np.argsort(feature_importances)[::-1]
#n = 30 # maximum feature importances displayed
#idx = indices[0:n]
#std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
#for f in range(n):
# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]]))
@ -234,7 +231,7 @@ std = np.std([tree.feature_importances_ for tree in model['random_forest'].estim
# plt.show()
# Select one tree from the model
tree_estimator = model['random_forest'].estimators_[10]
#tree_estimator = model['random_forest'].estimators_[10]
# Plot the tree
# plt.figure(figsize=(1, 1))
@ -284,11 +281,11 @@ def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_
return
predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
#predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True)
#predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
#predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)
#predict(df, model, 'Conor McGregor', 'Tai Tuivasa', 'Heavyweight', 5, True)
#
#predict(df,model,'Charles Oliveira','Conor McGregor','Lightweight',5,True)
#predict(df,model,'Charles Oliveira','Khabib Nurmagomedov','Lightweight',5,True)
#predict(df, model, 'Leon Edwards', 'Kamaru Usman', 'Welterweight', 5, True)
