✨ Add test and random tree

1 year ago · ed2d749ed6
parent 91aabc249e
commit ed2d749ed6
2 changed files with 261 additions and 6 deletions
--- a/mmix.py
+++ b/mmix.py
@ -2,7 +2,7 @@ import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.tree import DecisionTreeClassifier
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, confusion_matrix
 from sklearn import metrics
 from sklearn.model_selection import train_test_split

@ -28,11 +28,11 @@ print(Xtrain.shape)
 print(Xtest.shape)

 Arbre_decision = DecisionTreeClassifier(random_state=0, max_depth=20)
-clf = Arbre_decision.fit(Xi, Yi)
+# NOTE(review): the features passed to fit() are now Xtrain, but the labels are
+# still Yi (the vector that was paired with Xi before this change). Confirm both
+# have the same number of rows; presumably the training-split labels were meant.
+clf = Arbre_decision.fit(Xtrain, Yi)

-ypredit = clf.predict(Xtest)
-accuracy_score(ytest, ypredit)
-matriceConfusion=metrics.confusion_matrix(ytest, ypredit)
+# Predict on Xtest, then score and tabulate the predictions against ytest.
+ypredict = clf.predict(Xtest)
+accuracy = accuracy_score(ytest, ypredict)
+matriceConfusion = confusion_matrix(ytest, ypredict)

 incorrect=matriceConfusion[0][1] + matriceConfusion[1][0]
 total = matriceConfusion.sum()
@ -40,4 +40,20 @@ total = matriceConfusion.sum()
 print("\nNumber of incorrect classifications: " + str(incorrect))
 print("Number of classifications total: " + str(total))

-print("Percent: "+ str((total-incorrect)/total*100))
+print("Percent: "+ str((total-incorrect)/total*100))
+
+# Average the `colonnes` columns over every row in which each fighter is listed
+# as R_fighter (rows where he is listed in the other corner are not selected).
+fighter_data = dataframe[dataframe['R_fighter'] == 'Adrian Yanez']
+average_fighter_data = fighter_data[colonnes].mean()
+
+fighter_data_2 = dataframe[dataframe['R_fighter'] == 'Gustavo Lopez']
+average_fighter_data_2 = fighter_data_2[colonnes].mean()
+
+# NOTE(review): stacking both averages yields twice as many values as `colonnes`
+# has entries — confirm this matches the number of features clf was fitted on.
+combined_features = pd.concat([average_fighter_data, average_fighter_data_2])
+
+# predict() expects a 2-D input, hence the single-element list.
+prediction = clf.predict([combined_features])
+# NOTE(review): label 1 is reported as the blue corner here — verify against the
+# way the target column was encoded.
+if prediction[0] == 1:
+    winner = "Blue Corner"
+else:
+    winner = "Red Corner"
+
+print(f"The predicted winner is: {winner}")
--- a/test.py
+++ b/test.py
@ -0,0 +1,239 @@
+import re
+import pandas as pd
+import numpy as np
+import seaborn as sns
+import matplotlib.pyplot as plt
+from sklearn.tree import export_graphviz
+from io import StringIO 
+from IPython.display import Image  
+from sklearn.tree import plot_tree
+import pydotplus
+from IPython.display import Image
+from sklearn.pipeline import Pipeline
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+from sklearn.metrics import classification_report
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import cross_val_score
+from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
+from sklearn.compose import make_column_transformer
+from sklearn.impute import SimpleImputer
+pd.options.display.max_columns = None
+pd.options.display.max_rows = None
+import sklearn
+print('The scikit-learn version is {}.'.format(sklearn.__version__))
+
+# Load the dataset from the local archive folder.
+df = pd.read_csv('archive/data.csv')
+
+b_age = df['B_age']  # move B_age to column position 76 so that it sits among the B features
+df.drop(['B_age'], axis = 1, inplace = True)
+df.insert(76, "B_age", b_age)
+
+df_fe = df.copy() # unfiltered copy kept for the feature engineering part later
+#print(df.head(5))
+
+# Keep only the rows whose date is greater than limit_date.
+# NOTE(review): read_csv is not asked to parse dates, so this is presumably a
+# string comparison that relies on ISO yyyy-mm-dd formatting — confirm.
+limit_date = '2001-04-01'
+df = df[(df['date'] > limit_date)]
+
+# print("Total NaN in dataframe :" , df.isna().sum().sum())
+# print("Total NaN in each column of the dataframe")
+# Collect one (column position, NaN count) pair per column of df.
+na = []
+for index, col in enumerate(df):
+    na.append((index, df[col].isna().sum())) 
+# Same pairs, ordered from the most to the fewest missing values.
+na_sorted = na.copy()
+na_sorted.sort(key = lambda x: x[1], reverse = True) 
+
+# for i in range(len(df.columns)):
+#     print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN")
+
+# Numeric columns whose missing values are replaced by the column median.
+imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms']
+imp_median = SimpleImputer(missing_values=np.nan, strategy='median')
+
+for feature in imp_features:
+    # The column is reshaped to (n_rows, 1) before being handed to the imputer,
+    # which is re-fitted on every iteration (one median per column).
+    imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1))
+    df[feature] = imp_feature
+
+# Fill the missing stances of each corner with that corner's most frequent stance.
+imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
+imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1))
+
+imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
+imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1))
+
+# The imputers return plain arrays. Re-attach df's own index so that the column
+# assignment below pairs values row by row: df has been row-filtered, and with a
+# default 0..n-1 index pandas would align on labels and could write NaN (or the
+# value of another row) wherever the labels do not match.
+df_R_stance_imputed = pd.DataFrame(imp_R_stance, columns=['R_Stance'], index=df.index)
+df_B_stance_imputed = pd.DataFrame(imp_B_stance, columns=['B_Stance'], index=df.index)
+
+# Assign the imputed values to the original DataFrame
+df['R_Stance'] = df_R_stance_imputed['R_Stance']
+df['B_Stance'] = df_B_stance_imputed['B_Stance']
+
+# NOTE: `na` was tallied before the imputation steps, so this count describes
+# the data as it was at that point, not the current content of df.
+print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0]))
+
+# Drop the rows that are missing a value in either of these two columns.
+na_features = ['B_avg_BODY_att', 'R_avg_BODY_att']
+df.dropna(subset = na_features, inplace = True)
+
+# Remove the Referee and location columns.
+df.drop(['Referee', 'location'], axis = 1, inplace = True)
+
+# print(df.shape)
+# print("Total NaN in dataframe :" , df.isna().sum().sum())
+
+# Remove the B_draw / R_draw columns, then the rows whose Winner is 'Draw' and
+# the rows whose weight class is 'Catch Weight'.
+df.drop(['B_draw', 'R_draw'], axis=1, inplace=True)
+df = df[df['Winner'] != 'Draw']
+df = df[df['weight_class'] != 'Catch Weight']
+
+# Keep only the numeric (float64 / int64) columns
+df_numeric = df.select_dtypes(include=['float64', 'int64'])
+
+# Draw the heatmap of the absolute Pearson correlation matrix
+plt.figure(figsize=(50, 40))
+corr_matrix = df_numeric.corr(method='pearson').abs()
+sns.heatmap(corr_matrix, annot=True)
+# plt.show()
+
+def select_fight_row(df, name, i):
+    """Return the values of the i-th row of `df` in which `name` fights.
+
+    Rows are those where `name` is either R_fighter or B_fighter, numbered in
+    the order they appear in `df`. Negative positions count from the end.
+    (The original note says 0 is the last fight and -1 the first one, which
+    presumably relies on `df` being sorted newest first — confirm.)
+
+    Args:
+        df: frame with 'R_fighter' and 'B_fighter' columns.
+        name: fighter name to look up.
+        i: position of the wanted fight among the fighter's rows.
+
+    Returns:
+        The row as an array of values, or None when the fighter has no row at
+        that position (including a fighter with no rows at all).
+    """
+    fights = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)]
+    n_fights = len(fights)
+    # Positional bounds check; also covers the empty case (n_fights == 0) and
+    # negative positions that reach before the first row.
+    if not -n_fights <= i < n_fights:
+        return None
+    # iloc is positional, so the filtered frame does not need re-indexing.
+    return fights.iloc[i, :].values
+    
+
+# print(select_fight_row(df, 'Amanda Nunes', 0))
+#  we get the last fight of Amanda Nunes
+
+
+def list_fighters(df, limit_date):
+    """Return the names of all fighters with a fight dated after `limit_date`.
+
+    A fighter is included when he appears as R_fighter or B_fighter in at
+    least one row whose 'date' is greater than `limit_date`. Each name is
+    listed once; the order of the returned list is unspecified.
+    """
+    recent = df.loc[df['date'] > limit_date]
+    red_names = set(recent['R_fighter'])
+    blue_names = set(recent['B_fighter'])
+    return list(red_names | blue_names)
+
+# Fighters with at least one fight dated after 2017-01-01; the count is printed as a sanity check.
+fighters = list_fighters(df, '2017-01-01')
+print(len(fighters))
+
+def build_df(df, fighters, i):
+    """Build a frame holding the i-th fight of every fighter in `fighters`.
+
+    Args:
+        df: fights frame; must provide the columns used by select_fight_row
+            plus 'title_bout' and 'date'.
+        fighters: iterable of fighter names.
+        i: fight position forwarded to select_fight_row.
+
+    Returns:
+        A frame with one row per distinct selected fight. 'title_bout' is
+        mapped from True/False to 1/0 and the 'R_fighter', 'B_fighter' and
+        'date' columns are removed. Fighters without an i-th fight are skipped.
+    """
+    # Look each fighter up once (the lookup filters the whole frame).
+    rows = []
+    for fighter in fighters:
+        row = select_fight_row(df, fighter, i)
+        if row is not None:
+            rows.append(row)
+    df_fights = pd.DataFrame(data=rows, columns=list(df.columns))
+    # The same fight is selected once per participant, hence the de-duplication.
+    df_fights.drop_duplicates(inplace=True)
+    df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0})
+    df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True)
+    return df_fights
+
+# Training rows come from fight position 0 of every fighter, test rows from position 1.
+df_train = build_df(df, fighters, 0)
+df_test = build_df(df, fighters, 1)
+
+# print(df_train.head(5))
+
+# Ordinal-encode the three categorical columns; every other column is passed through unchanged.
+# NOTE(review): OrdinalEncoder is left at its defaults — check how it treats
+# categories at predict time that were not seen during fit.
+preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough')
+
+# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner)
+# The encoder is fitted on the training labels only and reused for the test labels.
+label_encoder = LabelEncoder()
+y_train = label_encoder.fit_transform(df_train['Winner'])
+y_test = label_encoder.transform(df_test['Winner'])
+
+# Features are every remaining column except the target.
+X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1)
+
+# Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together
+random_forest = RandomForestClassifier(n_estimators=100, 
+                                       criterion='entropy', 
+                                       max_depth=10, 
+                                       min_samples_split=2,
+                                       min_samples_leaf=1, 
+                                       random_state=0)
+
+# Chain the column encoding and the forest so both are fitted and applied together.
+model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)])
+model.fit(X_train, y_train)
+
+# We use cross-validation with 5-folds to have a more precise accuracy (reduce variation)
+accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5)
+print('Accuracy mean : ', accuracies.mean())
+print('Accuracy standard deviation : ', accuracies.std())
+
+# Accuracy of the fitted pipeline on the test frame.
+y_pred = model.predict(X_test)
+print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n')
+
+# Names are listed in label order: 0 -> "Blue", 1 -> "Red".
+target_names = ["Blue","Red"]
+print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names))
+
+# cm = confusion_matrix(y_test, y_pred) 
+# ax = plt.subplot()
+# sns.heatmap(cm, annot = True, ax = ax, fmt = "d")
+# ax.set_xlabel('Actual')
+# ax.set_ylabel('Predicted')
+# ax.set_title("Confusion Matrix")
+# ax.xaxis.set_ticklabels(['Blue', 'Red'])
+# ax.yaxis.set_ticklabels(['Blue', 'Red'])
+# plt.show()
+
+# Rank the forest's feature importances from highest to lowest. The results are
+# only consumed by the commented-out print/plot code below.
+feature_names = [col for col in X_train]
+feature_importances = model['random_forest'].feature_importances_
+indices = np.argsort(feature_importances)[::-1]
+n = 30 # maximum feature importances displayed
+idx = indices[0:n] 
+# Per-feature standard deviation of the importances across the individual trees.
+std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0)
+
+#for f in range(n):
+#    print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]])) 
+
+# plt.figure(figsize=(30, 8))
+# plt.title("Feature importances")
+# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center")
+# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45) 
+# plt.xlim([-1, n]) 
+# plt.show()
+
+# Select one tree of the fitted forest
+tree_estimator = model['random_forest'].estimators_[10]
+
+# Draw the tree
+# plt.figure(figsize=(1, 1))
+# plot_tree(tree_estimator, feature_names=df_train.columns, filled=True, rounded=True, fontsize=10)
+# plt.savefig('tree.png', dpi=600)  # Save the image as a PNG file
+# plt.show()
+
+def _corner_stats(df, fighter, corner):
+    """Return a one-row frame with `fighter`'s own statistics, labelled for `corner`.
+
+    The first row of `df` in which the fighter appears (in either corner) is
+    used. Only the columns of the corner he occupied in that row are kept, and
+    their leading letter is rewritten to `corner` ('R' or 'B') when it differs.
+
+    Raises:
+        KeyError: if the fighter appears in no row of `df`.
+    """
+    fights = df[(df['R_fighter'] == fighter) | (df['B_fighter'] == fighter)]
+    if fights.empty:
+        raise KeyError(f"no fight found for fighter {fighter!r}")
+    # Re-index to 0 so that both fighters' rows line up when concatenated column-wise.
+    row = fights.iloc[[0]].reset_index(drop=True)
+    source = 'R' if row.loc[0, 'R_fighter'] == fighter else 'B'
+    stats = row.filter(regex='^' + source, axis=1).copy()
+    if source != corner:
+        stats.rename(columns=lambda x: re.sub('^' + source, corner, x), inplace=True)
+    return stats
+
+
+def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False):
+    """Predict the winner of a bout between two fighters and print the result.
+
+    Each fighter's statistics are taken from the first row of `df` in which he
+    appears, relabelled to the corner given by the arguments, and combined
+    with the bout information into a single-row frame fed to `pipeline`.
+
+    Args:
+        df: fights frame with 'R_fighter' / 'B_fighter' and per-corner columns.
+        pipeline: fitted estimator exposing predict() and predict_proba().
+        blue_fighter: name of the fighter placed in the blue corner.
+        red_fighter: name of the fighter placed in the red corner.
+        weightclass: value stored in the 'weight_class' column.
+        rounds: value stored in the 'no_of_rounds' column.
+        title_bout: whether the bout is a title bout (mapped to 1/0).
+
+    Returns:
+        The output of pipeline.predict_proba() for the built row.
+
+    Raises:
+        KeyError: if either fighter appears in no row of `df`.
+    """
+    # One single-row frame per fighter, already carrying the target corner prefix
+    result1 = _corner_stats(df, blue_fighter, 'B')
+    result2 = _corner_stats(df, red_fighter, 'R')
+
+    fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns)
+    fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names
+    fight.insert(0, 'title_bout', title_bout) # we add title_bout, weight class and number of rounds data to the dataframe
+    fight.insert(1, 'weight_class', weightclass)
+    fight.insert(2, 'no_of_rounds', rounds)
+    fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0})
+
+    pred = pipeline.predict(fight)
+    proba = pipeline.predict_proba(fight)
+    # A prediction of 1 is reported as a win for the red corner.
+    if pred[0] == 1:
+        print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%")
+    else:
+        print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%")
+    return proba
+
+# Example predictions; arguments are (blue fighter, red fighter, weight class, rounds, title bout).
+predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) 
+predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True)
+predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True)