diff --git a/mmix.py b/mmix.py index b3d5b89..b8035c5 100644 --- a/mmix.py +++ b/mmix.py @@ -2,7 +2,7 @@ import pandas as pd import numpy as np import matplotlib.pyplot as plt from sklearn.tree import DecisionTreeClassifier -from sklearn.metrics import accuracy_score +from sklearn.metrics import accuracy_score, confusion_matrix from sklearn import metrics from sklearn.model_selection import train_test_split @@ -28,11 +28,11 @@ print(Xtrain.shape) print(Xtest.shape) Arbre_decision = DecisionTreeClassifier(random_state=0, max_depth=20) -clf = Arbre_decision.fit(Xi, Yi) +clf = Arbre_decision.fit(Xtrain, Yi) -ypredit = clf.predict(Xtest) -accuracy_score(ytest, ypredit) -matriceConfusion=metrics.confusion_matrix(ytest, ypredit) +ypredict = clf.predict(Xtest) +accuracy = accuracy_score(ytest, ypredict) +matriceConfusion = confusion_matrix(ytest, ypredict) incorrect=matriceConfusion[0][1] + matriceConfusion[1][0] total = matriceConfusion.sum() @@ -40,4 +40,20 @@ total = matriceConfusion.sum() print("\nNumber of incorrect classifications: " + str(incorrect)) print("Number of classifications total: " + str(total)) -print("Percent: "+ str((total-incorrect)/total*100)) \ No newline at end of file +print("Percent: "+ str((total-incorrect)/total*100)) + +fighter_data = dataframe[dataframe['R_fighter'] == 'Adrian Yanez'] +average_fighter_data = fighter_data[colonnes].mean() + +fighter_data_2 = dataframe[dataframe['R_fighter'] == 'Gustavo Lopez'] +average_fighter_data_2 = fighter_data_2[colonnes].mean() + +combined_features = pd.concat([average_fighter_data, average_fighter_data_2]) + +prediction = clf.predict([combined_features]) +if prediction[0] == 1: + winner = "Blue Corner" +else: + winner = "Red Corner" + +print(f"The predicted winner is: {winner}") diff --git a/test.py b/test.py new file mode 100644 index 0000000..f0f2503 --- /dev/null +++ b/test.py @@ -0,0 +1,239 @@ +import re +import pandas as pd +import numpy as np +import seaborn as sns +import matplotlib.pyplot as plt +from sklearn.tree import export_graphviz +from io import StringIO +from IPython.display import Image +from sklearn.tree import plot_tree +import pydotplus +from IPython.display import Image +from sklearn.pipeline import Pipeline +from sklearn.ensemble import RandomForestClassifier +from sklearn.metrics import accuracy_score +from sklearn.metrics import classification_report +from sklearn.metrics import confusion_matrix +from sklearn.model_selection import cross_val_score +from sklearn.preprocessing import OrdinalEncoder, LabelEncoder +from sklearn.compose import make_column_transformer +from sklearn.impute import SimpleImputer +pd.options.display.max_columns = None +pd.options.display.max_rows = None +import sklearn +print('The scikit-learn version is {}.'.format(sklearn.__version__)) + +df = pd.read_csv('archive/data.csv') + +b_age = df['B_age'] # we replace B_age to put it among B features +df.drop(['B_age'], axis = 1, inplace = True) +df.insert(76, "B_age", b_age) + +df_fe = df.copy() # We make a copy of the dataframe for the feature engineering part later +#print(df.head(5)) + +limit_date = '2001-04-01' +df = df[(df['date'] > limit_date)] + +# print("Total NaN in dataframe :" , df.isna().sum().sum()) +# print("Total NaN in each column of the dataframe") +na = [] +for index, col in enumerate(df): + na.append((index, df[col].isna().sum())) +na_sorted = na.copy() +na_sorted.sort(key = lambda x: x[1], reverse = True) + +# for i in range(len(df.columns)): +# print(df.columns[na_sorted[i][0]],":", na_sorted[i][1], "NaN") + +imp_features = ['R_Weight_lbs', 'R_Height_cms', 'B_Height_cms', 'R_age', 'B_age', 'R_Reach_cms', 'B_Reach_cms'] +imp_median = SimpleImputer(missing_values=np.nan, strategy='median') + +for feature in imp_features: + imp_feature = imp_median.fit_transform(df[feature].values.reshape(-1,1)) + df[feature] = imp_feature + +imp_stance_R = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +imp_R_stance = imp_stance_R.fit_transform(df['R_Stance'].values.reshape(-1,1)) + +imp_stance_B = SimpleImputer(missing_values=np.nan, strategy='most_frequent') +imp_B_stance = imp_stance_B.fit_transform(df['B_Stance'].values.reshape(-1,1)) + +df_R_stance_imputed = pd.DataFrame(imp_R_stance, columns=['R_Stance']) +df_B_stance_imputed = pd.DataFrame(imp_B_stance, columns=['B_Stance']) + +# Assign the imputed values to the original DataFrame +df['R_Stance'] = df_R_stance_imputed['R_Stance'] +df['B_Stance'] = df_B_stance_imputed['B_Stance'] + +print('Number of features with NaN values :', len([x[1] for x in na if x[1] > 0])) + +na_features = ['B_avg_BODY_att', 'R_avg_BODY_att'] +df.dropna(subset = na_features, inplace = True) + +df.drop(['Referee', 'location'], axis = 1, inplace = True) + +# print(df.shape) +# print("Total NaN in dataframe :" , df.isna().sum().sum()) + +df.drop(['B_draw', 'R_draw'], axis=1, inplace=True) +df = df[df['Winner'] != 'Draw'] +df = df[df['weight_class'] != 'Catch Weight'] + +# Supprimez les colonnes non numériques +df_numeric = df.select_dtypes(include=['float64', 'int64']) + +# Tracez la matrice de corrélation +plt.figure(figsize=(50, 40)) +corr_matrix = df_numeric.corr(method='pearson').abs() +sns.heatmap(corr_matrix, annot=True) +# plt.show() + +# i = index of the fighter's fight, 0 means the last fight, -1 means first fight +def select_fight_row(df, name, i): + df_temp = df[(df['R_fighter'] == name) | (df['B_fighter'] == name)] # filter df on fighter's name + df_temp.reset_index(drop=True, inplace=True) # as we created a new temporary dataframe, we have to reset indexes + idx = max(df_temp.index) # get the index of the oldest fight + if i > idx: # if we are looking for a fight that didn't exist, we return nothing + return + arr = df_temp.iloc[i,:].values + return arr + + +# print(select_fight_row(df, 'Amanda Nunes', 0)) +# we get the last fight of Amanda Nunes + + +# get all active UFC fighters (according to the limit_date parameter) +def list_fighters(df, limit_date): + df_temp = df[df['date'] > limit_date] + set_R = set(df_temp['R_fighter']) + set_B = set(df_temp['B_fighter']) + fighters = list(set_R.union(set_B)) + return fighters + +fighters = list_fighters(df, '2017-01-01') +print(len(fighters)) + +def build_df(df, fighters, i): + arr = [select_fight_row(df, fighters[f], i) for f in range(len(fighters)) if select_fight_row(df, fighters[f], i) is not None] + cols = [col for col in df] + df_fights = pd.DataFrame(data=arr, columns=cols) + df_fights.drop_duplicates(inplace=True) + df_fights['title_bout'] = df_fights['title_bout'].map({True: 1, False: 0}) + df_fights.drop(['R_fighter', 'B_fighter', 'date'], axis=1, inplace=True) + return df_fights + +df_train = build_df(df, fighters, 0) +df_test = build_df(df, fighters, 1) + +# print(df_train.head(5)) + +preprocessor = make_column_transformer((OrdinalEncoder(), ['weight_class', 'B_Stance', 'R_Stance']), remainder='passthrough') + +# If the winner is from the Red corner, Winner label will be encoded as 1, otherwise it will be 0 (Blue corner) +label_encoder = LabelEncoder() +y_train = label_encoder.fit_transform(df_train['Winner']) +y_test = label_encoder.transform(df_test['Winner']) + +X_train, X_test = df_train.drop(['Winner'], axis=1), df_test.drop(['Winner'], axis=1) + +# Random Forest composed of 100 decision trees. We optimized parameters using cross-validation and GridSearch tool paired together +random_forest = RandomForestClassifier(n_estimators=100, + criterion='entropy', + max_depth=10, + min_samples_split=2, + min_samples_leaf=1, + random_state=0) + +model = Pipeline([('encoding', preprocessor), ('random_forest', random_forest)]) +model.fit(X_train, y_train) + +# We use cross-validation with 5-folds to have a more precise accuracy (reduce variation) +accuracies = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5) +print('Accuracy mean : ', accuracies.mean()) +print('Accuracy standard deviation : ', accuracies.std()) + +y_pred = model.predict(X_test) +print('Testing accuracy : ', accuracy_score(y_test, y_pred), '\n') + +target_names = ["Blue","Red"] +print(classification_report(y_test, y_pred, labels=[0,1], target_names=target_names)) + +# cm = confusion_matrix(y_test, y_pred) +# ax = plt.subplot() +# sns.heatmap(cm, annot = True, ax = ax, fmt = "d") +# ax.set_xlabel('Actual') +# ax.set_ylabel('Predicted') +# ax.set_title("Confusion Matrix") +# ax.xaxis.set_ticklabels(['Blue', 'Red']) +# ax.yaxis.set_ticklabels(['Blue', 'Red']) +# plt.show() + +feature_names = [col for col in X_train] +feature_importances = model['random_forest'].feature_importances_ +indices = np.argsort(feature_importances)[::-1] +n = 30 # maximum feature importances displayed +idx = indices[0:n] +std = np.std([tree.feature_importances_ for tree in model['random_forest'].estimators_], axis=0) + +#for f in range(n): +# print("%d. feature %s (%f)" % (f + 1, feature_names[idx[f]], feature_importances[idx[f]])) + +# plt.figure(figsize=(30, 8)) +# plt.title("Feature importances") +# plt.bar(range(n), feature_importances[idx], color="r", yerr=std[idx], align="center") +# plt.xticks(range(n), [feature_names[id] for id in idx], rotation = 45) +# plt.xlim([-1, n]) +# plt.show() + +# Sélectionnez un arbre de votre modèle +tree_estimator = model['random_forest'].estimators_[10] + +# Tracez l'arbre +# plt.figure(figsize=(1, 1)) +# plot_tree(tree_estimator, feature_names=df_train.columns, filled=True, rounded=True, fontsize=10) +# plt.savefig('tree.png', dpi=600) # Enregistrez l'image au format PNG +# plt.show() + +def predict(df, pipeline, blue_fighter, red_fighter, weightclass, rounds, title_bout=False): + + #We build two dataframes, one for each figther + f1 = df[(df['R_fighter'] == blue_fighter) | (df['B_fighter'] == blue_fighter)].copy() + f1.reset_index(drop=True, inplace=True) + f1 = f1[:1] + f2 = df[(df['R_fighter'] == red_fighter) | (df['B_fighter'] == red_fighter)].copy() + f2.reset_index(drop=True, inplace=True) + f2 = f2[:1] + + # if the fighter was red/blue corner on his last fight, we filter columns to only keep his statistics (and not the other fighter) + # then we rename columns according to the color of the corner in the parameters using re.sub() + if (f1.loc[0, ['R_fighter']].values[0]) == blue_fighter: + result1 = f1.filter(regex='^R', axis=1).copy() #here we keep the red corner stats + result1.rename(columns = lambda x: re.sub('^R','B', x), inplace=True) #we rename it with "B_" prefix because he's in the blue_corner + else: + result1 = f1.filter(regex='^B', axis=1).copy() + if (f2.loc[0, ['R_fighter']].values[0]) == red_fighter: + result2 = f2.filter(regex='^R', axis=1).copy() + else: + result2 = f2.filter(regex='^B', axis=1).copy() + result2.rename(columns = lambda x: re.sub('^B','R', x), inplace=True) + + fight = pd.concat([result1, result2], axis = 1) # we concatenate the red and blue fighter dataframes (in columns) + fight.drop(['R_fighter','B_fighter'], axis = 1, inplace = True) # we remove fighter names + fight.insert(0, 'title_bout', title_bout) # we add tittle_bout, weight class and number of rounds data to the dataframe + fight.insert(1, 'weight_class', weightclass) + fight.insert(2, 'no_of_rounds', rounds) + fight['title_bout'] = fight['title_bout'].map({True: 1, False: 0}) + + pred = pipeline.predict(fight) + proba = pipeline.predict_proba(fight) + if (pred == 1.0): + print("The predicted winner is", red_fighter, 'with a probability of', round(proba[0][1] * 100, 2), "%") + else: + print("The predicted winner is", blue_fighter, 'with a probability of ', round(proba[0][0] * 100, 2), "%") + return proba + +predict(df, model, 'Kamaru Usman', 'Colby Covington', 'Welterweight', 3, True) +predict(df, model, 'Leon Edwards', 'Belal Muhammad', 'Welterweight', 3, True) +predict(df, model, 'Conor McGregor', 'Khabib Nurmagomedov', 'Lightweight', 5, True) \ No newline at end of file