diff --git a/__pycache__/analise.cpython-311.pyc b/__pycache__/analise.cpython-311.pyc new file mode 100644 index 0000000..7e1757e Binary files /dev/null and b/__pycache__/analise.cpython-311.pyc differ diff --git a/__pycache__/cleanData.cpython-311.pyc b/__pycache__/cleanData.cpython-311.pyc new file mode 100644 index 0000000..5d7f87d Binary files /dev/null and b/__pycache__/cleanData.cpython-311.pyc differ diff --git a/__pycache__/models.cpython-311.pyc b/__pycache__/models.cpython-311.pyc new file mode 100644 index 0000000..411179f Binary files /dev/null and b/__pycache__/models.cpython-311.pyc differ diff --git a/__pycache__/runModel.cpython-311.pyc b/__pycache__/runModel.cpython-311.pyc new file mode 100644 index 0000000..2375691 Binary files /dev/null and b/__pycache__/runModel.cpython-311.pyc differ diff --git a/cleanData.py b/cleanData.py index d893b28..dd60264 100644 --- a/cleanData.py +++ b/cleanData.py @@ -7,33 +7,49 @@ from sklearn.impute import SimpleImputer from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, confusion_matrix, classification_report from models import * +import random from analise import * +columns = ['B_fighter','R_fighter','title_bout', + 'B_avg_BODY_landed', 'B_avg_HEAD_landed', 'B_avg_TD_att', 'B_avg_TOTAL_STR_landed', + 'B_avg_opp_BODY_att', 'B_avg_opp_HEAD_landed', 'B_avg_opp_LEG_landed', + 'B_avg_opp_SIG_STR_att', 'B_avg_opp_TOTAL_STR_att', + + 'R_avg_BODY_landed', 'R_avg_HEAD_landed', 'R_avg_TD_att', 'R_avg_TOTAL_STR_landed', + 'R_avg_opp_BODY_att', 'R_avg_opp_HEAD_landed', 'R_avg_opp_LEG_landed', + 'R_avg_opp_SIG_STR_att', 'R_avg_opp_TOTAL_STR_att', + + 'B_age', 'R_age','date','Winner','weight_class','B_Stance','R_Stance'] + +def swap_values(row): + if random.random() > 0.5: + for column in columns: + if column.startswith('B_'): + opposite_column = 'R_' + column[2:] + row[column], row[opposite_column] = row[opposite_column], row[column] + if column.startswith('Winner'): + print(row[column]) + if row[column] == 0: + row[column] = 2 + elif row[column] == 2: + row[column] = 0 + print(row[column]) + return row + return row def getData(): df = pd.read_csv('archive/data.csv') - columns = ['B_fighter','R_fighter','title_bout', - 'B_avg_BODY_landed', 'B_avg_HEAD_landed', 'B_avg_TD_att', 'B_avg_TOTAL_STR_landed', - 'B_avg_opp_BODY_att', 'B_avg_opp_HEAD_landed', 'B_avg_opp_LEG_landed', - 'B_avg_opp_SIG_STR_att', 'B_avg_opp_TOTAL_STR_att', - - 'R_avg_BODY_landed', 'R_avg_HEAD_landed', 'R_avg_TD_att', 'R_avg_TOTAL_STR_landed', - 'R_avg_opp_BODY_att', 'R_avg_opp_HEAD_landed', 'R_avg_opp_LEG_landed', - 'R_avg_opp_SIG_STR_att', 'R_avg_opp_TOTAL_STR_att', - - 'B_age', 'R_age','date','Winner','weight_class','B_Stance','R_Stance'] - - limit_date = '2001-04-01' df = df.loc[df['date'] > limit_date, columns] label_encoder = LabelEncoder() - # Convertir les chaînes de caractères en nombres for column in df.select_dtypes(include=['object']).columns: df[column] = label_encoder.fit_transform(df[column]) + df = df.apply(swap_values, axis=1) + median_values = df.median() df.fillna(median_values, inplace=True) diff --git a/main.py b/main.py index 01ea1ad..2ad39ce 100644 --- a/main.py +++ b/main.py @@ -6,7 +6,12 @@ X,y=getData() X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30, random_state=50) startRandomForest(X_train,X_test,y_train,y_test) -startKNN(X_train,X_test,y_train,y_test) -startSVM(X_train,X_test,y_train,y_test) -startDecisionTree(X_train,X_test,y_train,y_test) -startLogisticRegression(X_train,X_test,y_train,y_test) \ No newline at end of file +#startKNN(X_train,X_test,y_train,y_test) +#startSVM(X_train,X_test,y_train,y_test) +#startDecisionTree(X_train,X_test,y_train,y_test) +#startLogisticRegression(X_train,X_test,y_train,y_test) + +#startLinearSVC(X_train,X_test,y_train,y_test) +#startNaiveBayes(X_train,X_test,y_train,y_test) + +# https://scikit-learn.org/stable/_static/ml_map.png \ No newline at end of file diff --git a/models.py b/models.py index 81380be..9c08d57 100644 --- a/models.py +++ b/models.py @@ -4,6 +4,9 @@ from sklearn.linear_model import LogisticRegression from sklearn.tree import DecisionTreeClassifier from sklearn.linear_model import SGDClassifier from sklearn import svm +from sklearn.svm import LinearSVC +from sklearn.naive_bayes import GaussianNB + def RandomForest(X_train, X_test, y_train): random_forest = RandomForestClassifier(n_estimators=100, @@ -35,3 +38,13 @@ def LogisticRegress(X_train, X_test, y_train): logistic = LogisticRegression() logistic.fit(X_train,y_train) return logistic.predict(X_test),logistic + +def Linearsvc(X_train, X_test, y_train): + svc = LinearSVC(C=1.0, dual=False, verbose=True, loss="squared_hinge", multi_class="crammer_singer") + svc.fit(X_train,y_train) + return svc.predict(X_test),svc + +def GaussianNaiveBayes(X_train, X_test, y_train): + gnb = GaussianNB() + gnb.fit(X_train, y_train) + return gnb.predict(X_test),gnb \ No newline at end of file diff --git a/runModel.py b/runModel.py index 04cfddf..a2146be 100644 --- a/runModel.py +++ b/runModel.py @@ -45,4 +45,20 @@ def startLogisticRegression(X_train,X_test,y_train,y_test): report(lr_ac, lr_matrix, lr_class_report) seeMatrix(lr_matrix, lr.classes_) #rocCurve(y_test, y_pred) - #seeRocCurve(rf, X_train, y_train, 10) \ No newline at end of file + #seeRocCurve(rf, X_train, y_train, 10) + +def startLinearSVC(X_train,X_test,y_train,y_test): + y_pred, svc = Linearsvc(X_train, X_test, y_train) + svc_ac, svc_matrix, svc_class_report = calculateMatrix(y_test, y_pred) + report(svc_ac, svc_matrix, svc_class_report) + seeMatrix(svc_matrix, svc.classes_) + #rocCurve(y_test, y_pred) + #seeRocCurve(rf, X_train, y_train, 10) + +def startNaiveBayes(X_train,X_test,y_train,y_test): + y_pred, gnb = GaussianNaiveBayes(X_train, X_test, y_train) + gnb_ac, gnb_matrix, gnb_class_report = calculateMatrix(y_test, y_pred) + report(gnb_ac, gnb_matrix, gnb_class_report) + seeMatrix(gnb_matrix, gnb.classes_) + #rocCurve(y_test, y_pred) + #seeRocCurve(rf, X_train, y_train, 10)