diff --git a/README.md b/README.md
index a8816b1..59d297b 100644
--- a/README.md
+++ b/README.md
@@ -23,9 +23,9 @@ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data
 
 ## Analysis
 - [X] Train models with just 20000 "GALAXY" class (has an impact?)
-- [ ] Which model is the best, ratio learn_time/precision
-- [ ] Can we drop more categories and have same results (useless data?)
-- [ ] Compare prediction with y_test that were false
+- [X] Which model is the best, ratio learn_time/precision
+- [X] Can we drop more categories and have same results (useless data?)
+- [X] Compare prediction with y_test that were false
 
 ## Dataset
 We decided to use a dataset from the Kaggle website; it contains 100,000 rows, each representing
@@ -111,10 +111,10 @@ of 98%.
 
 Here are the results obtained across all models:
 
-- KNN (70.724%)
-- Decision Tree (96.82%)
+- KNN (accuracy: 70.724%, f1: 61.35%)
+- Decision Tree (accuracy: 96.82%, f1: 96.32%)
 - Linear SVC (never finished)
-- Random Forest (98.012%)
-- Multi-Layer Perceptron (59.22%)
-- Nearest Centroid (36.328%)
-- SGD (18.972%)
+- Random Forest (accuracy: 98.012%, f1: 97.61%)
+- Multi-Layer Perceptron (accuracy: 59.22%, f1: 24.7%)
+- Nearest Centroid (accuracy: 36.328%, f1: 36.85%)
+- SGD (accuracy: 21%, f1: 11%)
diff --git a/src/main.py b/src/main.py
index 91e4bd5..201015a 100755
--- a/src/main.py
+++ b/src/main.py
@@ -30,6 +30,7 @@ import matplotlib.pyplot as plt
 import pandas
 from pandas.plotting import scatter_matrix
 from sklearn.metrics import confusion_matrix
+from sklearn.metrics import f1_score
 
 # main
 def main():
@@ -88,7 +89,7 @@ def model_switch(choice):
     elif (choice == 7):
         model = NearestCentroid()
     elif (choice == 8):
-        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3))
+        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(5, 9, 5, 5, 3))
     else:
         raise Exception('Wrong entry')
 
@@ -151,7 +152,8 @@ def training(model, x, y):
             printPredictedValues(ypredict,ytest)
         elif res == 3:
             os.system("clear")
-            print(accuracy_score(ytest, ypredict))
+            print("Accuracy: ", accuracy_score(ytest, ypredict))
+            print("F1: ", f1_score(ytest, ypredict, average="macro"))
         elif res == 0:
             break
         else:
@@ -176,17 +178,18 @@ def showData(df):
     plt.pie(x, labels = ['GALAXY', 'QSO', 'Star'])
     plt.legend()
 
+# Never finished: not enough computing power
 def rfecv_test(x, y, model):
     rfe = RFECV(estimator=model)
     pipeline = Pipeline(steps=[('s',rfe),('m',model)])
 
-    # evaluate model
+    # Evaluate the model
     cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
     n_scores = cross_val_score(pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise', verbose=3)
-    # report performance
     print('Accuracy: %.3f (%.3f)' % (max(n_scores), std(n_scores)))
 
+    # Show each column's relevance in training
    rfe.fit(x,y)
     for i in range(x.shape[1]):
         print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
 
@@ -297,6 +300,7 @@ def bestModel(datas):
     print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model])
     print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model])
 
+# auto-sklearn test
 def auto_sklearn():
     df = read_dataset('data.csv')
     X_train, X_test, y_train, ytest = train_test_split(x, y,test_size=0.25, random_state=0)
@@ -313,6 +317,7 @@ def auto_sklearn():
     y_hat = predictions = cls.predict(X_test)
     print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
 
+# Show all plots
 def plotAll():
     x,df,y = read_dataset('data.csv')