parent 92e60055e8
commit b1185a0241

@@ -23,9 +23,9 @@ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data
## Analysis
- [X] Train models with only 20,000 "GALAXY"-class rows (does it have an impact?) (see the downsampling sketch below)
- - [ ] Which model is best (learn_time/precision ratio)?
- - [ ] Can we drop more categories and keep the same results (useless data?)
- - [ ] Compare the predictions that were wrong against y_test
+ - [X] Which model is best (learn_time/precision ratio)?
+ - [X] Can we drop more categories and keep the same results (useless data?)
+ - [X] Compare the predictions that were wrong against y_test
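
A minimal sketch of the first item: cap the over-represented "GALAXY" class at 20,000 rows before training. The file name `data.csv` and the `class` column holding GALAXY/QSO/STAR labels are assumptions inferred from the script further down.

```python
import pandas

# Assumed layout: data.csv with a 'class' column (GALAXY / QSO / STAR)
df = pandas.read_csv('data.csv')
galaxy = df[df['class'] == 'GALAXY'].sample(n=20000, random_state=1)
others = df[df['class'] != 'GALAXY']
# Recombine and shuffle so the capped GALAXY rows are mixed back in
balanced = pandas.concat([galaxy, others]).sample(frac=1, random_state=1)
print(balanced['class'].value_counts())
```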
## Dataset
We decided to use a dataset from the Kaggle website; it contains 100,000 rows that represent
@@ -111,10 +111,10 @@ of 98%.
Here are the results obtained across all the models:
- - KNN (70.724%)
- - Decision Tree (96.82%)
+ - KNN (accuracy: 70.724%, f1: 61.35%)
+ - Decision Tree (accuracy: 96.82%, f1: 96.32%)
- Linear SVC (never finished training)
- - Random Forest (98.012%)
- - Multi-Layer Perceptron (59.22%)
- - Nearest Centroid (36.328%)
- - SGD (18.972%)
+ - Random Forest (accuracy: 98.012%, f1: 97.61%)
+ - Multi-Layer Perceptron (accuracy: 59.22%, f1: 24.7%)
+ - Nearest Centroid (accuracy: 36.328%, f1: 36.85%)
+ - SGD (accuracy: 21%, f1: 11%)
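
The accuracy/F1 numbers above, together with the learn_time/precision ratio from the checklist, can be reproduced with a loop along these lines. This is only a sketch: the `data.csv` layout (a `class` label column, all other columns numeric features) and the model list are assumptions, and the hyperparameters are library defaults rather than the ones used for the table.

```python
import time
import pandas
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Assumed layout: a 'class' column holds the labels, the rest are features
df = pandas.read_csv('data.csv')
y = df['class']
x = df.drop(columns=['class'])
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.25, random_state=0)

models = {
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(random_state=1),
    'Random Forest': RandomForestClassifier(random_state=1),
    'Nearest Centroid': NearestCentroid(),
}
for name, model in models.items():
    start = time.time()
    model.fit(xtrain, ytrain)          # learn_time = wall-clock fit time
    learn_time = time.time() - start
    ypredict = model.predict(xtest)
    acc = accuracy_score(ytest, ypredict)
    f1 = f1_score(ytest, ypredict, average='macro')
    print('%s: accuracy=%.3f, f1=%.3f, fit=%.1fs' % (name, acc, f1, learn_time))
```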

@@ -30,6 +30,7 @@ import matplotlib.pyplot as plt
import pandas
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
+from sklearn.metrics import f1_score
# main
def main():
@@ -88,7 +89,7 @@ def model_switch(choice):
    elif (choice == 7):
        model = NearestCentroid()
    elif (choice == 8):
-       model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3))
+       model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(5, 9, 5, 5, 3))
    else:
        raise Exception('Wrong entry')
@@ -151,7 +152,8 @@ def training(model, x, y):
            printPredictedValues(ypredict, ytest)
        elif res == 3:
            os.system("clear")
-           print(accuracy_score(ytest, ypredict))
+           print("Accuracy: ", accuracy_score(ytest, ypredict))
+           print("F1: ", f1_score(ytest, ypredict, average="macro"))
        elif res == 0:
            break
        else:
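
# Why macro F1 next to accuracy (a sketch, not part of the original script):
# with GALAXY dominating the dataset, always predicting GALAXY still scores
# high accuracy, while macro F1 weights each class equally and stays low.
from sklearn.metrics import accuracy_score, f1_score

ytest_demo = ['GALAXY'] * 8 + ['QSO', 'STAR']   # imbalanced ground truth
ypredict_demo = ['GALAXY'] * 10                 # degenerate predictions
print(accuracy_score(ytest_demo, ypredict_demo))             # 0.8
print(f1_score(ytest_demo, ypredict_demo, average='macro'))  # ~0.30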
@@ -176,17 +178,18 @@ def showData(df):
    plt.pie(x, labels=['GALAXY', 'QSO', 'Star'])
    plt.legend()
+# Never finished running, for lack of computing power
def rfecv_test(x, y, model):
    rfe = RFECV(estimator=model)
    pipeline = Pipeline(steps=[('s', rfe), ('m', model)])
-   # evaluate model
+   # Evaluate the model
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    n_scores = cross_val_score(pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise', verbose=3)
    # report performance
    print('Accuracy: %.3f (%.3f)' % (max(n_scores), std(n_scores)))
+   # Show how relevant each column was during training
    rfe.fit(x, y)
    for i in range(x.shape[1]):
        print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
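
# Sketch of a cheaper alternative (an editor assumption, not part of the
# original script): since rfecv_test never finished, a single Random Forest
# fit can gauge column relevance in one pass via its impurity-based
# feature_importances_. feature_importance_test is a hypothetical helper.
from sklearn.ensemble import RandomForestClassifier

def feature_importance_test(x, y):
    # One fit instead of RFECV's repeated cross-validated eliminations
    model = RandomForestClassifier(random_state=1, n_jobs=-1)
    model.fit(x, y)
    # Importance of every column, same print style as rfecv_test above
    for i, importance in enumerate(model.feature_importances_):
        print('Column: %d, Importance: %.3f' % (i, importance))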
@@ -297,6 +300,7 @@ def bestModel(datas):
print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model])
print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model])
# Test auto-sklearn
def auto_sklearn():
df = read_dataset('data.csv')
X_train, X_test, y_train, ytest = train_test_split(x, y,test_size=0.25, random_state=0)
@@ -313,6 +317,7 @@ def auto_sklearn():
    y_hat = cls.predict(X_test)
    print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
+# Show all the plots
def plotAll():
    x, df, y = read_dataset('data.csv')
