master
remrem 1 year ago
parent 92e60055e8
commit b1185a0241

@ -23,9 +23,9 @@ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data
## Analysis ## Analysis
- [X] Train models with just 20000 "GALAXY" class (has an impact?) - [X] Train models with just 20000 "GALAXY" class (has an impact?)
- [ ] Which model is the best, ratio learn_time/precision - [X] Which model is the best, ratio learn_time/precision
- [ ] Can we drop more categories and have same results (useless data?) - [X] Can we drop more categories and have same results (useless data?)
- [ ] Compare prediction with y_test that were false - [X] Compare prediction with y_test that were false
## Dataset ## Dataset
Nous avons décidé de prendre un dataset sur le site Kaggle, il contient 100 000 lignes qui représentent Nous avons décidé de prendre un dataset sur le site Kaggle, il contient 100 000 lignes qui représentent
@ -111,10 +111,10 @@ de 98%.
Voici les résultats obtenus sur l'ensemble des modèles: Voici les résultats obtenus sur l'ensemble des modèles:
- KNN (70,724%) - KNN (accuracy: 70,724%, f1: 61.35%)
- Decision Tree (96,82%) - Decision Tree (accuracy: 96,82%, f1: 96,32%)
- Linear SVC (n'a jamais fini) - Linear SVC (n'a jamais fini)
- Random Forest (98.012%) - Random Forest (accuracy: 98.012%, f1: 97,61%)
- Multi-Layer Perceptron (59.22%) - Multi-Layer Perceptron (accuracy: 59.22%, f1: 24.7%)
- Nearest Centroid (36.328%) - Nearest Centroid (accuracy: 36.328%, f1: 36,85%)
- SGD (18.972%) - SGD (accuracy: 21%, f1: 11%)

@ -30,6 +30,7 @@ import matplotlib.pyplot as plt
import pandas import pandas
from pandas.plotting import scatter_matrix from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
# main # main
def main(): def main():
@ -88,7 +89,7 @@ def model_switch(choice):
elif (choice == 7): elif (choice == 7):
model = NearestCentroid() model = NearestCentroid()
elif (choice == 8): elif (choice == 8):
model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3)) model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(5, 9, 5, 5, 3))
else: else:
raise Exception('Wrong entry') raise Exception('Wrong entry')
@ -151,7 +152,8 @@ def training(model, x, y):
printPredictedValues(ypredict,ytest) printPredictedValues(ypredict,ytest)
elif res == 3: elif res == 3:
os.system("clear") os.system("clear")
print(accuracy_score(ytest, ypredict)) print("Accuracy: ", accuracy_score(ytest, ypredict))
print("F1: ", f1_score(ytest, ypredict, average="macro"))
elif res == 0: elif res == 0:
break break
else: else:
@ -176,17 +178,18 @@ def showData(df):
plt.pie(x, labels = ['GALAXY', 'QSO', 'Star']) plt.pie(x, labels = ['GALAXY', 'QSO', 'Star'])
plt.legend() plt.legend()
# N'a jamais fini pour cause de puissance
def rfecv_test(x, y, model): def rfecv_test(x, y, model):
rfe = RFECV(estimator=model) rfe = RFECV(estimator=model)
pipeline = Pipeline(steps=[('s',rfe),('m',model)]) pipeline = Pipeline(steps=[('s',rfe),('m',model)])
# evaluate model # Evaluation du modele
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1) cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise', verbose=3) n_scores = cross_val_score(pipeline, x, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise', verbose=3)
# report performance
print('Accuracy: %.3f (%.3f)' % (max(n_scores), std(n_scores))) print('Accuracy: %.3f (%.3f)' % (max(n_scores), std(n_scores)))
# Affiche la pertinence des colonnes dans l'entrainement
rfe.fit(x,y) rfe.fit(x,y)
for i in range(x.shape[1]): for i in range(x.shape[1]):
print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i])) print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))
@ -297,6 +300,7 @@ def bestModel(datas):
print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model]) print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model])
print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model]) print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model])
# Test auto-sklearn
def auto_sklearn(): def auto_sklearn():
df = read_dataset('data.csv') df = read_dataset('data.csv')
X_train, X_test, y_train, ytest = train_test_split(x, y,test_size=0.25, random_state=0) X_train, X_test, y_train, ytest = train_test_split(x, y,test_size=0.25, random_state=0)
@ -313,6 +317,7 @@ def auto_sklearn():
y_hat = predictions = cls.predict(X_test) y_hat = predictions = cls.predict(X_test)
print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat)) print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat))
# Affiche tous les plots
def plotAll(): def plotAll():
x,df,y = read_dataset('data.csv') x,df,y = read_dataset('data.csv')

Loading…
Cancel
Save