From cf44fa3b0e4ceeeed8779e2b86c3d7b7a092b972 Mon Sep 17 00:00:00 2001 From: rem Date: Mon, 12 Feb 2024 09:14:27 +0100 Subject: [PATCH 1/2] add plots and try auto_sklearn --- README.md | 1 + src/main.py | 84 ++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 72 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 80ea39f..2758434 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network- Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/ +AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/ ## Columns |Keep |Skip | diff --git a/src/main.py b/src/main.py index ae29bd2..ea5aa66 100755 --- a/src/main.py +++ b/src/main.py @@ -26,15 +26,22 @@ from sklearn.feature_selection import RFECV from sklearn.tree import DecisionTreeClassifier from sklearn.pipeline import Pipeline +import matplotlib.pyplot as plt +import pandas +from pandas.plotting import scatter_matrix +from sklearn.metrics import confusion_matrix + # main def main(): + # plotAll() + #auto_sklearn() + # User input opt = prompt_display() model = model_switch(opt) # Get interesting data - df = read_dataset("data.csv") - x, y = get_xy_from_dataframe(df) + df, x, y = read_dataset('data.csv') # rfecv_test(x, y, RandomForestClassifier()) # Train model @@ -43,14 +50,11 @@ def main(): # Open dataset with panda def read_dataset(filename): df = pd.read_csv(filename) - return df - -# Drop useless columns and return x and y -def get_xy_from_dataframe(df): x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1) y = df['class'].values - return x, y + return df, x, y + # Ask for model choice def prompt_display(): print("""Choose a model: @@ -80,7 +84,7 @@ def model_switch(choice): elif (choice == 7): model = NearestCentroid() elif (choice == 8): - model = 
MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(100,80,60,40,20,10,3)) + model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3)) else: raise Exception('Wrong entry') @@ -124,8 +128,9 @@ def training(model, x, y): Xtest = Xtest.reshape(-1, 1) model.fit(Xtrain,ytrain) - - ypredit = model.predict(Xtest) + + ypredict = model.predict(Xtest) + # confusion_matrix(ytrain, ypredict) # os.system("clear") res = -1 while(res != 0): @@ -134,13 +139,13 @@ def training(model, x, y): res = int(input()) if(res == 1): os.system("clear") - printStatValues(ypredit,ytest) + printStatValues(ypredict,ytest) elif(res == 2): os.system("clear") - printPredictedValues(ypredit,ytest) + printPredictedValues(ypredict,ytest) elif res == 3: os.system("clear") - print(accuracy_score(ytest, ypredit)) + print(accuracy_score(ytest, ypredict)) elif res == 0: break else: @@ -236,5 +241,58 @@ def bestModelFinder(datas): print("Tree variance accuracy_score : ", variance(treeMean)) print("Tree ecart-type accuracy_score : ", stdev(treeMean)) +def auto_sklearn(): + df = read_dataset('data.csv') + X_train, X_test, y_train, ytest = train_test_split(x, y,test_size=0.25, random_state=0) + X_train = X_train.values + X_test = X_test.values + + if len(Xtrain.shape) < 2: + Xtrain = Xtrain.reshape(-1, 1) + if len(Xtest.shape) < 2: + Xtest = Xtest.reshape(-1, 1) + + cls = autosklearn.classification.AutoSklearnClassifier() + cls.fit(X_train, y_train) + y_hat = predictions = cls.predict(X_test) + print("Accuracy score", sklearn.metrics.accuracy_score(y_test, y_hat)) + +def plotAll(): + df = read_dataset('data.csv') + + plotHistograms(df) + plotDensity(df) + plotBoxWhisker(df) + plotCorrelationMatrix(df) + plotScatterMatrix(df) + +def plotHistograms(df): + df.hist() + plt.show() + +def plotDensity(df): + df.plot(kind='density', subplots=True, layout=(3,3), sharex=False) + plt.show() + +def 
plotBoxWhisker(df): + df.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False) + plt.show() + +def plotCorrelationMatrix(df): + correlations = df.corr() + fig = plt.figure() + ax = fig.add_subplot(111) + cax = ax.matshow(correlations, vmin=-1, vmax=1) + fig.colorbar(cax) + ticks = np.arange(0,9,1) + ax.set_xticks(ticks) + ax.set_yticks(ticks) + ax.set_xticklabels(list(df)) + ax.set_yticklabels(list(df)) + plt.show() + +def plotScatterMatrix(df): + scatter_matrix(df) + plt.show() main() From e58c37a695f87155e25241f734a832827ffc6b18 Mon Sep 17 00:00:00 2001 From: rem Date: Mon, 12 Feb 2024 09:55:09 +0100 Subject: [PATCH 2/2] dataset info + readme --- README.md | 17 +++++++++++++++++ src/main.py | 6 +++++- 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2758434..c5f4782 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network- Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/ + ## Columns |Keep |Skip | @@ -25,3 +26,19 @@ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data - [ ] Which model is the best, ratio learn_time/precision - [ ] Can we drop more categories and have same results (useless data?) - [ ] Compare prediction with y_test that were false + +## Dataset +Nous avons décidé de prendre un dataset sur le site Kaggle, il contient 100 000 lignes qui représentent +chacune un objet stellaire observé en lui attribuant plusieurs caractéristiques comme sa déclinaison, +les couleurs observées et autres valeurs scientifiques. +Chaque ligne est donc associée à une classe qui peut être "QSO" un quasar, "Galaxy" ou "Star". + +Notre première étape a été de regarder le dataset pour savoir si certaines données sont manquantes.
+En utilisant `df.info()` nous pouvons avoir certaines informations sur les données, il ne manque aucune valeur. + +Nous pouvons maintenant regarder la répartition des classes, celle-ci est assez inégale avec ~60.000 galaxies, +~21.000 étoiles et ~19.000 quasars. Nous pouvons en déduire que les galaxies sont plus communes mais cela +pourrait-il avoir une incidence sur la précision de notre modèle ? + +## Plot +J'ai la flemme d'analyser les plots que j'ai faits. diff --git a/src/main.py b/src/main.py index ea5aa66..64ce3b3 100755 --- a/src/main.py +++ b/src/main.py @@ -34,7 +34,11 @@ from sklearn.metrics import confusion_matrix # main def main(): # plotAll() - #auto_sklearn() + # auto_sklearn() + + # dftmp = pd.read_csv('data.csv') + # dftmp.info() + # print(dftmp['class'].value_counts()) # User input opt = prompt_display()