From cf44fa3b0e4ceeeed8779e2b86c3d7b7a092b972 Mon Sep 17 00:00:00 2001
From: rem <remi.arnal@etu.uca.fr>
Date: Mon, 12 Feb 2024 09:14:27 +0100
Subject: [PATCH 1/2] add plots and try auto_sklearn

---
 README.md   |  1 +
 src/main.py | 84 ++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 80ea39f..2758434 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-
 
 Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
 
+AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/
 ## Columns
 
 |Keep         |Skip        |
diff --git a/src/main.py b/src/main.py
index ae29bd2..ea5aa66 100755
--- a/src/main.py
+++ b/src/main.py
@@ -26,15 +26,22 @@ from sklearn.feature_selection import RFECV
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.pipeline import Pipeline
 
+import matplotlib.pyplot as plt
+import pandas
+from pandas.plotting import scatter_matrix
+from sklearn.metrics import confusion_matrix
+
 # main
 def main():
+    # plotAll()
+    #auto_sklearn()
+    
     # User input
     opt = prompt_display()
     model = model_switch(opt)
 
     # Get interesting data
-    df = read_dataset("data.csv")
-    x, y = get_xy_from_dataframe(df)
+    df, x, y = read_dataset('data.csv')
 
     # rfecv_test(x, y, RandomForestClassifier())
     # Train model
@@ -43,14 +50,11 @@ def main():
 # Open dataset with panda
 def read_dataset(filename):
     df = pd.read_csv(filename)
-    return df
-
-# Drop useless columns and return x and y
-def get_xy_from_dataframe(df):
     x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
     y = df['class'].values
-    return x, y 
 
+    return df, x, y
+    
 # Ask for model choice
 def prompt_display():
     print("""Choose a model:
@@ -80,7 +84,7 @@ def model_switch(choice):
     elif (choice == 7):
         model = NearestCentroid()
     elif (choice == 8):
-        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(100,80,60,40,20,10,3))
+        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3))
     else:
         raise Exception('Wrong entry')       
     
@@ -124,8 +128,9 @@ def training(model, x, y):
         Xtest = Xtest.reshape(-1, 1)
  
     model.fit(Xtrain,ytrain)
-    
-    ypredit = model.predict(Xtest)
+
+    ypredict = model.predict(Xtest)
+    # confusion_matrix(ytest, ypredict)
     # os.system("clear")
     res = -1
     while(res != 0):
@@ -134,13 +139,13 @@ def training(model, x, y):
         res = int(input())
         if(res == 1):
             os.system("clear")
-            printStatValues(ypredit,ytest)
+            printStatValues(ypredict,ytest)
         elif(res == 2):
             os.system("clear")
-            printPredictedValues(ypredit,ytest)
+            printPredictedValues(ypredict,ytest)
         elif res == 3:
             os.system("clear")
-            print(accuracy_score(ytest, ypredit))
+            print(accuracy_score(ytest, ypredict))
         elif res == 0:
             break
         else:
@@ -236,5 +241,58 @@ def bestModelFinder(datas):
     print("Tree variance accuracy_score : ", variance(treeMean))
     print("Tree ecart-type accuracy_score : ", stdev(treeMean))
 
+def auto_sklearn():
+    """Fit an AutoSklearnClassifier on data.csv and print its held-out accuracy."""
+    import autosklearn.classification  # local import: only this experiment needs it
+    _, x, y = read_dataset('data.csv')  # read_dataset returns (df, x, y)
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
+    X_train = X_train.values
+    X_test = X_test.values
+    if len(X_train.shape) < 2:
+        X_train = X_train.reshape(-1, 1)
+    if len(X_test.shape) < 2:
+        X_test = X_test.reshape(-1, 1)
+
+    cls = autosklearn.classification.AutoSklearnClassifier()
+    cls.fit(X_train, y_train)
+    print("Accuracy score", accuracy_score(y_test, cls.predict(X_test)))
+
+def plotAll():
+    """Load data.csv and show each exploratory plot in turn."""
+    df, _, _ = read_dataset('data.csv')  # read_dataset returns (df, x, y)
+    plotHistograms(df)
+    plotDensity(df)
+    plotBoxWhisker(df)
+    plotCorrelationMatrix(df)
+    plotScatterMatrix(df)
+
+def plotHistograms(df):
+    df.hist()  # assumes df is a pandas DataFrame: draws a histogram per column
+    plt.show()  # display the figure that was just drawn
+
+def plotDensity(df):  # one density subplot per column, three per row
+    df.plot(kind='density', subplots=True, layout=(-1, 3), sharex=False)  # -1: row count inferred, so >9 columns fit
+    plt.show()
+
+def plotBoxWhisker(df):  # one box-and-whisker subplot per column, three per row
+    df.plot(kind='box', subplots=True, layout=(-1, 3), sharex=False, sharey=False)  # -1: row count inferred, so >9 columns fit
+    plt.show()
+
+def plotCorrelationMatrix(df):
+    """Show the pairwise correlations of df's numeric columns as a colour matrix."""
+    correlations = df.select_dtypes('number').corr()  # string columns cannot be correlated
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    fig.colorbar(ax.matshow(correlations, vmin=-1, vmax=1))
+    ticks = list(range(len(correlations.columns)))  # one tick per column actually plotted
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    ax.set_xticklabels(correlations.columns)
+    ax.set_yticklabels(correlations.columns)
+    plt.show()
+
+def plotScatterMatrix(df):
+    scatter_matrix(df)  # pandas.plotting.scatter_matrix: grid of pairwise scatter plots
+    plt.show()  # display the figure that was just drawn
 
 main()

From e58c37a695f87155e25241f734a832827ffc6b18 Mon Sep 17 00:00:00 2001
From: rem <remi.arnal@etu.uca.fr>
Date: Mon, 12 Feb 2024 09:55:09 +0100
Subject: [PATCH 2/2] dataset info + readme

---
 README.md   | 17 +++++++++++++++++
 src/main.py |  6 +++++-
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 2758434..c5f4782 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-
 Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
 
 AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/
+
 ## Columns
 
 |Keep         |Skip        |
@@ -25,3 +26,19 @@ AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data
 - [ ] Which model is the best, ratio learn_time/precision
 - [ ] Can we drop more categories and have same results (useless data?)
 - [ ] Compare prediction with y_test that were false
+
+## Dataset
+Nous avons décidé de prendre un dataset sur le site Kaggle, il contient 100 000 lignes qui représentent
+chacune un objet stellaire observé en lui attribuant plusieurs caractéristiques comme sa déclinaison,
+les couleurs observées et autres valeurs scientifiques.
+Chaque ligne est donc associée à une classe qui peut être "QSO" (un quasar), "Galaxy" ou "Star".
+
+Notre première étape a été de regarder le dataset pour savoir si certaines données sont manquantes.
+En utilisant `df.info()` nous pouvons avoir certaines informations sur les données, il ne manque aucune valeur.
+
+Nous pouvons maintenant regarder la répartition des classes, celle-ci est assez inégale avec ~60 000 galaxies,
+~21 000 étoiles et ~19 000 quasars. Nous pouvons en déduire que les galaxies sont plus communes mais cela
+pourrait-il avoir une incidence sur la précision de notre modèle ?
+
+## Plot
+J'ai la flemme d'analyser les plots que j'ai faits.
diff --git a/src/main.py b/src/main.py
index ea5aa66..64ce3b3 100755
--- a/src/main.py
+++ b/src/main.py
@@ -34,7 +34,11 @@ from sklearn.metrics import confusion_matrix
 # main
 def main():
     # plotAll()
-    #auto_sklearn()
+    # auto_sklearn()
+
+    # dftmp = pd.read_csv('data.csv')
+    # dftmp.info()
+    # print(dftmp['class'].value_counts())
     
     # User input
     opt = prompt_display()