From 4c273e57d33b996115d25ad831a6cf72a1d84392 Mon Sep 17 00:00:00 2001
From: "aurian.jault" <aurian.jault@etu.uca.fr>
Date: Wed, 7 Feb 2024 09:50:31 +0100
Subject: [PATCH] Adding some stats + all possibilities

---
 README.md   |  2 +-
 src/main.py | 91 +++++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 3cb5f47..80ea39f 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-
 
 ## Analysis
 
-- [ ] Train models with just 20000 "GALAXY" class (has an impact?)
+- [X] Train models with just 20000 "GALAXY" class (has an impact?)
 - [ ] Which model is the best, ratio learn_time/precision
 - [ ] Can we drop more categories and have same results (useless data?)
 - [ ] Compare prediction with y_test that were false
diff --git a/src/main.py b/src/main.py
index 0f89d00..234e590 100755
--- a/src/main.py
+++ b/src/main.py
@@ -56,13 +56,13 @@ def model_switch(choice):
     elif (choice == 2):
         model = DecisionTreeClassifier(random_state=0, max_depth=20)
     elif (choice == 3):
-        model = RandomForestClassifier(n_estimators=100 ,criterion='entropy', n_jobs=-1)
+        model = RandomForestClassifier(n_estimators=100 ,criterion='entropy')
     elif (choice == 4):
-        model = SGDClassifier(max_iter=1000, tol=0.01)
+        model = sgdclassifier(max_iter=1000, tol=0.01)
     elif (choice == 5):
-        model = svm.SVC(kernel='linear', C = 1.0)    
+        model = svm.svc(kernel='linear', c = 1.0)    
     else:
-        raise Exception('RENTRE LE BON NOMBRE GROS CON')       
+        raise Exception('Wrong entry')       
     
     return model
 
@@ -93,7 +93,7 @@ def printStatValues(ypredit,ytest):
     print("Galaxy : ",(galaxyStats*100/NF),"%","Star :",(starStats*100/NF),"%","QSO : ",(QSOStats*100/NF),"%")
 
 # Train model
-def training(model, x, y):
+def training(model, x, y,res=-1):
     Xtrain, Xtest, ytrain, ytest = train_test_split(x, y,test_size=0.25, random_state=0)
     Xtrain = Xtrain.values
     Xtest = Xtest.values
@@ -102,17 +102,11 @@ def training(model, x, y):
         Xtrain = Xtrain.reshape(-1, 1)
     if len(Xtest.shape) < 2:
         Xtest = Xtest.reshape(-1, 1)
-
-    # if isinstance(model, svm.LinearSVC):
-    #     with parallel_backend('threading', n_jobs=-1):
-    #         model.fit(X_train, y_train)
-    
-    #else: 
+ 
     model.fit(Xtrain,ytrain)
     
     ypredit = model.predict(Xtest)
     os.system("clear")
-    res = -1
     while(res != 0):
         print(" Rentre un chiffre:\n\n1 - Stats %\n2 - Stats raw\n3 - accuracy_score")
         print("0 - QUIT")
@@ -129,22 +123,85 @@ def training(model, x, y):
         elif res == 0:
             break
         else:
-            raise Exception('Mauvaise saisie')
+            raise Exception('Wrong entry')
 
 def clearData(df):
     res = df["class"].value_counts()
     dtemp = df.sort_values(by=['class'])
     supr = int(res["GALAXY"]/1.5)
-    
     dtemp.drop(dtemp.index[range(1,supr)])
     dtemp = dtemp.iloc[34000:]
     return dtemp
 
-def showDate(df):
+def showData(df):
     res = df["class"].value_counts()
     x = [res["GALAXY"],res["QSO"],res["STAR"]]
     plt.figure(figsize = (8, 8))
     plt.pie(x, labels = ['GALAXY', 'QSO', 'Star'])
     plt.legend()
-    
-main()
+
+def allModels(df):
+    """Score the 'KNN' and 'Classifier' models on growing runs of feature columns.
+
+    For each start column the feature list grows one column per step and is
+    scored before each growth. Returns [model name, columns, accuracy] records.
+    """
+    # Possible additional model: RandomForestClassifier(n_estimators=100, criterion='entropy', n_jobs=-1)
+    modelChoices = {'KNN': 1, 'Classifier': 2}  # label -> model_switch choice
+    dfTemp = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
+    y = df['class'].values
+    x = list(dfTemp.columns.values)
+    datas = []
+    for i in range(0,len(x)):
+        arrayColumns = [x[i]]
+        for j in range(i+1,len(x)):
+            xValues = dfTemp[arrayColumns]
+            for name, choice in modelChoices.items():
+                model = model_switch(choice)
+                print("Model used : ",name, "---- Case : ",model)
+                print("X values used : ",arrayColumns)
+                accu = customTrainingRaw(model,xValues,y,3)
+                # Store a copy: arrayColumns keeps growing below, and sharing
+                # the list would make every record show the final column set.
+                datas.append([name, list(arrayColumns), accu])
+            arrayColumns.append(x[j])
+    return datas
+
+def customTrainingRaw(model, x, y,res=-1):
+    """Fit model on a 75/25 split of (x, y); print and return the test accuracy.
+
+    res is accepted but not used here.
+    """
+    Xtrain, Xtest, ytrain, ytest = train_test_split(x, y,test_size=0.25, random_state=0)
+    # 1-D value arrays are turned into a single 2-D column before fitting.
+    Xtrain, Xtest = (a if len(a.shape) >= 2 else a.reshape(-1, 1) for a in (Xtrain.values, Xtest.values))
+    model.fit(Xtrain,ytrain)
+    score = accuracy_score(ytest, model.predict(Xtest))  # computed once, then printed and returned
+    print(score)
+    return score
+
+def bestModelFinder(datas):
+    """Print the best record and per-model accuracy statistics.
+
+    datas holds [model name, columns, accuracy] records; any name other
+    than 'KNN' is counted as a tree result.
+    """
+    # mean/variance/stdev take a sequence of scores, so collect every score
+    # rather than a running sum (a summed float is not iterable).
+    scores = {"Knn": [], "Tree": []}
+    for record in datas:
+        scores["Knn" if record[0] == 'KNN' else "Tree"].append(record[2])
+    # First record with the highest accuracy; None when datas is empty.
+    res = max(datas, key=lambda record: record[2]) if datas else None
+    print("BEST CHOICE IS :", res)
+    nan = float("nan")
+    for label, values in scores.items():
+        # mean needs at least one score; variance and stdev need two.
+        print(f"{label} mean accuracy_score : ", mean(values) if values else nan)
+        print(f"{label} variance accuracy_score : ", variance(values) if len(values) > 1 else nan)
+        print(f"{label} ecart-type accuracy_score : ", stdev(values) if len(values) > 1 else nan)
+
+
+# Script entry: load ../data.csv, run allModels on it, then print the summary from bestModelFinder.
+df = read_dataset("../data.csv")
+bestModelFinder(allModels(df))