add support for Linear SVC and SGD model

master
remrem 1 year ago
parent f9f942ad42
commit fc899e7bee

@@ -1,6 +1,7 @@
 # Plotabit
 Dataset link: https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17
 Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
+Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
 ## Columns
@@ -20,3 +21,4 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-
 - [ ] Train models with just 20000 "GALAXY" class (has an impact?)
 - [ ] Which model is the best, ratio learn_time/precision
 - [ ] Can we drop more categories and have same results (useless data?)
+- [ ] Compare the wrong predictions against their y_test values (see the sketch below)
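For that last item, a minimal sketch (not part of the commit) of how the mispredicted test rows could be pulled out and compared with `y_test`, assuming a fitted `model` and an `x_test`/`y_test` split as used elsewhere in the project; the variable names are illustrative only:

```python
# Illustrative only: list the test rows where the model disagrees with y_test.
y_pred = model.predict(x_test)
wrong = y_pred != y_test                  # boolean mask of misclassified samples

errors = x_test[wrong].copy()             # keep only the mispredicted rows
errors['expected'] = y_test[wrong]        # true class from y_test
errors['predicted'] = y_pred[wrong]       # class the model chose
print(errors)
```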

@@ -4,37 +4,52 @@ import pandas as pd
 import matplotlib.pyplot as plt
 import sklearn as sk
-# Classification
-## KNN
-from sklearn.neighbors import KNeighborsClassifier
+from sklearn import svm
+from sklearn.linear_model import SGDClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import accuracy_score
 from sklearn.model_selection import train_test_split
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.ensemble import RandomForestClassifier
-# load the data
-df=pd.read_csv('../data.csv')
-# Clean the data
-# alpha delta u g r i z redshift spec_OBJ_ID
-# Y: class
-x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
-y = df['class'].values
+# Open the dataset with pandas
+def read_dataset(filename):
+    df = pd.read_csv(filename)
+    return df
+
+# Drop useless columns and return x and y
+def get_xy_from_dataset(filename):
+    df = read_dataset(filename)
+    x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
+    y = df['class'].values
+    return x, y
+
+x, y = get_xy_from_dataset("data.csv")
 x.hist()
-plt.show()
+#plt.show()
-print(" Enter a number:\n\n1 - KNN\n2 - Tree\n3 - RandomForestClassifier")
+print("""Choose a model:
+(1) - KNN
+(2) - Tree
+(3) - RandomForestClassifier
+(4) - SGD
+(5) - Linear SVC""")
 res = int(input())
-if(res == 1):
+if (res == 1):
     model = KNeighborsClassifier()
-elif(res == 2):
+elif (res == 2):
     model = DecisionTreeClassifier(random_state=0, max_depth=20)
-elif(res == 3):
+elif (res == 3):
     model = RandomForestClassifier(n_estimators=100, criterion='entropy')
+elif (res == 4):
+    model = SGDClassifier(max_iter=1000, tol=0.01)
+elif (res == 5):
+    model = svm.SVC(kernel='linear', C=1.0)
 else:
     raise Exception('Please enter a valid number (1-5)')
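The hunk imports `train_test_split` and `accuracy_score` but ends before they are used. As a rough sketch (not part of the commit) of how the selected `model` would typically be trained and scored on the `x`, `y` returned by `get_xy_from_dataset`, with the split parameters chosen here purely for illustration:

```python
# Illustrative only: train the chosen classifier and report test accuracy.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

model.fit(x_train, y_train)               # learn from the training split
y_pred = model.predict(x_test)            # predict classes for the held-out split
print("Accuracy:", accuracy_score(y_test, y_pred))
```

Note that option (5) instantiates `svm.SVC(kernel='linear', C=1.0)`; scikit-learn also provides `sklearn.svm.LinearSVC`, a liblinear-based estimator that is usually much faster on a dataset of this size, so it may be what the commit title's "Linear SVC" intends. `SGDClassifier` and the SVMs are also sensitive to feature scaling, so a `StandardScaler` step before fitting is worth considering.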
