diff --git a/README.md b/README.md index 5ce1ba8..37dc186 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # Plotabit Dataset link: https://www.kaggle.com/datasets/fedesoriano/stellar-classification-dataset-sdss17 Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/ +Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/ ## Columns @@ -20,3 +21,4 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network- - [ ] Train models with just 20000 "GALAXY" class (has an impact?) - [ ] Which model is the best, ratio learn_time/precision - [ ] Can we drop more categories and have same results (useless data?) +- [ ] Compare predictions with the y_test values that were misclassified diff --git a/src/main.py b/src/main.py index 85d5b75..1b46856 100755 --- a/src/main.py +++ b/src/main.py @@ -4,37 +4,52 @@ import pandas as pd import matplotlib.pyplot as plt import sklearn as sk -# Classification - -## KNN +from sklearn import svm +from sklearn.linear_model import SGDClassifier +from sklearn.tree import DecisionTreeClassifier +from sklearn.neighbors import KNeighborsClassifier +from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy_score -from sklearn.neighbors import KNeighborsClassifier from sklearn.model_selection import train_test_split -from sklearn.tree import DecisionTreeClassifier -from sklearn.ensemble import RandomForestClassifier -#charger les données -df=pd.read_csv('../data.csv') -# Clear datas -# alpha delta u g r i z redshift spec_OBJ_ID -# Y : class -x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1) -y = df['class'].values +# Open dataset with pandas +def read_dataset(filename): + df = pd.read_csv(filename) + return df + +# Drop useless columns and return x and y +def get_xy_from_dataset(filename): + df = read_dataset(filename) + x = 
df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1) + y = df['class'].values + return x, y + +x, y = get_xy_from_dataset("data.csv") x.hist() -plt.show() +#plt.show() -print(" Rentre un chiffre:\n\n1 - KNN\n2 - Tree\n3- RandomForestClassifier") +print("""Choose a model: +(1) - KNN +(2) - Tree +(3) - RandomForestClassifier +(4) - SGD +(5) - Linear SVC""") res = int(input()) -if(res == 1): + +if (res == 1): model = KNeighborsClassifier() -elif(res == 2): +elif (res == 2): model = DecisionTreeClassifier(random_state=0, max_depth=20) -elif(res == 3): +elif (res == 3): model = RandomForestClassifier(n_estimators=100 ,criterion='entropy') +elif (res == 4): + model = SGDClassifier(max_iter=1000, tol=0.01) +elif (res == 5): + model = svm.SVC(kernel='linear', C = 1.0) else: raise Exception('RENTRE LE BON NOMBRE GROS CON')