Aurian JAULT 1 year ago
commit 917dd02b86

@ -5,6 +5,8 @@ Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-
Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
AI Plot data: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/
## Columns
|Keep |Skip |
@ -24,3 +26,19 @@ Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-
- [ ] Which model is the best (ratio learn_time/precision)
- [ ] Can we drop more categories and keep the same results (useless data?)
- [ ] Compare predictions with y_test for the cases that were false (see the sketch below)
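For the last item, a minimal sketch of how the misclassified test samples could be listed, assuming the `ypredict` and `ytest` arrays produced in `training()`; the helper name `show_misclassified` is only illustrative and not part of the repo:

```python
import numpy as np

# Illustrative helper: print every prediction that disagrees with y_test.
def show_misclassified(ypredict, ytest):
    ypredict = np.asarray(ypredict)
    ytest = np.asarray(ytest)
    wrong = ypredict != ytest                  # boolean mask of misclassified samples
    for pred, expected in zip(ypredict[wrong], ytest[wrong]):
        print("predicted", pred, "expected", expected)
    print(wrong.sum(), "misclassified out of", len(ytest))
```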
## Dataset
We decided to use a dataset from Kaggle. It contains 100,000 rows, each describing an observed stellar object
through several features such as its declination, the observed colour bands and other scientific measurements.
Each row is labelled with a class, which can be "QSO" (a quasar), "Galaxy" or "Star".
Our first step was to inspect the dataset for missing values.
Using `df.info()` we can get basic information about the data: no values are missing.
We can now look at the class distribution, which is quite unbalanced: roughly 60,000 galaxies,
21,000 stars and 19,000 quasars. We can conclude that galaxies are more common, but could this
imbalance affect the accuracy of our model?
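As a minimal sketch of this first inspection step, assuming `data.csv` is the Kaggle file sitting next to the script (as in `main()`):

```python
import pandas as pd

# Load the Kaggle dataset and run the basic checks described above.
df = pd.read_csv('data.csv')
df.info()                            # column types and non-null counts: no missing values
print(df['class'].value_counts())    # class balance: ~60k Galaxy, ~21k Star, ~19k QSO
```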
## Plot
The plots still need to be analysed.

@ -26,15 +26,26 @@ from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import numpy as np  # used by plotCorrelationMatrix below
import autosklearn.classification  # used by auto_sklearn below
from pandas.plotting import scatter_matrix
from sklearn.metrics import confusion_matrix
# main
def main():
    # plotAll()
    # auto_sklearn()
    # dftmp = pd.read_csv('data.csv')
    # dftmp.info()
    # print(dftmp['class'].value_counts())
    # User input
    opt = prompt_display()
    model = model_switch(opt)
    # Get interesting data
    df, x, y = read_dataset('data.csv')
    # rfecv_test(x, y, RandomForestClassifier())
    # Train model
@ -43,13 +54,10 @@ def main():
# Open dataset with pandas, drop the identifier columns and return the dataframe, features x and labels y
def read_dataset(filename):
    df = pd.read_csv(filename)
    x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
    y = df['class'].values
    return df, x, y
# Ask for model choice
def prompt_display():
@ -80,7 +88,7 @@ def model_switch(choice):
    elif (choice == 7):
        model = NearestCentroid()
    elif (choice == 8):
        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3))
    else:
        raise Exception('Wrong entry')
@ -125,7 +133,8 @@ def training(model, x, y):
    model.fit(Xtrain,ytrain)
    ypredict = model.predict(Xtest)
    # confusion_matrix(ytest, ypredict)
    # os.system("clear")
    res = -1
    while(res != 0):
@ -134,13 +143,13 @@ def training(model, x, y):
        res = int(input())
        if(res == 1):
            os.system("clear")
            printStatValues(ypredict,ytest)
        elif(res == 2):
            os.system("clear")
            printPredictedValues(ypredict,ytest)
        elif res == 3:
            os.system("clear")
            print(accuracy_score(ytest, ypredict))
        elif res == 0:
            break
        else:
@ -266,7 +275,59 @@ def bestModel(datas):
print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model]) print("Best model : ",model," columns : ",res[0]," Accuracy : ", res[1][model])
print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model]) print("Worst model : ",modelMin," columns : ",resMin[0]," Accuracy : ", resMin[1][model])
# Let auto-sklearn search for the best model and report its test accuracy
def auto_sklearn():
    df, x, y = read_dataset('data.csv')
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
    X_train = X_train.values
    X_test = X_test.values
    if len(X_train.shape) < 2:
        X_train = X_train.reshape(-1, 1)
    if len(X_test.shape) < 2:
        X_test = X_test.reshape(-1, 1)
    cls = autosklearn.classification.AutoSklearnClassifier()
    cls.fit(X_train, y_train)
    y_hat = cls.predict(X_test)
    print("Accuracy score", accuracy_score(y_test, y_hat))
# Draw every exploratory plot on the 9 feature columns kept by read_dataset
def plotAll():
    df, x, y = read_dataset('data.csv')
    plotHistograms(x)
    plotDensity(x)
    plotBoxWhisker(x)
    plotCorrelationMatrix(x)
    plotScatterMatrix(x)

# One histogram per feature
def plotHistograms(df):
    df.hist()
    plt.show()

# Density plot per feature
def plotDensity(df):
    df.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
    plt.show()

# Box-and-whisker plot per feature
def plotBoxWhisker(df):
    df.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
    plt.show()

# Correlation matrix between the features
def plotCorrelationMatrix(df):
    correlations = df.corr()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    cax = ax.matshow(correlations, vmin=-1, vmax=1)
    fig.colorbar(cax)
    ticks = np.arange(0,9,1)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(list(df))
    ax.set_yticklabels(list(df))
    plt.show()

# Pairwise scatter plots
def plotScatterMatrix(df):
    scatter_matrix(df)
    plt.show()
# Show the distribution of stellar objects in the database
#showData(df)
