From cf44fa3b0e4ceeeed8779e2b86c3d7b7a092b972 Mon Sep 17 00:00:00 2001
From: rem
Date: Mon, 12 Feb 2024 09:14:27 +0100
Subject: [PATCH] Add plots and try auto_sklearn

---
 README.md   |  1 +
 src/main.py | 84 ++++++++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 72 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 80ea39f..2758434 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,7 @@
 Deep Learning: https://machinelearningmastery.com/tutorial-first-neural-network-
 
 Python Machine Learning: https://machinelearningmastery.com/machine-learning-in-python-step-by-step/
+Data visualization: https://machinelearningmastery.com/visualize-machine-learning-data-python-pandas/
 
 ## Columns
 |Keep |Skip |
diff --git a/src/main.py b/src/main.py
index ae29bd2..ea5aa66 100755
--- a/src/main.py
+++ b/src/main.py
@@ -26,15 +26,22 @@ from sklearn.feature_selection import RFECV
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.pipeline import Pipeline
 
+import matplotlib.pyplot as plt
+import autosklearn.classification
+from pandas.plotting import scatter_matrix
+from sklearn.metrics import confusion_matrix
+
 # main
 def main():
+    # plotAll()
+    # auto_sklearn()
+
     # User input
     opt = prompt_display()
     model = model_switch(opt)
 
     # Get interesting data
-    df = read_dataset("data.csv")
-    x, y = get_xy_from_dataframe(df)
+    df, x, y = read_dataset('data.csv')
 
     # rfecv_test(x, y, RandomForestClassifier())
     # Train model
@@ -43,14 +50,11 @@ def main():
 
 # Open dataset with panda
 def read_dataset(filename):
     df = pd.read_csv(filename)
-    return df
-
-# Drop useless columns and return x and y
-def get_xy_from_dataframe(df):
     x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'],axis=1)
     y = df['class'].values
-    return x, y
+    return df, x, y
+
 # Ask for model choice
 def prompt_display():
     print("""Choose a model:
@@ -80,7 +84,7 @@ def model_switch(choice):
     elif (choice == 7):
         model = NearestCentroid()
     elif (choice == 8):
-        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(100,80,60,40,20,10,3))
+        model = MLPClassifier(solver='adam', alpha=1e-5, random_state=1, activation="logistic", hidden_layer_sizes=(1000, 300, 100, 30, 10, 3))
     else:
         raise Exception('Wrong entry')
 
@@ -124,8 +128,9 @@ def training(model, x, y):
         Xtest = Xtest.reshape(-1, 1)
 
     model.fit(Xtrain,ytrain)
-
-    ypredit = model.predict(Xtest)
+
+    ypredict = model.predict(Xtest)
+    # confusion_matrix(ytest, ypredict)
     # os.system("clear")
     res = -1
     while(res != 0):
@@ -134,13 +139,13 @@
         res = int(input())
         if(res == 1):
            os.system("clear")
-            printStatValues(ypredit,ytest)
+            printStatValues(ypredict,ytest)
         elif(res == 2):
             os.system("clear")
-            printPredictedValues(ypredit,ytest)
+            printPredictedValues(ypredict,ytest)
         elif res == 3:
             os.system("clear")
-            print(accuracy_score(ytest, ypredit))
+            print(accuracy_score(ytest, ypredict))
         elif res == 0:
             break
         else:
@@ -236,5 +241,58 @@ def bestModelFinder(datas):
     print("Tree variance accuracy_score : ", variance(treeMean))
     print("Tree ecart-type accuracy_score : ", stdev(treeMean))
 
+def auto_sklearn():
+    _, x, y = read_dataset('data.csv')
+    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
+    X_train = X_train.values
+    X_test = X_test.values
+
+    if len(X_train.shape) < 2:
+        X_train = X_train.reshape(-1, 1)
+    if len(X_test.shape) < 2:
+        X_test = X_test.reshape(-1, 1)
+
+    cls = autosklearn.classification.AutoSklearnClassifier()
+    cls.fit(X_train, y_train)
+    y_hat = cls.predict(X_test)
+    print("Accuracy score", accuracy_score(y_test, y_hat))
+
+def plotAll():
+    _, x, _ = read_dataset('data.csv')
+
+    plotHistograms(x)
+    plotDensity(x)
+    plotBoxWhisker(x)
+    plotCorrelationMatrix(x)
+    plotScatterMatrix(x)
+
+def plotHistograms(df):
+    df.hist()
+    plt.show()
+
+def plotDensity(df):
+    df.plot(kind='density', subplots=True, layout=(3,3), sharex=False)
+    plt.show()
+
+def plotBoxWhisker(df):
+    df.plot(kind='box', subplots=True, layout=(3,3), sharex=False, sharey=False)
+    plt.show()
+
+def plotCorrelationMatrix(df):
+    correlations = df.corr()
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    cax = ax.matshow(correlations, vmin=-1, vmax=1)
+    fig.colorbar(cax)
+    ticks = np.arange(0,9,1)
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    ax.set_xticklabels(list(df))
+    ax.set_yticklabels(list(df))
+    plt.show()
+
+def plotScatterMatrix(df):
+    scatter_matrix(df)
+    plt.show()
 
 main()
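
Note on the auto_sklearn() path above: AutoSklearnClassifier() with default settings searches for roughly an hour. A quicker smoke test could look like the sketch below (a sketch only, assuming auto-sklearn is installed and data.csv is in the working directory; the two time-budget arguments are auto-sklearn's standard knobs, shortened here purely for illustration):

    import pandas as pd
    import autosklearn.classification
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import train_test_split

    # Same feature selection as read_dataset() in src/main.py
    df = pd.read_csv('data.csv')
    x = df.drop(['obj_ID','field_ID','run_ID','rerun_ID','cam_col','plate','MJD','fiber_ID','class'], axis=1)
    y = df['class'].values
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

    # Bound the search so the whole run finishes in a couple of minutes
    cls = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,  # total search budget, in seconds
        per_run_time_limit=30,        # budget per candidate pipeline
    )
    cls.fit(X_train, y_train)
    print("Accuracy score", accuracy_score(y_test, cls.predict(X_test)))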