From 140e2dd5c76c383a802fc6102fe92672b71d8673 Mon Sep 17 00:00:00 2001
From: axdelafuen
Date: Wed, 7 Feb 2024 10:45:12 +0000
Subject: [PATCH] :sparkles: transfer code from colab (traing model / metrics)

---
# NOTE(review): the lines starting with "#" are reviewer commentary placed in
# the free-form area between "---" and the diffstat; they belong neither to the
# commit message nor to any hunk. No "+", "-" or context line was modified.
#
# Formatting: this patch was restored from a copy whose line breaks and
# indentation had been collapsed. The 4-space indentation and the position of
# the blank context lines are assumptions -- TODO confirm with
# `git apply --check` against the target tree before applying.
#
# Subject: "traing" is presumably a typo for "training"; left untouched because
# it is the recorded commit message.
#
# src/analysis.py
#   - Adds metrics(y_test, y_pred), which returns the tuple
#     (accuracy, conf_matrix, class_report) computed with accuracy_score,
#     confusion_matrix and classification_report.
#   - `from sklearn import metrics` binds the name `metrics` and
#     `def metrics(...)` rebinds it, so the sklearn module is no longer
#     reachable under that name inside analysis.py.
#   - ConfusionMatrixDisplay, plt, learning_curve and np are imported but not
#     used by the code added here.
#
# src/classifier.py
#   - logistic_regression, decision_tree, knn_classifier and sgd_classifier all
#     take (X_train, y_train, X_test), build a fresh estimator, fit it on the
#     training data and return predict(X_test). The fitted estimator itself is
#     not returned.
#   - Hyper-parameters set here: LogisticRegression(max_iter=100000),
#     KNeighborsClassifier(n_neighbors=5),
#     SGDClassifier(loss="hinge", penalty="l2"); DecisionTreeClassifier uses
#     its defaults.
#   - No random_state is passed to DecisionTreeClassifier or SGDClassifier;
#     presumably their results can differ between runs -- confirm whether
#     reproducibility matters here.
#
# src/main.py
#   - simplefilter(action='ignore', category=FutureWarning) sits at module
#     level, outside the __main__ guard, so it also runs when main.py is
#     imported.
#   - Predictions are computed for all four classifiers, but only y_pred_knn is
#     passed to analysis.metrics; y_pred_dt, y_pred_logistic_reg and y_pred_sgd
#     are unused in this patch (the script ends by printing "TODO()").
#
# src/preprocessing.py
#   - process() chains load_datas -> tokenize_datas -> features_selection ->
#     split_df and returns X_train, X_test, y_train, y_test.
#   - load_datas (unchanged context) reads "../datas/FakeNewsNet.csv", a path
#     relative to the current working directory.
#   - tokenize_datas refits one LabelEncoder on 'news_url', 'title' and
#     'source_domain' in turn, then drops those columns in place and re-adds
#     them holding the encoded labels. After the call the encoder object only
#     reflects the last fit ('source_domain').
#   - The encoders are fitted on the whole frame before split_df runs, so test
#     rows take part in fitting the label mapping.
#   - features_selection applies fillna('') after the three feature columns
#     were already replaced by LabelEncoder output, so it cannot affect missing
#     values in the original text columns. NOTE(review): if the CSV contains
#     missing values they reach LabelEncoder unfilled -- confirm intended order.
#   - split_df uses test_size=0.30 and random_state=42.
#
 src/analysis.py      | 12 ++++++++++++
 src/classifier.py    | 28 ++++++++++++++++++++++++++++
 src/main.py          | 33 ++++++++++++++++++++++++++++++---
 src/preprocessing.py | 42 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 112 insertions(+), 3 deletions(-)

diff --git a/src/analysis.py b/src/analysis.py
index e69de29..d2bfc93 100644
--- a/src/analysis.py
+++ b/src/analysis.py
@@ -0,0 +1,12 @@
+from sklearn import metrics
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
+import matplotlib.pyplot as plt
+from sklearn.model_selection import learning_curve
+import numpy as np
+
+def metrics(y_test, y_pred):
+    accuracy = accuracy_score(y_test, y_pred)
+    conf_matrix = confusion_matrix(y_test, y_pred)
+    class_report = classification_report(y_test, y_pred)
+
+    return accuracy, conf_matrix, class_report
diff --git a/src/classifier.py b/src/classifier.py
index e69de29..91a6d5a 100644
--- a/src/classifier.py
+++ b/src/classifier.py
@@ -0,0 +1,28 @@
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.linear_model import SGDClassifier
+
+def logistic_regression(X_train, y_train, X_test):
+    logistic = LogisticRegression(max_iter = 100000)
+    logistic.fit(X_train,y_train)
+
+    return logistic.predict(X_test)
+
+def decision_tree(X_train, y_train, X_test):
+    decisionTree = DecisionTreeClassifier()
+    decisionTree = decisionTree.fit(X_train,y_train)
+
+    return decisionTree.predict(X_test)
+
+def knn_classifier(X_train, y_train, X_test):
+    knn = KNeighborsClassifier(n_neighbors=5)
+    knn.fit(X_train, y_train)
+
+    return knn.predict(X_test)
+
+def sgd_classifier(X_train, y_train, X_test):
+    sgd = SGDClassifier(loss="hinge", penalty="l2")
+    sgd.fit(X_train, y_train)
+
+    return sgd.predict(X_test)
diff --git a/src/main.py b/src/main.py
index e7cf413..31eb69a 100644
--- a/src/main.py
+++ b/src/main.py
@@ -2,8 +2,35 @@ import preprocessing
 import classifier
 import analysis
 
+from warnings import simplefilter
+simplefilter(action='ignore', category=FutureWarning)
+
 if __name__ == '__main__':
-    print("Start learning... :)")
+    print("Start learning...")
+
+    X_train, X_test, y_train, y_test = preprocessing.process()
+    print("\nPre-processing... OK")
+
+    print("\nTraining models...")
+
+    y_pred_knn = classifier.knn_classifier(X_train, y_train, X_test)
+    print("Knn... OK")
+
+    y_pred_dt = classifier.decision_tree(X_train, y_train, X_test)
+    print("DecisionTree... OK")
+
+    y_pred_logistic_reg = classifier.logistic_regression(X_train, y_train, X_test)
+    print("Logistic Regression... OK")
+
+    y_pred_sgd = classifier.sgd_classifier(X_train, y_train, X_test)
+    print("SGD... OK")
+
+    print("\nMetrics calculations...")
+
+    print("\n--------------Knn metrics---------------")
+    knn_accuracy, knn_conf_matrix, knn_class_report = analysis.metrics(y_test, y_pred_knn)
+    print(f'Accuracy: {knn_accuracy}')
+    print(f'Confusion Matrix:\n{knn_conf_matrix}')
+    print(f'Classification Report:\n{knn_class_report}')
 
-    df = preprocessing.load_datas()
-    df.head(5)
+    print("\nTODO()")
diff --git a/src/preprocessing.py b/src/preprocessing.py
index 3a7d517..e010468 100644
--- a/src/preprocessing.py
+++ b/src/preprocessing.py
@@ -1,4 +1,46 @@
 import pandas as pd
 
+from sklearn.preprocessing import LabelEncoder
+
+from sklearn.model_selection import train_test_split
+
+def process():
+    df = load_datas()
+
+    df = tokenize_datas(df)
+
+    X, y = features_selection(df)
+
+    X_train, X_test, y_train, y_test = split_df(X, y)
+
+    return X_train, X_test, y_train, y_test
+
+
 def load_datas():
     return pd.read_csv("../datas/FakeNewsNet.csv")
+
+def tokenize_datas(df):
+    le = LabelEncoder()
+    label = le.fit_transform(df['news_url'])
+    label1=le.fit_transform(df['title'])
+    label2=le.fit_transform(df['source_domain'])
+    df.drop("news_url", axis=1, inplace=True)
+    df.drop("title", axis=1, inplace=True)
+    df.drop("source_domain", axis=1, inplace=True)
+
+    df["news_url"] = label
+    df["title"] = label1
+    df["source_domain"] = label2
+
+    return df
+
+def features_selection(df):
+    features = ["title", "news_url", "source_domain"]
+
+    return df[features].fillna(''), df["real"]
+
+
+def split_df(X, y):
+    X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30, random_state=42)
+    return X_train, X_test, y_train, y_test
+