From b0c32a3acb0eb473a58e7c14c85a27acbe1665c2 Mon Sep 17 00:00:00 2001 From: axdelafuen Date: Mon, 12 Feb 2024 10:52:35 +0000 Subject: [PATCH] :sparkles: add classifier to web app, can now detect fake news from app --- src/app/src/classifier.py | 28 +++++++++++++++ src/app/src/html/assets/favicon.svg | 3 ++ src/app/src/html/home.html | 50 +++++++++++++------------- src/app/src/models.py | 31 +++++++++++++++++ src/app/src/preprocessing.py | 54 +++++++++++++++++++++++++++++ src/app/src/views.py | 14 +++++++- src/main.py | 2 ++ src/preprocessing.py | 2 ++ 8 files changed, 159 insertions(+), 25 deletions(-) create mode 100644 src/app/src/classifier.py create mode 100644 src/app/src/html/assets/favicon.svg create mode 100644 src/app/src/preprocessing.py diff --git a/src/app/src/classifier.py b/src/app/src/classifier.py new file mode 100644 index 0000000..fa8ce1d --- /dev/null +++ b/src/app/src/classifier.py @@ -0,0 +1,28 @@ +from sklearn.neighbors import KNeighborsClassifier +from sklearn.linear_model import LogisticRegression +from sklearn.tree import DecisionTreeClassifier +from sklearn.linear_model import SGDClassifier + +def logistic_regression(X_train, y_train, X_test): + logistic = LogisticRegression(max_iter = 100000) + logistic.fit(X_train,y_train) + + return logistic.predict(X_test), logistic + +def decision_tree(X_train, y_train, X_test): + decisionTree = DecisionTreeClassifier() + decisionTree = decisionTree.fit(X_train,y_train) + + return decisionTree.predict(X_test), decisionTree + +def knn_classifier(X_train, y_train, X_test): + knn = KNeighborsClassifier(n_neighbors=5) + knn.fit(X_train, y_train) + + return knn.predict(X_test), knn + +def sgd_classifier(X_train, y_train, X_test): + sgd = SGDClassifier(loss="hinge", penalty="l2") + sgd.fit(X_train, y_train) + + return sgd.predict(X_test), sgd diff --git a/src/app/src/html/assets/favicon.svg b/src/app/src/html/assets/favicon.svg new file mode 100644 index 0000000..5c0be60 --- /dev/null +++ b/src/app/src/html/assets/favicon.svg @@ -0,0 +1,3 @@ + + 📰 + diff --git a/src/app/src/html/home.html b/src/app/src/html/home.html index 6f756d6..4b7c53f 100644 --- a/src/app/src/html/home.html +++ b/src/app/src/html/home.html @@ -8,10 +8,12 @@ +
-

Fake News Detector

+

Fake News Detector

Enter title and url of the news:

{% csrf_token %} {{ form }}
+

+ {{ result }} +

diff --git a/src/app/src/models.py b/src/app/src/models.py index ccda823..6bb3ec6 100644 --- a/src/app/src/models.py +++ b/src/app/src/models.py @@ -1,4 +1,5 @@ from django.db import models +from urllib.parse import urlparse # Create your models here. @@ -12,3 +13,33 @@ class Text(models.Model): def __str__(self): return self.title +def get_domain(url): + parsed_url = urlparse(url) + if parsed_url.netloc.startswith('www.'): + return parsed_url.netloc[4:] + else: + return parsed_url.netloc + +from .preprocessing import * +from .classifier import * +import pandas as pd +import numpy as np + +def prediction(title, url): + domain = get_domain(url) + + input_df = pd.DataFrame({'title': title, 'news_url': url, 'source_domain': domain}, index = ['1']) + concat_df = pd.concat([load_datas(), input_df], ignore_index=True) + + input_df_tokenized = tokenize_datas(concat_df).tail(1) + input_df_tokenized.drop("tweet_num", axis=1, inplace=True) + input_df_tokenized.drop("real", axis=1, inplace=True) + + #return input_df_tokenized + + X, y = no_split_process() + + prediction, knn = knn_classifier(X, y, input_df_tokenized) + + return prediction + diff --git a/src/app/src/preprocessing.py b/src/app/src/preprocessing.py new file mode 100644 index 0000000..c713ed9 --- /dev/null +++ b/src/app/src/preprocessing.py @@ -0,0 +1,54 @@ +import pandas as pd + +from sklearn.preprocessing import LabelEncoder + +from sklearn.model_selection import train_test_split + +def process(): + df = load_datas() + + df = tokenize_datas(df) + + X, y = features_selection(df) + + X_train, X_test, y_train, y_test = split_df(X, y) + + return X_train, X_test, y_train, y_test + +def no_split_process(): + df = load_datas() + + df = tokenize_datas(df) + + X, y = features_selection(df) + + return X, y + +def load_datas(): + return pd.read_csv("../../datas/FakeNewsNet.csv") + +def tokenize_datas(df): + le = LabelEncoder() + label = le.fit_transform(df['news_url']) + label1=le.fit_transform(df['title']) + label2=le.fit_transform(df['source_domain']) + df.drop("news_url", axis=1, inplace=True) + df.drop("title", axis=1, inplace=True) + df.drop("source_domain", axis=1, inplace=True) + + df["news_url"] = label + df["title"] = label1 + df["source_domain"] = label2 + + return df + +def features_selection(df): + features = ["title", "news_url", "source_domain"] + + return df[features].fillna(''), df["real"] + + +def split_df(X, y): + X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.30, random_state=42) + return X_train, X_test, y_train, y_test + diff --git a/src/app/src/views.py b/src/app/src/views.py index 9627e54..426ea89 100644 --- a/src/app/src/views.py +++ b/src/app/src/views.py @@ -2,14 +2,26 @@ from django.shortcuts import render, redirect # Create your views here. from .forms import TextForm +from .models import * def index(request): if request.method == 'POST': form = TextForm(request.POST) if form.is_valid(): + # get datas from the news title = form.cleaned_data["title"] url = form.cleaned_data["url"] - return redirect("index") # Rediriger vers une page d'accueil ou une autre vue + + # get result from model + if(prediction(title, url) == 1): + result = "This is not fake news !" + else: + result = "It's a Fake News !!!" + + # reset form + form = TextForm() + + return render(request, 'home.html', {'form':form, 'result':result}) else: form = TextForm() return render(request, 'home.html', {'form': form}) diff --git a/src/main.py b/src/main.py index bbb5456..d18aae6 100644 --- a/src/main.py +++ b/src/main.py @@ -5,6 +5,8 @@ import analysis from warnings import simplefilter simplefilter(action='ignore', category=FutureWarning) +import pandas as pd + if __name__ == '__main__': print("Start learning...") diff --git a/src/preprocessing.py b/src/preprocessing.py index e010468..ceee027 100644 --- a/src/preprocessing.py +++ b/src/preprocessing.py @@ -32,6 +32,8 @@ def tokenize_datas(df): df["title"] = label1 df["source_domain"] = label2 + print(df) + return df def features_selection(df):