parent a41ee5e5fd
commit 140e2dd5c7
@@ -0,0 +1,12 @@
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
import numpy as np


def metrics(y_test, y_pred):
    # Compute accuracy, the confusion matrix, and a per-class report for one set of predictions.
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    class_report = classification_report(y_test, y_pred)

    return accuracy, conf_matrix, class_report
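A minimal usage sketch for this helper; the toy label arrays below are illustrative, not taken from the repository:

y_true = [0, 1, 1, 0, 1]   # hypothetical ground-truth labels
y_pred = [0, 1, 0, 0, 1]   # hypothetical model predictions

accuracy, conf_matrix, class_report = metrics(y_true, y_pred)
print(accuracy)      # 0.8 (4 of 5 predictions correct)
print(conf_matrix)   # 2x2 matrix of true/false positives and negatives
print(class_report)  # per-class precision, recall and F1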
@@ -0,0 +1,28 @@
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier


def logistic_regression(X_train, y_train, X_test):
    # Fit a logistic regression model and return its predictions on the test set.
    logistic = LogisticRegression(max_iter=100000)
    logistic.fit(X_train, y_train)

    return logistic.predict(X_test)


def decision_tree(X_train, y_train, X_test):
    # Fit a decision tree classifier and return its predictions on the test set.
    tree = DecisionTreeClassifier()
    tree.fit(X_train, y_train)

    return tree.predict(X_test)


def knn_classifier(X_train, y_train, X_test):
    # Fit a k-nearest-neighbours classifier (k=5) and return its predictions on the test set.
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    return knn.predict(X_test)


def sgd_classifier(X_train, y_train, X_test):
    # Fit a linear SVM (hinge loss, L2 penalty) with stochastic gradient descent and return its predictions.
    sgd = SGDClassifier(loss="hinge", penalty="l2")
    sgd.fit(X_train, y_train)

    return sgd.predict(X_test)
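Because all four helpers share the same (X_train, y_train, X_test) signature, they can be swapped or looped over; a sketch, assuming the split arrays come from the process() function added below:

models = {
    "logistic_regression": logistic_regression,
    "decision_tree": decision_tree,
    "knn": knn_classifier,
    "sgd": sgd_classifier,
}
for name, train_and_predict in models.items():
    y_pred = train_and_predict(X_train, y_train, X_test)  # each helper fits and predicts in one call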
@@ -1,4 +1,46 @@
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


def process():
    # Full preprocessing pipeline: load the dataset, encode text columns, select features, and split.
    df = load_datas()
    df = tokenize_datas(df)
    X, y = features_selection(df)
    X_train, X_test, y_train, y_test = split_df(X, y)

    return X_train, X_test, y_train, y_test


def load_datas():
    return pd.read_csv("../datas/FakeNewsNet.csv")


def tokenize_datas(df):
    # Label-encode the three text columns, then replace the originals with their integer codes.
    le = LabelEncoder()
    url_label = le.fit_transform(df['news_url'])
    title_label = le.fit_transform(df['title'])
    domain_label = le.fit_transform(df['source_domain'])

    df.drop("news_url", axis=1, inplace=True)
    df.drop("title", axis=1, inplace=True)
    df.drop("source_domain", axis=1, inplace=True)

    df["news_url"] = url_label
    df["title"] = title_label
    df["source_domain"] = domain_label

    return df


def features_selection(df):
    # Use the encoded text columns as features and the "real" column as the target.
    features = ["title", "news_url", "source_domain"]

    return df[features].fillna(''), df["real"]


def split_df(X, y):
    # 70/30 train/test split with a fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

    return X_train, X_test, y_train, y_test
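Putting the three new files together, an end-to-end run might look like the sketch below. The module names preprocessing, classifiers and evaluation are assumptions, since the diff does not show the file paths:

from preprocessing import process            # assumed module name for the load/encode/split file
from classifiers import logistic_regression  # assumed module name for the model helpers
from evaluation import metrics               # assumed module name for the metrics helper

X_train, X_test, y_train, y_test = process()            # read FakeNewsNet.csv, encode, split 70/30
y_pred = logistic_regression(X_train, y_train, X_test)  # fit one of the classifiers and predict
accuracy, conf_matrix, class_report = metrics(y_test, y_pred)

print(f"accuracy: {accuracy:.3f}")
print(conf_matrix)
print(class_report)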