Natural language processing (NLP) is a subfield of computer science that impacts everyone in daily life. One of the most prominent uses of natural language processing is found in Gmail, where spam emails are automatically filtered using NLP in combination with classification models.
This assignment attempts to distinguish between fake news and real news by building a classification model. We believe it is imperative to maintain a high level of accuracy when creating a fake news detector, so that all fake news gets filtered out while authentic news is not affected by the filter.
# Basic Libraries
import pandas as pd
import numpy as np
import re
# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sb
import plotly.plotly as py
import plotly.graph_objs as go
import seaborn as sn
from wordcloud import WordCloud
import cufflinks as cf
# NLTK
from nltk.corpus import stopwords
from textblob import Word
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
# Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import train_test_split
from sklearn import svm, grid_search
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
# Warning and Disable
import warnings
warnings.filterwarnings("ignore")
The given Train dataset has following fields.
The given Test dataset contains ID, title and text columns
The data_copy dataset does not repair the 33 out-of-place rows; it drops them instead. It will be used to train classification models without that repair step, to ascertain whether dropping those rows produces better results.
# Read the raw training and test CSVs, then snapshot each frame so the
# "repair" and "drop" cleaning variants work on separate objects.
train_csv = 'fake_or_real_news_training.csv'
test_csv = 'fake_or_real_news_test.csv'
data, data_test = pd.read_csv(train_csv), pd.read_csv(test_csv)
data_copy, data_test_copy = data.copy(), data_test.copy()
Each of the two training datasets contains 9 duplicate rows. These duplicates are removed from both training datasets.
def find_duplicates():
    """Print duplicate counts for the module-level ``data`` and ``data_copy``.

    Three kinds of duplicates are reported for each frame, always counting
    every repeat after the first occurrence: repeated titles, repeated texts,
    and fully repeated rows (compared on text, title, label, X1 and X2, and
    restricted to rows whose ``text`` is not null).

    Returns:
        None. The counts are written to stdout.
    """
    all_fields = ["text", "title", "label", "X1", "X2"]

    # (message, frame to inspect, column(s) that define a duplicate)
    checks = [
        ("Duplicates in Title field of data = ", data, "title"),
        ("Duplicates in Title field of data_copy = ", data_copy, "title"),
        ("Duplicates in Text field of data = ", data, "text"),
        ("Duplicates in Text field of data_copy = ", data_copy, "text"),
        ("Duplicate rows in data = ", data[data.text.notnull()], all_fields),
        (
            "Duplicate rows in data_copy = ",
            data_copy[data_copy.text.notnull()],
            all_fields,
        ),
    ]

    for message, frame, subset in checks:
        repeated = frame.duplicated(subset=subset, keep="first")
        print(message, int(repeated.sum()))
# Report duplicates, drop fully repeated rows from both training frames in
# place (keeping the first occurrence), then report again to confirm.
find_duplicates()
row_identity = ["text", "title", "label", "X1", "X2"]
for frame in (data, data_copy):
    frame.drop_duplicates(subset=row_identity, keep="first", inplace=True)
print("Number of duplicates after removal")
find_duplicates()
In 33 rows the "label" column does not hold a label; instead it contains data that belongs to the 'text' field
# A row is out of place when its "label" cell is anything other than one of
# the two valid classes (missing values count as out of place too).
out_of_place_rows_label = ~data["label"].isin(["FAKE", "REAL"])
print("Number of rows that have out-of-place data:", int(out_of_place_rows_label.sum()))
31 out of 33 labels missing from the 'label' column are in 'X1' column
# Rows whose class value sits in the X1 column instead of "label".
out_of_place_rows_X1 = data["X1"].isin(["FAKE", "REAL"])
print("Number of rows that have out-of-place data:", int(out_of_place_rows_X1.sum()))
2 out of 33 labels missing from 'label' column are in 'X2' column
# Rows whose class value sits in the X2 column instead of "label".
out_of_place_rows_X2 = data["X2"].isin(["FAKE", "REAL"])
print("Number of rows that have out-of-place data:", int(out_of_place_rows_X2.sum()))
In order to fix the out-of-place labels identified above, the following steps are taken:
class_names = ["FAKE", "REAL"]

# Rows where the class value landed in X1: seed the new 'target' column
# with those values (all other rows start out as NaN).
idx_X1_label = data["X1"].isin(class_names)
data.loc[idx_X1_label, "target"] = data.loc[idx_X1_label, "X1"].values

# Rows where the class value landed in X2: copy it into 'target' as well,
# then clear X2 so it no longer carries a class value.
idx_X2_label = data["X2"].isin(class_names)
data.loc[idx_X2_label, "target"] = data.loc[idx_X2_label, "X2"].values
data.loc[idx_X2_label, "X2"] = np.nan

# Any 'target' cell that is still empty is filled from the 'label' column.
data["target"] = data["target"].fillna(data["label"])

print("Missing values in 'target' field: ", int(data["target"].isnull().sum()))
print("Proportion of fake/real news in training data:\n", data["target"].value_counts())
In order to fix the out-of-place news text, the following steps are taken:
# Blank out the REAL/FAKE values in X1 so that only spilled-over news text
# remains there when the columns are merged below.
data.loc[idx_X1_label, "X1"] = np.nan

# Do the same for 'label': after this it only holds text that spilled over
# from the 'text' field (the class itself already lives in 'target').
idx_label = (data["label"] == "FAKE") | (data["label"] == "REAL")
data.loc[idx_label, "label"] = np.nan

# Replace NaN with "" in X1 and label so the join below does not insert the
# literal string "nan".  The result is assigned back to the column instead
# of calling fillna(inplace=True) on `data.X1`: the in-place form operates
# on an intermediate Series and is not guaranteed to update `data`
# (it is a no-op under pandas Copy-on-Write).
data["X1"] = data["X1"].fillna("")
data["label"] = data["label"].fillna("")

# Merge title, text, label and X1 into a single text column, "feature".
data["feature"] = data[["title", "text", "label", "X1"]].apply(
    lambda x: ". ".join(x.astype(str)), axis=1
)
The second version of train dataset does not include the 33 rows that had out-of-place labels and news text
# Keep only rows where nothing spilled into X1 ...
data_copy = data_copy[data_copy.X1.isnull()]
# ... and nothing spilled into X2.  An explicit copy is taken because a new
# "feature" column is assigned to data_copy_clean later; without it that
# assignment targets a filtered slice of data_copy (SettingWithCopy).
data_copy_clean = data_copy[data_copy.X2.isnull()].copy()
In order to have title and text in the same column in second version of train dataset, the two columns are joined and represented as one column, "feature"
# Build the single text column "feature" as "<title>. <text>" for every row.
title_and_text = data_copy_clean[["title", "text"]].astype(str)
data_copy_clean["feature"] = title_and_text.apply(". ".join, axis=1)
In order to have identical data shape in test and train datasets, the following code combines "title" and "text" fields in the test dataset
# Give both test frames the same "feature" column as the training data:
# "<title>. <text>" for every row.
for frame in (data_test, data_test_copy):
    frame["feature"] = frame[["title", "text"]].astype(str).apply(". ".join, axis=1)
The first transformation is to convert all the words in lower case across both train datasets and both test data sets
This transformation was not used in the final model because it decreases the score. The reason is that most fake news articles are written with capitalised text, and the model can use this characteristic of fake news to better distinguish between real and fake news
# # data and data_copy_clean dataset
# data["feature"] = data["feature"].apply(lambda x: x.lower())
# data_copy_clean["feature"] = data_copy_clean["feature"].apply(lambda x: x.lower())
# # data_test and data_test_copy dataset
# data_test["feature"] = data_test["feature"].apply(lambda x: x.lower())
# data_test_copy["feature"] = data_test_copy["feature"].apply(lambda x: x.lower())
Second transformation is to remove all the html tags from both train datasets and both test datasets
# Remove anything of the form <...> (non-greedy) from the "feature" text of
# both training frames and both test frames.
html_tag = re.compile(r"<.*?>")
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].map(lambda text: html_tag.sub("", text))
Third transformation is to remove all numeric characters from both train datasets and both test datasets
# Delete every run of digits from the "feature" text of both training
# frames and both test frames.
digit_run = re.compile(r"\d+")
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].map(lambda text: digit_run.sub("", text))
Fourth transformation is to remove punctuation from both train datasets and both test datasets
# Replace every character that is neither a word character nor whitespace
# with a space, on both training frames and both test frames.
# regex=True is passed explicitly: on pandas versions where str.replace
# defaults to literal matching, the pattern would otherwise be searched for
# as the literal text "[^\w\s]" and no punctuation would be removed.
non_word_pattern = r"[^\w\s]"
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].str.replace(non_word_pattern, " ", regex=True)
Fifth transformation is to remove all non-alphabetic characters from both train datasets and both test datasets. The non-alphabetic characters include numeric values and special characters
# Replace every character outside A-Z / a-z with a space, on both training
# frames and both test frames.
# regex=True is passed explicitly: on pandas versions where str.replace
# defaults to literal matching, the character class would otherwise be
# treated as plain text and nothing would be replaced.
non_alpha_pattern = r"[^A-Za-z]"
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].str.replace(non_alpha_pattern, " ", regex=True)
Sixth transformation is to remove all the stop words from both train datasets and both test datasets. Stop words in english are words such as, the, are, etc.
# English stop-word list from NLTK.  `stop` stays a list because it is
# reused later (passed to WordCloud); membership tests below go through a
# frozenset so each token lookup is O(1) instead of a scan of the list.
stop = stopwords.words("english")
stop_lookup = frozenset(stop)

# NOTE(review): the comparison is case-sensitive and the lowercasing step is
# commented out above, so capitalised forms of stop words are presumably
# kept — confirm this is intended.
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].apply(
        lambda text: " ".join(
            token for token in text.split() if token not in stop_lookup
        )
    )
Seventh transformation is to remove words that have fewer than 2 characters from both train datasets and both test datasets. It is a manual way of removing stop words that are not captured by the NLTK stop-word list used in the previous step.
# Keep only tokens of two or more characters in the "feature" text of both
# training frames and both test frames.
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].map(
        lambda text: " ".join([token for token in text.split() if len(token) >= 2])
    )
Eighth and last transformation is to reduce every word to its root form across both train datasets and test datasets. For example, 'hiring', 'hired', and 'hire', all belong to the same root word 'hire'. Lemmatization is a technique to reduce every word to its root word.
def _lemmatize_text(text):
    """Return `text` with every whitespace-separated token lemmatized."""
    return " ".join(Word(token).lemmatize() for token in text.split())


# Lemmatize the "feature" text of both training frames and both test frames.
for frame in (data, data_copy_clean, data_test, data_test_copy):
    frame["feature"] = frame["feature"].map(_lemmatize_text)
A word cloud identifies the most frequent words in the dataset. Since the most frequent words will appear in both fake and real news, they will not help in distinguishing one from the other. Therefore, these words will be dropped from the corpus
# Build the word cloud from the full corpus.  The documents are joined into
# one string explicitly: str(data["feature"]) would only yield the printed
# summary of the Series (index numbers, an ellipsis in place of most rows,
# and the "Name/dtype" footer), so frequencies would be computed on a small,
# polluted sample.
corpus_text = " ".join(data["feature"].astype(str))
wordcloud = WordCloud(
    background_color='white',
    stopwords=stop,
    max_words=200,
    max_font_size=40,
    random_state=42,
).generate(corpus_text)
print(wordcloud)

# Render the cloud without axes.
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
The list below consists of the words whose relative frequency in the word cloud is at least 0.4, i.e. words that occur at least 40% as often as the most frequent word. Models will therefore be trained with and without these words to determine whether removing them gives a better result
# Keep the word-cloud entries whose weight in `words_` is at least 0.4.
word_weights = wordcloud.words_
dict_wordcloud = [word for word, weight in word_weights.items() if weight >= 0.4]
# Bare expression: displays the list when run as a notebook cell.
dict_wordcloud
The graph below shows the top 20 bigrams from the corpus. These bigrams are useful for identifying what type of news is in the corpus. This graph shows that the corpus contains news regarding American politics. We plan to use this information for modelling in the future after understanding the distribution of the different 'buzz' words in American politics. Getting a good overview of the relevant 'buzz' words is crucial to developing a thorough understanding of what is going on.
def get_top_n_bigram(corpus, n=None):
    """Return the most frequent bigrams in `corpus` with their total counts.

    Args:
        corpus: Iterable of text documents accepted by ``CountVectorizer``.
        n: Number of bigrams to return; ``None`` returns all of them.

    Returns:
        A list of ``(bigram, count)`` tuples sorted by count, highest first.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    # Column-wise totals: one count per vocabulary entry across all documents.
    totals = vectorizer.fit(corpus).transform(corpus).sum(axis=0)
    ranked = sorted(
        [(bigram, totals[0, column]) for bigram, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 bigrams of the repaired training corpus, as a (bigram, count) table.
common_words = get_top_n_bigram(data['feature'], 20)
df3 = pd.DataFrame(common_words, columns=['feature', 'count'])

# Total count per bigram, largest first, drawn as a bar chart.
bigram_totals = df3.groupby('feature').sum()['count'].sort_values(ascending=False)
bigram_totals.to_frame().iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in news',
)
The following code splits both training datasets into train and test sets. The test sets hold 30% of the data. This held-out portion will help in validating the models
# Same 70/30 split and seed for both variants of the training data.
split_options = {"test_size": 0.3, "random_state": 42}

# Repaired dataset: features vs. the reconstructed 'target' column.
X_train, X_test, y_train, y_test = train_test_split(
    data["feature"], data["target"], **split_options
)

# Dataset with out-of-place rows dropped: features vs. the original 'label'.
X_train_copy, X_test_copy, y_train_copy, y_test_copy = train_test_split(
    data_copy_clean["feature"], data_copy_clean["label"], **split_options
)
# Naive Bayes on raw token counts, tuned on the repaired training split.
nb_cvec_pipeline = Pipeline(
    steps=[("cvec", CountVectorizer()), ("nb", MultinomialNB())]
)

# Search space: n-gram span of the vectorizer and NB smoothing strength.
pipe_params_nb_cvec = dict(
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    nb__alpha=[0.01, 0.05, 0.1],
)

model_nb_cvec_gs = GridSearchCV(
    estimator=nb_cvec_pipeline, param_grid=pipe_params_nb_cvec, cv=5
)
model_nb_cvec_gs.fit(X_train, y_train)

print("Best Train score: ", model_nb_cvec_gs.best_score_)
print("Test score: ", model_nb_cvec_gs.score(X_test, y_test))
print("Best parameters: ", model_nb_cvec_gs.best_params_)
# Naive Bayes on raw token counts, tuned on the split that excludes the
# out-of-place rows.  Rebinds the names used by the previous experiment.
nb_cvec_pipeline = Pipeline(
    steps=[("cvec", CountVectorizer()), ("nb", MultinomialNB())]
)

# Search space: n-gram span of the vectorizer and NB smoothing strength.
pipe_params_nb_cvec = dict(
    cvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    nb__alpha=[0.01, 0.05, 0.1],
)

model_nb_cvec_gs = GridSearchCV(
    estimator=nb_cvec_pipeline, param_grid=pipe_params_nb_cvec, cv=5
)
model_nb_cvec_gs.fit(X_train_copy, y_train_copy)

print("Best Train score: ", model_nb_cvec_gs.best_score_)
print("Test score: ", model_nb_cvec_gs.score(X_test_copy, y_test_copy))
print("Best parameters: ", model_nb_cvec_gs.best_params_)
pac = PassiveAggressiveClassifier()

# Pipeline: raw token counts feeding a Passive-Aggressive classifier.
# GridSearchCV clones the pipeline, so `pac` itself is never fitted.
pipe = Pipeline([
    ('pac_CV', CountVectorizer()),
    ('pac', pac),
])

# Search space.  C is the maximum step size of each update: C=0 was removed
# from the grid because it means "never update the weights" (a degenerate
# model) and newer scikit-learn releases reject it as an invalid value.
pipe_params = {
    'pac_CV__max_df': [0.8, 0.85, 0.9],
    'pac__C': [0.5, 0.75, 1],
    'pac__loss': ['hinge', 'squared_hinge'],
}

pac_gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
pac_gs.fit(X_train, y_train)

print("Best score:", pac_gs.best_score_)
print("Test score", pac_gs.score(X_test, y_test))
print("Best parameters: ", pac_gs.best_params_)
pac = PassiveAggressiveClassifier()

# Pipeline: raw token counts feeding a Passive-Aggressive classifier,
# evaluated on the split that excludes the out-of-place rows.
# GridSearchCV clones the pipeline, so `pac` itself is never fitted.
pipe = Pipeline([
    ('pac_CV', CountVectorizer()),
    ('pac', pac),
])

# Search space.  C is the maximum step size of each update: C=0 was removed
# from the grid because it means "never update the weights" (a degenerate
# model) and newer scikit-learn releases reject it as an invalid value.
pipe_params = {
    'pac_CV__max_df': [0.8, 0.85, 0.9],
    'pac__C': [0.5, 0.75, 1],
    'pac__loss': ['hinge', 'squared_hinge'],
}

pac_gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
pac_gs.fit(X_train_copy, y_train_copy)

print("Best score:", pac_gs.best_score_)
print("Test score", pac_gs.score(X_test_copy, y_test_copy))
print("Best parameters: ", pac_gs.best_params_)
Excluding the out-of-place rows gives a worse score than including the out-of-place rows. Therefore, all the following models include out-of-place rows and will be tested with and without most frequent words found in WordCloud
# Naive Bayes on TF-IDF features, with the most frequent word-cloud terms
# excluded via stop_words.
# The terms are lowercased first: TfidfVectorizer lowercases every token
# before the stop-word check (lowercase=True by default), while the text fed
# to the word cloud was not lowercased, so a mixed-case entry would never
# match and would silently stay in the vocabulary.
frequent_words = sorted({word.lower() for word in dict_wordcloud})
nb_tvec_pipeline = Pipeline(
    [("tvect", TfidfVectorizer(stop_words=frequent_words)), ("nb", MultinomialNB())]
)

# Search space (mostly fixed values; only the n-gram span varies).
pipe_params_nb_tvec = {
    'tvect__max_df': [0.95],
    'tvect__max_features': [50000],
    'tvect__ngram_range': [(1, 2), (1, 3)],
    "nb__alpha": [0.002],
}

model_nb_tvec_gs = GridSearchCV(
    nb_tvec_pipeline, param_grid=pipe_params_nb_tvec, cv=5, verbose=True
)
model_nb_tvec_gs.fit(X_train, y_train)

print("Best score: ", model_nb_tvec_gs.best_score_)
print("Test score: ", model_nb_tvec_gs.score(X_test, y_test))
print("Best parameters: ", model_nb_tvec_gs.best_params_)
# Naive Bayes on TF-IDF features, this time without any stop-word list.
# Rebinds the names used by the previous experiment.
nb_tvec_pipeline = Pipeline(
    steps=[("tvect", TfidfVectorizer()), ("nb", MultinomialNB())]
)

# Search space (mostly fixed values; only the n-gram span varies).
pipe_params_nb_tvec = dict(
    tvect__max_df=[0.95],
    tvect__max_features=[50000],
    tvect__ngram_range=[(1, 2), (1, 3)],
    nb__alpha=[0.002],
)

model_nb_tvec_gs = GridSearchCV(
    estimator=nb_tvec_pipeline,
    param_grid=pipe_params_nb_tvec,
    cv=3,
    verbose=True,
)
model_nb_tvec_gs.fit(X_train, y_train)

print("Best score: ", model_nb_tvec_gs.best_score_)
print("Test score: ", model_nb_tvec_gs.score(X_test, y_test))
print("Best parameters: ", model_nb_tvec_gs.best_params_)
# SGD-trained linear classifier on TF-IDF features, with the most frequent
# word-cloud terms excluded via stop_words.
# The terms are lowercased first: TfidfVectorizer lowercases every token
# before the stop-word check (lowercase=True by default), so a mixed-case
# entry would never match and would silently stay in the vocabulary.
frequent_words = sorted({word.lower() for word in dict_wordcloud})
svm_pipeline = Pipeline([
    ('tvec', TfidfVectorizer(stop_words=frequent_words)),
    ('svm_clf', SGDClassifier(random_state=42))
])

# NOTE(review): `n_iter` and loss='log' are the spellings of the older
# scikit-learn release this notebook targets; newer releases use `max_iter`
# and 'log_loss' — confirm the installed version before upgrading.
pipe_params_svm_tfidf = {
    'tvec__ngram_range': ([(1,1),(1,2),(1,3)]),
    'svm_clf__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],  # multiplier of the regularisation term
    'svm_clf__n_iter': [1000],  # number of epochs
    'svm_clf__loss': ['log'],  # logistic regression (despite the "svm" names)
    'svm_clf__penalty': ['l2'],
    'svm_clf__n_jobs': [-1]
}

svm_pipeline_gs = GridSearchCV(svm_pipeline, param_grid=pipe_params_svm_tfidf, cv=3)
svm_pipeline_gs.fit(X_train, y_train)

print("Best score: ", svm_pipeline_gs.best_score_)
print("Test score: ", svm_pipeline_gs.score(X_test, y_test))
print("Best parameters: ", svm_pipeline_gs.best_params_)
# SGD-trained linear classifier on TF-IDF features, without a stop-word
# list.  Rebinds the names used by the previous experiment.
svm_pipeline = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('svm_clf', SGDClassifier(random_state=41)),
    ]
)

# NOTE(review): `n_iter` and loss='log' are the spellings of the older
# scikit-learn release this notebook targets — confirm before upgrading.
pipe_params_svm_tfidf = dict(
    tvec__ngram_range=[(1, 1), (1, 2), (1, 3)],
    # Multiplier of the regularisation term.
    svm_clf__alpha=[1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
    # Number of epochs.
    svm_clf__n_iter=[1000],
    # Logistic-regression loss (despite the "svm" names).
    svm_clf__loss=['log'],
    svm_clf__penalty=['l2'],
    svm_clf__n_jobs=[-1],
)

svm_pipeline_gs = GridSearchCV(
    estimator=svm_pipeline, param_grid=pipe_params_svm_tfidf, cv=3
)
svm_pipeline_gs.fit(X_train, y_train)

print("Best score: ", svm_pipeline_gs.best_score_)
print("Test score: ", svm_pipeline_gs.score(X_test, y_test))
print("Best parameters: ", svm_pipeline_gs.best_params_)
# Passive-Aggressive classifier on TF-IDF features, without a stop-word list.
pipe = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('pac', PassiveAggressiveClassifier(random_state=41)),
    ]
)

# Search space: document-frequency cap and n-gram span vary, the rest is fixed.
pipe_params = dict(
    tvec__max_df=[0.95, 0.90],
    tvec__max_features=[50000],
    tvec__ngram_range=[(1, 3), (1, 4)],
    pac__C=[0.75],
    pac__loss=['squared_hinge'],
)

pac_gs = GridSearchCV(estimator=pipe, param_grid=pipe_params, cv=3)
pac_gs.fit(X_train, y_train)

print("Best score:", pac_gs.best_score_)
print("Test score", pac_gs.score(X_test, y_test))
print("Best parameters: ", pac_gs.best_params_)
# Passive-Aggressive classifier on TF-IDF features, with the most frequent
# word-cloud terms excluded via stop_words.  This is the model whose
# predictions are exported afterwards.
# The terms are lowercased first: TfidfVectorizer lowercases every token
# before the stop-word check (lowercase=True by default), so a mixed-case
# entry would never match and would silently stay in the vocabulary.
frequent_words = sorted({word.lower() for word in dict_wordcloud})
pipe = Pipeline([('tvec', TfidfVectorizer(stop_words=frequent_words)),
                 ('pac', PassiveAggressiveClassifier(random_state=42))])

# Search space: document-frequency cap and n-gram span vary, the rest is fixed.
pipe_params = {'tvec__max_df': [0.95, 0.90],
               'tvec__max_features': [50000],
               'tvec__ngram_range': [(1, 3), (1, 4)],
               'pac__C': [0.75],
               'pac__loss': ['squared_hinge']}

pac_gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3)
pac_gs.fit(X_train, y_train)

print("Best score:", pac_gs.best_score_)
print("Test score", pac_gs.score(X_test, y_test))
print("Best parameters: ", pac_gs.best_params_)
# Predict the class of every test article with the last fitted grid search
# and store it next to the article in the test frame.
final_pred = pac_gs.predict(data_test['feature'])
data_test["label"] = final_pred

# Export the (ID, label) pairs under the submission column names.  to_csv
# is called with its defaults, so the frame's index is written as well.
predictions = data_test[['ID', 'label']].rename(
    columns={'ID': 'News_id', 'label': 'prediction'}
)
predictions.to_csv("Predictions_Furqan_Wieland.csv")