Link to this notebook uploaded on Colab
Link to this notebook uploaded on Github
Link to this notebook uploaded on Google Drive
The Python environment should be Python 3.6.
from textblob import TextBlob
import sys,os,time
import matplotlib.pyplot as plt
import json
import demoji
import pandas as pd
import numpy as np
import preprocessor as pp
import re
import spacy
import gc
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from IPython.display import clear_output
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Categorical Naive Bayes
from sklearn.naive_bayes import CategoricalNB
# Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
# SVM
from sklearn.svm import SVC
# Linear Model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Grid Search
from sklearn.model_selection import GridSearchCV
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
import seaborn as sns
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from IPython.display import display, Markdown
import threading
from sklearn.model_selection import StratifiedKFold
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import warnings
import torch
import torch.nn.functional as F
import torch.nn as nn
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential, Model
import pickle
# plot 3D PCA using plotly
from plotly.offline import iplot
import plotly.graph_objs as go
import pyLDAvis
import pyLDAvis.sklearn
import multiprocessing
import cufflinks as cf
from tqdm import tqdm
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
pyLDAvis.enable_notebook()
# tqdm.pandas(desc='Progress')
warnings.filterwarnings("ignore")
def printmd(string):
display(Markdown(string))
%matplotlib inline
class Normalizer(BaseEstimator, TransformerMixin):
def __init__(self, options):
self.verbose = False
if isinstance(options, tuple):
options, self.verbose = options
self.nlp = spacy.load("en_core_web_sm")
pp.set_options(pp.OPT.URL)
if 'l' not in options and 's' not in options and 'r' not in options and 'c' not in options:
print("Options: (r | l | s | c)")
raise ValueError("options must include at least one of: r, l, s, c")
char_map = {'l': 'Lemmatization', 's': 'Stemming', 'r': 'Removal of Stopwords', 'c': 'Clean Text'}
if self.verbose:
printmd("## Using " +
" + ".join([f"{char_map[option]}" for option in options]))
self.options = options
self.stemmer = SnowballStemmer(language='english')
def lemmatize_text(self, current_text):
lemmatized_text_text = []
if self.verbose:
printmd("## Lemmatizing text")
if type(current_text) != str:
current_text = " ".join(current_text)
doc = self.nlp(current_text)
for token in doc:
lemmatized_text_text.append(token.lemma_)
if self.verbose:
printmd(f'''
| **text** | **Lemmatized text** |
| --- | -- |
| {current_text} | {lemmatized_text_text} |
''')
return lemmatized_text_text
def stemitize_text(self, current_text):
if self.verbose:
printmd("## Stemming text")
stemitized_text_text = []
if type(current_text) == str:
current_text = current_text.split()
for token in current_text:
stemitized_text_text.append(self.stemmer.stem(token))
if self.verbose:
printmd(f'''
| **text** | **text after Stemming** |
| --- | -- |
| {current_text} | {stemitized_text_text} |
''')
return stemitized_text_text
def remove_stopwords(self, current_text):
stopwords_removed_text = []
if type(current_text) == str:
current_text = current_text.split()
for word in current_text:
if word not in self.nlp.Defaults.stop_words:
stopwords_removed_text.append(word)
return stopwords_removed_text
def remove_html_tags(self, current_text):
return re.sub('<[^<]+?>', ' ', current_text)
def remove_urls(self, current_text):
return re.sub(r'http\S+', ' ', current_text)
def remove_punctuation(self, current_text):
return re.sub('[^a-zA-Z]', ' ', current_text)
def remove_numbers(self, current_text):
return re.sub('[0-9]', ' ', current_text)
def remove_non_ascii(self, current_text):
return re.sub(r'[^\x00-\x7F]+', ' ', current_text)
def clean_text(self, current_text):
current_text = current_text.replace("'", "")
current_text = self.remove_urls(current_text)
current_text = self.remove_html_tags(current_text)
current_text = self.remove_punctuation(current_text)
current_text = self.remove_numbers(current_text)
current_text = self.remove_non_ascii(current_text)
current_text = current_text.lower()
# remove single characters
current_text = re.sub('\s[a-zA-Z]\s', ' ', current_text)
# replace multiple spaces with a single space
current_text = re.sub(' +', ' ', current_text)
# remove leading and trailing spaces
current_text = current_text.strip()
return current_text
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
lemmatize_text_list = []
for index, datum in tqdm(X.items()):
cleaned_text = datum
# Check for the options
for c in self.options:
if type(cleaned_text) != str:
cleaned_text = " ".join(cleaned_text)
if c == "l":
# Lemmatize the text
cleaned_text = self.lemmatize_text(
current_text=cleaned_text)
elif c == 's':
# Stem the text
cleaned_text = self.stemitize_text(
current_text=cleaned_text)
elif c == 'r':
# Remove stopwords
cleaned_text = self.remove_stopwords(
current_text=cleaned_text)
elif c == 'c':
# Clean the text
cleaned_text = self.clean_text(
current_text=cleaned_text)
if type(cleaned_text) != str:
# As this is a list, join to make a string again
normalized_text = " ".join(cleaned_text)
else:
normalized_text = cleaned_text
# Append the normalized text to lemmatize_text_list
lemmatize_text_list.append(normalized_text)
if self.verbose:
printmd(
f"## text after Normalization \n### {normalized_text}\n---\n")
X = lemmatize_text_list
return X
train = pd.read_csv('data/train.csv')
normalizer = Normalizer(options='cr')
train_slice = train.head(10)
train_slice['Normalized_Review_text'] = normalizer.fit_transform(train_slice['Review_text'])
10it [00:00, 9861.99it/s]
train_slice.loc[1, 'Review_text']
'*****<br />Numi\'s Collection Assortment Melange includes:<br />5 Herbal Teas (caffeine-free, also called "teasans"):<br />* Dry Desert Lime: Lime Herbal Teasan<br />* Fields of Gold: Lemongrass Herbal Teasan<br />* Red Mellow Bush: Rooibos Herbal Teasan<br />* Bushmen\'s Brew: Honeybush Herbal Teasan<br />* Simply Mint: Moroccan Mint<br /><br />2 Green Teas (lower in caffeine):<br />* Temple of Heaven: Gunpowder Green Tea<br />* Monkey King: Jasmine Green Tea<br /><br />2 Black Teas (contain caffeine):<br />* Chinese Breakfast: Yunnan Black Tea<br />* Smoky Tarry: Lapsang Souchong Black Tea<br /><br />This is a total of nine different teas, two tea bags of each one in each box. Numi teas are known for their high-quality, organic and kosher ingredients, and in my opinion, are some of the tastiest and best teas I have ever tried. They do not include artificial ingredients or flavorings.<br /><br />On the box, the manufacturer writes: "From mist-covered mountains to sun-drenched deserts to fertile fields, we proudly bring you our tea palette. The flavors range from smooth earthiness, and light floral scents, to refreshingly sweet and sour notes. What they all share is the recollection of how some Ancient One tamed fire and water to coexist harmoniously, and steeped in them the gifts of Nature. Since that distant past, people have soothed, roused, healed, explored and celebrated with these wonderful infusions. We encourage you to do the same with Numi\'s finest."<br /><br />The price is perfect, a big savings over single boxes. If you like Numi teas and want to sample a wonderful assortment, get it---you won\'t be sorry. There may be some you like and others you don\'t, but sampling them will be a delightful experience. If you are new to Numi this is an excellent way to sample their high-quality teas. I do not think you could find a better source of premium tea than Numi.<br /><br />Highly recommended.<br />***** Organic, Kosher, Tasty Assortment of Premium Teas & Teasans'
train_slice.loc[1, 'Normalized_Review_text']
'numis collection assortment melange includes herbal teas caffeine free called teasans dry desert lime lime herbal teasan fields gold lemongrass herbal teasan red mellow bush rooibos herbal teasan bushmens brew honeybush herbal teasan simply mint moroccan mint green teas lower caffeine temple heaven gunpowder green tea monkey king jasmine green tea black teas contain caffeine chinese breakfast yunnan black tea smoky tarry lapsang souchong black tea total different teas tea bags box numi teas known high quality organic kosher ingredients opinion tastiest best teas tried include artificial ingredients flavorings box manufacturer writes mist covered mountains sun drenched deserts fertile fields proudly bring tea palette flavors range smooth earthiness light floral scents refreshingly sweet sour notes share recollection ancient tamed fire water coexist harmoniously steeped gifts nature distant past people soothed roused healed explored celebrated wonderful infusions encourage numis finest price perfect big savings single boxes like numi teas want sample wonderful assortment wont sorry like dont sampling delightful experience new numi excellent way sample high quality teas think find better source premium tea numi highly recommended organic kosher tasty assortment premium teas teasans'
clean = Normalizer(options='cr')  # cleans the text and removes stopwords
cleaned_df = pd.DataFrame(clean.fit_transform(train['Review_text']), columns=['cleaned_text'])
cleaned_df['Score'] = train['Score']
cleaned_df.to_csv('data/cleaned_text.csv', index=False)
df_cleaned = pd.read_csv('data/cleaned_text.csv') # load the dataset with cleaned text
# typecasting cleaned_text to string
df_cleaned['cleaned_text'] = df_cleaned['cleaned_text'].astype(str)
df_cleaned.head()
|   | Score | cleaned_text |
|---|---|---|
| 0 | 5 | received product early seller tastey great mid... |
| 1 | 5 | numis collection assortment melange includes h... |
| 2 | 5 | careful overcook pasta making sure bite minute... |
| 3 | 5 | buying multi pack misled picture hazel nuts pr... |
| 4 | 5 | bars good loved warmed definitely think great ... |
print(df_cleaned.shape)
dropped_count = df_cleaned.shape[0] - df_cleaned.drop_duplicates().shape[0]
df_cleaned['Score'].value_counts().plot(
kind='pie', autopct='%1.1f%%', startangle=90, figsize=(20, 10))
df_cleaned['Score'].value_counts().iplot(kind='bar')
plt.show()
df_cleaned.drop_duplicates(inplace=True)
print(df_cleaned.shape)
df_cleaned['Score'].value_counts().plot(
kind='pie', autopct='%1.1f%%', startangle=90, figsize=(20, 10))
df_cleaned['Score'].value_counts().iplot(kind='bar')
plt.show()
printmd(f"# Dropped {dropped_count} duplicate rows")
from collections import Counter
counter=Counter(' '.join(df_cleaned['cleaned_text'].tolist()).split())
most=counter.most_common()
x, y= [], []
for word,count in most[:20]:
x.append(word)
y.append(count)
plt.figure(figsize=(20, 10))
p = sns.barplot(x=y, y=x)
p.set_title("For the entire data", fontsize=40)
plt.show()
for i in range(1,6):
print(f"{i} star reviews")
print(df_cleaned[df_cleaned['Score']==i].shape[0])
counter = Counter(
' '.join(df_cleaned[df_cleaned['Score'] == i]['cleaned_text'].tolist()).split())
most = counter.most_common()
x, y = [], []
for word, count in most[:20]:
x.append(word)
y.append(count)
plt.figure(figsize=(20, 10))
p = sns.barplot(x=y, y=x)
p.set_title(f"{i} star reviews", fontsize=40)
plt.show()
1 star reviews 28519
2 star reviews 16375
3 star reviews 23256
4 star reviews 43912
5 star reviews 196425
import pandas_profiling
# Perform an advanced visualisation using the pandas_profiling library
pandas_profiling.ProfileReport(df_cleaned)
# Make the dataset balanced with stratification
df_balanced = df_cleaned.groupby('Score').apply(
lambda x: x.sample(n=df_cleaned['Score'].value_counts().min()))
df_balanced = df_balanced.reset_index(drop=True)
df_balanced['Score'].value_counts().iplot(kind='bar')
plt.show()
# make wordcloud for each score
def make_wordcloud(df, label, mask_path):
words = ' '.join(df[df['Score'] == label]
['cleaned_text'].values.astype('U'))
mask = np.array(Image.open(f'masks/{mask_path}'))
wordcloud = WordCloud(background_color='white',
mask=mask, stopwords=STOPWORDS, min_font_size=5, width=4000, height=4000, contour_color='#141414', contour_width=5).generate(words)
# plot the WordCloud image
plt.figure(figsize=(10, 10), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.title(f"Score {i}", fontsize=40)
plt.tight_layout(pad=0)
plt.savefig(f"wordclouds/wordcloud_score_{i}.png", dpi=1200)
plt.show()
masks = ['pizza_mask.png', 'burger_mask.png', 'drink_mask.png', 'samosa_mask.png', 'icecream_mask.png']
for i in range(1,6):
make_wordcloud(df_balanced, i, masks[i-1])
normalizer = Normalizer(options='lc')
df_cleaned['l_text'] = normalizer.fit_transform(df_cleaned['cleaned_text'])
normalizer = Normalizer(options='sc')
df_cleaned['s_text'] = normalizer.fit_transform(df_cleaned['cleaned_text'])
df_cleaned.to_csv('data/preprocessed_data.csv', index=False)
Since wordcloud generation is time-consuming, we have saved the wordclouds in the wordclouds folder and display them below.
df = pd.read_csv('data/preprocessed_data.csv', encoding='utf-8')
df['l_text'] = df['l_text'].astype(str)
df['s_text'] = df['s_text'].astype(str)
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# sample data
df_sample = df.groupby('Score').apply(
lambda x: x.sample(n=int(x.shape[0]*0.1)))
for vectorizer in [CountVectorizer, TfidfVectorizer]:
vect = vectorizer()
X = vect.fit_transform(df_sample.cleaned_text).todense()
pca = PCA(n_components=2).fit(X)
data2D = pca.transform(X)
fig, ax = plt.subplots()
fig.set_size_inches(20, 10)
ax.set_title(f'2D PCA {vect.__class__.__name__}', fontsize=40)
sns.scatterplot(data2D[:, 0], data2D[:, 1], hue=df_sample.Score,
s=100, ax=ax, legend='full', alpha=0.8)
# set title
fig.show()
fig.savefig(f'{vect.__class__.__name__}_pca.png')
CountVectorizer and TF-IDF give vectors with more than a thousand dimensions for this dataset, where each dimension represents a token (unique word) in our vocabulary. To visualise them in a 2-D space we used Principal Component Analysis (PCA), which linearly projects each vector onto two dimensions.
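As an added sanity check (not part of the original analysis), the share of variance retained by the 2-D projection can be read off PCA's explained_variance_ratio_; the snippet below is a minimal sketch assuming df_sample from the cell above is still in scope.
# Added sketch: how much variance the 2-D PCA projection keeps (df_sample is assumed from the cell above).
vect_check = TfidfVectorizer()
X_check = vect_check.fit_transform(df_sample.cleaned_text).toarray()
pca_check = PCA(n_components=2).fit(X_check)
print(f"Variance explained by the 2 components: {pca_check.explained_variance_ratio_.sum():.3f}")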
# sample data
df_sample = df.groupby('Score').apply(
lambda x: x.sample(n=int(x.shape[0]*0.1)))
# vect = CountVectorizer()
# X = vect.fit_transform(df_sample.cleaned_text).todense()
# pca = PCA(n_components=3).fit(X)
# data3D = pca.transform(X)
# np.save('data/pca_data_countvectorizer.npy', data3D)
data3D = np.load('data/pca_data_countvectorizer.npy')
def plot_3d_pca(data3D, df_sample):
trace = go.Scatter3d(
x=data3D[:, 0],
y=data3D[:, 1],
z=data3D[:, 2],
mode='markers',
marker=dict(
size=10,
color=df_sample.Score,
colorscale='Viridis',
opacity=0.8
)
)
data = [trace]
layout = go.Layout(
margin=dict(
l=0,
r=0,
b=0,
t=0
)
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
plot_3d_pca(data3D, df_sample)
The TfidfVectorizer likewise gives vectors with more than a thousand dimensions for this dataset, where each dimension represents a token (unique word) in our vocabulary. To visualise them in a 3-D space we used PCA, which linearly projects each vector onto three dimensions.
# vect = TfidfVectorizer()
# X = vect.fit_transform(df_sample.cleaned_text).todense()
# pca = PCA(n_components=3).fit(X)
# data3D = pca.transform(X)
# np.save('data/pca_data_tfidfvectorizer.npy', data3D)
data3D = np.load('data/pca_data_tfidfvectorizer.npy')
plot_3d_pca(data3D, df_sample)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn import metrics
import matplotlib.cm as cpm
# vect = TfidfVectorizer()
# X = df_sample['cleaned_text']
# X = vect.fit_transform(X).todense()
# # Using PCA to remove cols which has less co-relation
# sklearn_pca = PCA(n_components=2)
# # fit_transform() is used to scale training data to learn parameters such as
# X = sklearn_pca.fit_transform(X)
X = np.load('data/pca_data.npy')
y = df_sample['Score']
def kmeansSil(X):
range_n_clusters = range(3,8)
admi = []
ars = []
rs = []
dbs = []
homo = []
comp = []
vmeasure = []
fms = []
for n_clusters in range_n_clusters:
# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
ax1.set_xlim([-1, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_jobs=-1, init="k-means++").fit(X)
cluster_labels = clusterer.predict(X)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
# print("Adjusted Mutual Info Score:",metrics.adjusted_mutual_info_score(y, cluster_labels))
admi.append(metrics.adjusted_mutual_info_score(
y, cluster_labels))
# print("Adjusted Rand Score:",metrics.adjusted_rand_score(y, cluster_labels))
ars.append(metrics.adjusted_rand_score(y, cluster_labels))
# print("Rand Score:",metrics.rand_score(y, cluster_labels))
rs.append(metrics.rand_score(y, cluster_labels))
# print("Davies Bouldin Score:",metrics.davies_bouldin_score(X, cluster_labels))
dbs.append(metrics.davies_bouldin_score(X, cluster_labels))
hcv = metrics.homogeneity_completeness_v_measure(
y, cluster_labels)
# print("Homogeneity:",hcv[0])
homo.append(hcv[0])
# print("Completeness:",hcv[1])
comp.append(hcv[1])
# print("V measure:",hcv[2])
vmeasure.append(hcv[2])
# print("Fowlkes-Mallows scores",metrics.fowlkes_mallows_score(y, cluster_labels),'\n')
fms.append(metrics.fowlkes_mallows_score(y, cluster_labels))
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = \
sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cpm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd Plot showing the actual clusters formed
colors = cpm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
c=colors, edgecolor='k')
# Labeling the clusters
centers = clusterer.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
c="white", alpha=1, s=200, edgecolor='k')
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
s=50, edgecolor='k')
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
"with n_clusters = %d" % n_clusters),
fontsize=14, fontweight='bold')
plt.show()
plt.figure(figsize=(20, 10))
plt.plot(range_n_clusters, ars, label='Adjusted Rand')
plt.plot(range_n_clusters, rs, label='Rand')
plt.plot(range_n_clusters, dbs, label='Davies Boulding')
plt.plot(range_n_clusters, homo, label='Homogeneity')
plt.plot(range_n_clusters, comp, label='Completeness')
plt.plot(range_n_clusters, vmeasure, label='V measure')
plt.plot(range_n_clusters, fms, label='Fowlkes-Mallows')
plt.xlabel('N clusters')
plt.ylabel('Evaluation Scores')
plt.legend(loc='upper right')
plt.show()
plt.figure(figsize=(20, 10))
plt.plot(range_n_clusters, admi, label='Adjusted Mutual Info')
plt.xlabel('N clusters')
plt.ylabel('Evaluation Scores')
plt.legend(loc='upper right')
plt.show()
kmeansSil(X)
For n_clusters = 3 The average silhouette_score is : 0.8152353881599693
For n_clusters = 4 The average silhouette_score is : 0.7706989350407105
For n_clusters = 5 The average silhouette_score is : 0.7790039313823386
For n_clusters = 6 The average silhouette_score is : 0.46732705373173083
For n_clusters = 7 The average silhouette_score is : 0.48704312481390705
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
def agglo(X):
for linkages in ["ward"]:
admi = []
ars = []
rs = []
dbs = []
homo = []
comp = []
vmeasure = []
fms = []
# Plotting clusters
for clusters in range(3,8):
agglo = AgglomerativeClustering(linkage=linkages, n_clusters=clusters)
agglo.fit(X)
y_pred = agglo.fit_predict(X)
x_min, x_max = np.min(X, axis=0), np.max(X, axis=0)
X_red = (X - x_min) / (x_max - x_min)
for i in range(X_red.shape[0]):
plt.text(
X_red[i, 0],
X_red[i, 1],
str('*'),
color=plt.cm.nipy_spectral(agglo.labels_[i] / clusters),
fontdict={"weight": "bold", "size": 9},
)
# plt.figure(figsize=(20, 10))
plt.xticks([])
plt.yticks([])
plt.axis("off")
plt.title((f"Agglomerative Clustering for n_clusters = {clusters} and linkage = {linkages}"), fontsize=14, fontweight='bold')
plt.show()
# print("Adjusted Mutual Info Score:",metrics.adjusted_mutual_info_score(y, agglo.labels_))
# print("Adjusted Rand Score:",metrics.adjusted_rand_score(y, agglo.labels_))
# print("Rand Score:",metrics.rand_score(y, agglo.labels_))
# print("Davies Bouldin Score:",metrics.davies_bouldin_score(X, agglo.labels_))
# hcv = metrics.homogeneity_completeness_v_measure(y, agglo.labels_)
# print("Homogeneity:",hcv[0])
# print("Completeness:",hcv[1])
# print("V measure:",hcv[2])
# print("Fowlkes-Mallows scores",metrics.fowlkes_mallows_score(y, agglo.labels_))
admi.append(metrics.adjusted_mutual_info_score(y, agglo.labels_))
ars.append(metrics.adjusted_rand_score(y, agglo.labels_))
rs.append(metrics.rand_score(y, agglo.labels_))
dbs.append(metrics.davies_bouldin_score(X, agglo.labels_))
hcv = metrics.homogeneity_completeness_v_measure(y, agglo.labels_)
homo.append(hcv[0])
comp.append(hcv[1])
vmeasure.append(hcv[2])
fms.append(metrics.fowlkes_mallows_score(y, agglo.labels_))
# PLotting dendrogram
Z = linkage(X, linkages)
plt.figure(figsize=(20, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
dendrogram(Z, truncate_mode='level', p=5)
plt.suptitle((f"Dendrogram for Agglomerative clustering on sample data with linkage = {linkages}" ),
fontsize=14, fontweight='bold')
# Plotting graphs for analysis
plt.figure(figsize=(20, 10))
plt.plot(range(3,8), ars, label='Adjusted Rand')
plt.plot(range(3,8), rs, label='Rand')
plt.plot(range(3,8), dbs, label='Davies Boulding')
plt.plot(range(3,8), homo, label='Homogeneity')
plt.plot(range(3,8), comp, label='Completeness')
plt.plot(range(3,8), vmeasure, label='V measure')
plt.plot(range(3,8), fms, label='Fowlkes-Mallows')
plt.xlabel('N clusters')
plt.ylabel('Evaluation Scores')
plt.legend(loc='upper right')
plt.show()
plt.figure(figsize=(20, 10))
plt.plot(range(3,8), admi, label='Adjusted Mutual Info')
plt.xlabel('N clusters')
plt.ylabel('Evaluation Scores')
plt.legend(loc='upper right')
plt.show()
'''
The above code for agglomerative clustering takes a long time to run.
Hence the results for a previous run are shown below.
'''
# agglo(X)
'\nThe above code for agglomerative clustering takes a long time to run.\nHence the results for a previous run are shown below.\n'
printmd("Helper functions to train and test the models")
def prockfold(dfLematized, index,model, numberOfFolds, optionsName, nGram):
# 5 fold cross validation
kfold = StratifiedKFold(n_splits=numberOfFolds,
shuffle=True, random_state=7)
totalAccuracy = 0
totalFScore = 0
totalConfusion_matrix = None
threads = []
for train_index, test_index in kfold.split(dfLematized[index], dfLematized['Score']):
X_train, X_test = dfLematized.iloc[train_index][index], dfLematized.iloc[test_index][index]
y_train, y_test = dfLematized.iloc[train_index]['Score'], dfLematized.iloc[test_index]['Score']
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
totalAccuracy += accuracy_score(y_test, y_pred)
totalFScore += f1_score(y_test, y_pred, average='macro')
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
y_test, y_pred) if totalConfusion_matrix is not None else confusion_matrix(y_test, y_pred)
fscore = totalFScore/kfold.get_n_splits()
acc_score = totalAccuracy/kfold.get_n_splits()
confusion_matrix_result = totalConfusion_matrix/kfold.get_n_splits()
model_metrics = {"accuracy": acc_score, "f1_score": fscore}
# Save the confusion matrix np save
with open(f"unbalanced_{model['model'].__class__.__name__}_{model['vectorizer'].__class__.__name__}_{optionsName}_{nGram}_confusion_matrix.npy", "wb") as of:
np.save(of, confusion_matrix_result)
# Save the file as json
with open(f"unBalanced_{model['model'].__class__.__name__}_{model['vectorizer'].__class__.__name__}_{optionsName}_{nGram}.json", 'w') as f:
json.dump(model_metrics, f)
def loadModel(modelAlgoName,vectorizerName,optionsName,nGram,path):
# Save the confusion matrix np save
with open(f"{path}/unbalanced_{modelAlgoName}_{vectorizerName}_{optionsName}_{nGram}_confusion_matrix.npy","rb") as of:
cM = np.load(of)
# Save the file as json
with open(f"{path}/unBalanced_{modelAlgoName}_{vectorizerName}_{optionsName}_{nGram}.json", 'r') as f:
model = json.load(f)
return model,cM
def prettyPrintModels(model,cM,modelAlgoName,vectorizerName,optionsName,nGram,numberOfFolds):
printmd("## Trained and Tested Model: " + modelAlgoName +
"\n\t - using " + optionsName + " for tokenization" +
"\n\t - with " + vectorizerName + " as a vectorizer taking " + nGram + " as a single token"+
"\n\t - without stratification on an unbalanced dataset")
printmd("--"*10+"Results" + "--"*10)
printmd(f"- Average Accuracy of {modelAlgoName} across {numberOfFolds}-folds = {model['accuracy']}")
printmd(f"- Average F1-Score of {modelAlgoName} across {numberOfFolds}-folds = {model['f1_score']}")
printmd(f"- Average Confusion Matrix of {modelAlgoName} across {numberOfFolds}-folds:")
# print(model['confusion_matrix'])
sns.heatmap(cM, annot=True)
plt.show()
Helper functions to train and test the models
Naive Bayes is a simple yet powerful and fast algorithm for the given large dataset. It uses Bayes' theorem to estimate membership probabilities for each class: it outputs the likelihood that a given document (food review) belongs to each class, and the class with the highest likelihood is the most likely class of the document. Although Naive Bayes assumes that each word is conditionally independent and therefore does not estimate probabilities accurately, classification is about predicting the correct class rather than precisely estimating probabilities. Moreover, Naive Bayes is robust to irrelevant features and to the definition of a class changing over time. Therefore, as a fast and low-storage baseline model for food review classification we used Naive Bayes. https://www.researchgate.net/publication/266463703_Is_Naive_Bayes_a_Good_Classifier_for_Document_Classification supports our choice, concluding that it is one of the best and most robust classifiers for text compared with other machine learning models.
During training, the Naive Bayes model constructs a vocabulary and calculates the prior probability of each class as well as the probability of each word given a class. These are used to calculate the probability of a class given a set of words, and thus to predict the class for a given review. To vectorize our data we have the following two options:
-> CountVectorizer: converts a food review into a vector that stores how many times each word in the vocabulary occurs in the review, regardless of the order in which the words occur.
-> TfIdf: converts a food review into a vector that assigns each word in the vocabulary a weight according to its importance.
For Naive Bayes, CountVectorizer is a good fit because the model needs the count of each word in a review to calculate probabilities, and CountVectorizer is known to work better with probabilistic methods. TF-IDF is less useful here, since importance weights are not what the probability estimates require.
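To make the Bayes-rule intuition concrete, here is a minimal toy sketch (not the notebook's actual pipeline; the reviews and scores are made up): MultinomialNB is fit on raw word counts and predicts the class with the highest posterior probability.
# Toy sketch of the Bayes idea: P(class | review) is proportional to P(class) * product of P(word | class).
# The reviews and scores below are made up purely for illustration.
toy_reviews = ["great taste love it", "terrible stale waste of money", "good value tasty snack"]
toy_scores = [5, 1, 4]
toy_vect = CountVectorizer()
toy_counts = toy_vect.fit_transform(toy_reviews)
toy_nb = MultinomialNB().fit(toy_counts, toy_scores)
print(toy_nb.predict(toy_vect.transform(["tasty and great"])))  # predicts one of the high scores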
We will now run the tests and evaluate each vectorization method.
numberOfFolds = 5
processes = []
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare Normalization techniques
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 's_text', pipeline, numberOfFolds, 'Stemming', 'Unigram'))
processes.append(p)
p.start()
# To compare n-grams
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(2, 2))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Bigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(3, 3))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Trigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 2))),
('model', MultinomialNB())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram and Bigram'))
processes.append(p)
p.start()
for p in processes:
p.join()
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from pycm import ConfusionMatrix
models_info = []
def plot_cm(cm, options):
header = []
body = [[], [], [], [], []]
accuracy_score_list = []
f1_score_list = []
for i, option in enumerate(options):
modelAlgoName, vectorizerName, optionsName, nGram, numberOfFolds = option
header.append(r"Trained and Tested Model: " + modelAlgoName)
body[0].append("using " + optionsName + " for tokenization")
body[1].append("with " + vectorizerName +
" as a vectorizer taking " + nGram + " as a single token")
body[2].append("without stratification on an unbalanced dataset")
body[3].append(
f"Average Accuracy of {modelAlgoName} across {numberOfFolds}-folds = {cm[i][1][0]['accuracy']*100:.2f}%")
accuracy_score_list.append(cm[i][1][0]['accuracy'])
body[4].append(
f"Average F1-Score of {modelAlgoName} across {numberOfFolds}-folds = {cm[i][1][0]['f1_score']*100:.2f}%")
f1_score_list.append(cm[i][1][0]['f1_score'])
table = pd.DataFrame(body, columns=header)
printmd(table.to_markdown())
f, axes = plt.subplots(1, len(cm), figsize=(30, 15), sharey='row')
for i, a in enumerate(cm):
key, n = a
model, cf_matrix = n
disp = ConfusionMatrixDisplay(cf_matrix,
display_labels=range(1, 6))
disp.plot(ax=axes[i], xticks_rotation=45)
disp.ax_.set_title(
key + f"\nAccuracy - {model['accuracy']*100:.2f}%\nF1-Score - {model['f1_score']*100:.2f}%")
disp.im_.colorbar.remove()
disp.ax_.set_xlabel('')
if i != 0:
disp.ax_.set_ylabel('')
f.text(0.4, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)
f.colorbar(disp.im_, ax=axes)
plt.show()
x = [f[0] for f in cm]
# plot the accuracy and f1 score in the same graph
plt.figure(figsize=(30, 10))
X_axis = np.arange(len(x))
plt.bar(X_axis-0.2, accuracy_score_list, 0.4, label='Accuracy')
plt.bar(X_axis+0.2, f1_score_list, 0.4, label='F1 Score')
plt.xticks(X_axis, x, rotation=90)
plt.legend()
plt.show()
numberOfFolds = 5
printmd("# CountVectorizer vs TfidfVectorizer")
# Load the models
model_1,cM_1 = loadModel("MultinomialNB","CountVectorizer","Lematization","Unigram", "models/data/NB")
model_2, cM_2 = loadModel("MultinomialNB", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/NB")
info = [
["MultinomialNB", "CountVectorizer", "Lematization", "Unigram", numberOfFolds],
["MultinomialNB", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds]
]
models = [
['CountVectorizer', (model_1,cM_1)],
['TfidfVectorizer', (model_2,cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: MultinomialNB | Trained and Tested Model: MultinomialNB |
|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of MultinomialNB across 5-folds = 69.24% | Average Accuracy of MultinomialNB across 5-folds = 64.52% |
| 4 | Average F1-Score of MultinomialNB across 5-folds = 44.00% | Average F1-Score of MultinomialNB across 5-folds = 19.08% |
printmd("## Lemmatization vs Stemming")
model_1, cM_1 = loadModel("MultinomialNB", "CountVectorizer",
"Lematization", "Unigram", "models/data/NB")
model_2, cM_2 = loadModel("MultinomialNB", "CountVectorizer",
"Stemming", "Unigram", "models/data/NB")
info = [
["MultinomialNB", "CountVectorizer",
"Lematization", "Unigram", numberOfFolds],
["MultinomialNB", "CountVectorizer",
"Stemming", "Unigram", numberOfFolds]
]
models = [
['Lematization', (model_1, cM_1)],
['Stemming', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: MultinomialNB | Trained and Tested Model: MultinomialNB |
|---|---|---|
| 0 | using Lematization for tokenization | using Stemming for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with CountVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of MultinomialNB across 5-folds = 69.24% | Average Accuracy of MultinomialNB across 5-folds = 69.20% |
| 4 | Average F1-Score of MultinomialNB across 5-folds = 44.00% | Average F1-Score of MultinomialNB across 5-folds = 44.75% |
printmd("# N-GRAMS")
model_1, cM_1 = loadModel("MultinomialNB", "CountVectorizer",
"Lematization", "Unigram", "models/data/NB")
model_2, cM_2 = loadModel("MultinomialNB", "CountVectorizer",
"Lematization", "Bigram", "models/data/NB")
model_3, cM_3 = loadModel("MultinomialNB", "CountVectorizer",
"Lematization", "Trigram", "models/data/NB")
model_4, cM_4 = loadModel("MultinomialNB", "CountVectorizer",
"Lematization", "Unigram and Bigram", "models/data/NB")
info = [
["MultinomialNB", "CountVectorizer",
"Lematization", "Unigram", numberOfFolds],
["MultinomialNB", "CountVectorizer",
"Lematization", "Bigram", numberOfFolds],
["MultinomialNB", "CountVectorizer",
"Lematization", "Trigram", numberOfFolds],
["MultinomialNB", "CountVectorizer",
"Lematization", "Unigram and Bigram", numberOfFolds]
]
models = [
['Unigram', (model_1, cM_1)],
['Bigram', (model_2, cM_2)],
['Trigram', (model_3, cM_3)],
['Unigram and Bigram', (model_4, cM_4)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: MultinomialNB | Trained and Tested Model: MultinomialNB | Trained and Tested Model: MultinomialNB | Trained and Tested Model: MultinomialNB |
|---|---|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with CountVectorizer as a vectorizer taking Bigram as a single token | with CountVectorizer as a vectorizer taking Trigram as a single token | with CountVectorizer as a vectorizer taking Unigram and Bigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of MultinomialNB across 5-folds = 69.24% | Average Accuracy of MultinomialNB across 5-folds = 67.31% | Average Accuracy of MultinomialNB across 5-folds = 66.09% | Average Accuracy of MultinomialNB across 5-folds = 66.88% |
| 4 | Average F1-Score of MultinomialNB across 5-folds = 44.00% | Average F1-Score of MultinomialNB across 5-folds = 28.96% | Average F1-Score of MultinomialNB across 5-folds = 27.56% | Average F1-Score of MultinomialNB across 5-folds = 26.91% |
numberOfFolds = 5
processes = []
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare Normalization techniques
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 's_text', pipeline, numberOfFolds, 'Stemming', 'Unigram'))
processes.append(p)
p.start()
# To compare n-grams
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(2, 2))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Bigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(3, 3))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Trigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
('model', LogisticRegression())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram and Bigram'))
processes.append(p)
p.start()
for p in processes:
p.join()
numberOfFolds = 5
printmd("# CountVectorizer vs TfidfVectorizer")
# Load the models
model_1, cM_1 = loadModel("LogisticRegression", "CountVectorizer",
"Lematization", "Unigram", "models/data/LR")
model_2, cM_2 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/LR")
info = [
["LogisticRegression", "CountVectorizer",
"Lematization", "Unigram", numberOfFolds],
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds]
]
models = [
['CountVectorizer', (model_1, cM_1)],
['TfidfVectorizer', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: LogisticRegression | Trained and Tested Model: LogisticRegression |
|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of LogisticRegression across 5-folds = 68.07% | Average Accuracy of LogisticRegression across 5-folds = 68.09% |
| 4 | Average F1-Score of LogisticRegression across 5-folds = 44.50% | Average F1-Score of LogisticRegression across 5-folds = 44.65% |
printmd("## Lemmatization vs Stemming")
model_1, cM_1 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/LR")
model_2, cM_2 = loadModel("LogisticRegression", "TfidfVectorizer",
"Stemming", "Unigram", "models/data/LR")
info = [
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds],
["LogisticRegression", "TfidfVectorizer",
"Stemming", "Unigram", numberOfFolds]
]
models = [
['Lematization', (model_1, cM_1)],
['Stemming', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: LogisticRegression | Trained and Tested Model: LogisticRegression |
|---|---|---|
| 0 | using Lematization for tokenization | using Stemming for tokenization |
| 1 | with TfidfVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of LogisticRegression across 5-folds = 68.09% | Average Accuracy of LogisticRegression across 5-folds = 69.33% |
| 4 | Average F1-Score of LogisticRegression across 5-folds = 44.65% | Average F1-Score of LogisticRegression across 5-folds = 45.69% |
printmd("# N-GRAMS")
model_1, cM_1 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/LR")
model_2, cM_2 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Bigram", "models/data/LR")
model_3, cM_3 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Trigram", "models/data/LR")
model_4, cM_4 = loadModel("LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", "models/data/LR")
info = [
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds],
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Bigram", numberOfFolds],
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Trigram", numberOfFolds],
["LogisticRegression", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", numberOfFolds]
]
models = [
['Unigram', (model_1, cM_1)],
['Bigram', (model_2, cM_2)],
['Trigram', (model_3, cM_3)],
['Unigram and Bigram', (model_4, cM_4)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: LogisticRegression | Trained and Tested Model: LogisticRegression | Trained and Tested Model: LogisticRegression | Trained and Tested Model: LogisticRegression |
|---|---|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with TfidfVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Bigram as a single token | with TfidfVectorizer as a vectorizer taking Trigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram and Bigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of LogisticRegression across 5-folds = 68.09% | Average Accuracy of LogisticRegression across 5-folds = 68.51% | Average Accuracy of LogisticRegression across 5-folds = 61.49% | Average Accuracy of LogisticRegression across 5-folds = 70.77% |
| 4 | Average F1-Score of LogisticRegression across 5-folds = 44.65% | Average F1-Score of LogisticRegression across 5-folds = 43.82% | Average F1-Score of LogisticRegression across 5-folds = 34.33% | Average F1-Score of LogisticRegression across 5-folds = 48.09% |
numberOfFolds = 5
processes = []
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare Normalization techniques
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 's_text', pipeline, numberOfFolds, 'Stemming', 'Unigram'))
processes.append(p)
p.start()
# To compare n-grams
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(2, 2))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Bigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(3, 3))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Trigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 2))),
('model', RandomForestClassifier())
])
p = threading.Thread(target=prockfold, args=(
dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram and Bigram'))
processes.append(p)
p.start()
for p in processes:
p.join()
numberOfFolds = 5
printmd("# CountVectorizer vs TfidfVectorizer")
# Load the models
model_1, cM_1 = loadModel("RandomForestClassifier", "CountVectorizer",
"Lematization", "Unigram", "models/data/RF")
model_2, cM_2 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/RF")
info = [
["RandomForestClassifier", "CountVectorizer",
"Lematization", "Unigram", numberOfFolds],
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds]
]
models = [
['CountVectorizer', (model_1, cM_1)],
['TfidfVectorizer', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: RandomForestClassifier | Trained and Tested Model: RandomForestClassifier |
|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of RandomForestClassifier across 5-folds = 63.71% | Average Accuracy of RandomForestClassifier across 5-folds = 63.70% |
| 4 | Average F1-Score of RandomForestClassifier across 5-folds = 15.70% | Average F1-Score of RandomForestClassifier across 5-folds = 15.66% |
printmd("# LEMMATIZATION vs STEMMING")
model_1, cM_1= loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/RF")
model_2, cM_2 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Stemming", "Unigram", "models/data/RF")
info = [
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds],
["RandomForestClassifier", "TfidfVectorizer",
"Stemming", "Unigram", numberOfFolds]
]
models = [
['Lematization', (model_1, cM_1)],
['Stemming', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: RandomForestClassifier | Trained and Tested Model: RandomForestClassifier |
|---|---|---|
| 0 | using Lematization for tokenization | using Stemming for tokenization |
| 1 | with TfidfVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of RandomForestClassifier across 5-folds = 63.70% | Average Accuracy of RandomForestClassifier across 5-folds = 63.73% |
| 4 | Average F1-Score of RandomForestClassifier across 5-folds = 15.66% | Average F1-Score of RandomForestClassifier across 5-folds = 15.80% |
printmd("# N-GRAMS")
model_1, cM_1 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/RF")
model_2, cM_2 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Bigram", "models/data/RF")
model_3, cM_3 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Trigram", "models/data/RF")
model_4, cM_4 = loadModel("RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", "models/data/RF")
info = [
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds],
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Bigram", numberOfFolds],
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Trigram", numberOfFolds],
["RandomForestClassifier", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", numberOfFolds]
]
models = [
['Unigram', (model_1, cM_1)],
['Bigram', (model_2, cM_2)],
['Trigram', (model_3, cM_3)],
['Unigram and Bigram', (model_4, cM_4)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
|   | Trained and Tested Model: RandomForestClassifier | Trained and Tested Model: RandomForestClassifier | Trained and Tested Model: RandomForestClassifier | Trained and Tested Model: RandomForestClassifier |
|---|---|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with TfidfVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Bigram as a single token | with TfidfVectorizer as a vectorizer taking Trigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram and Bigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of RandomForestClassifier across 5-folds = 63.70% | Average Accuracy of RandomForestClassifier across 5-folds = 63.67% | Average Accuracy of RandomForestClassifier across 5-folds = 63.68% | Average Accuracy of RandomForestClassifier across 5-folds = 63.67% |
| 4 | Average F1-Score of RandomForestClassifier across 5-folds = 15.66% | Average F1-Score of RandomForestClassifier across 5-folds = 15.56% | Average F1-Score of RandomForestClassifier across 5-folds = 15.59% | Average F1-Score of RandomForestClassifier across 5-folds = 15.56% |
SVC is a non-linear classifier that classifies by creating a decision boundary, placed so as to maximize the distance to the nearest data points of every class. To make the boundary non-linear, SVC uses the kernel trick, which applies a non-linear transformation that raises the dimension of the vector space for the purposes of the computation only.
During training, SVC tries to learn the best decision boundary for the given dataset, so it takes a long time on a dataset as large as ours. Upon researching, we found that an SVC with a sigmoid kernel is well suited to this dataset. The SVC was trained and evaluated with different vectorization, normalization and n-gram tokenization techniques using 5-fold cross-validation; it took more than 72 hours to evaluate each combination because our dataset is huge.
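For reference, the sigmoid kernel this SVC relies on computes K(x, y) = tanh(gamma * <x, y> + coef0). The sketch below is added for illustration only, with made-up toy vectors and parameter values, and checks a manual computation against scikit-learn's sigmoid_kernel.
# Added sketch: the sigmoid kernel used by this SVC, K(x, y) = tanh(gamma * <x, y> + coef0).
# The vectors and gamma/coef0 values here are illustrative only.
from sklearn.metrics.pairwise import sigmoid_kernel
x_toy = np.array([[0.2, 0.5, 0.0]])
y_toy = np.array([[0.1, 0.4, 0.3]])
gamma_toy, coef0_toy = 1.0 / x_toy.shape[1], 0.0
manual_kernel = np.tanh(gamma_toy * (x_toy @ y_toy.T) + coef0_toy)
print(np.allclose(manual_kernel, sigmoid_kernel(x_toy, y_toy, gamma=gamma_toy, coef0=coef0_toy)))  # True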
numberOfFolds = 5
processes = []
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare vectorizers
pipeline = Pipeline([
('vectorizer', TfidfVectorizer(ngram_range=(1, 1))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram'))
processes.append(p)
p.start()
# To compare Normalization techniques
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 1))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 's_text', pipeline, numberOfFolds, 'Stemming', 'Unigram'))
processes.append(p)
p.start()
# To compare n-grams
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(2, 2))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Bigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(3, 3))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Trigram'))
processes.append(p)
p.start()
pipeline = Pipeline([
('vectorizer', CountVectorizer(ngram_range=(1, 2))),
('model', SVC(kernel="sigmoid",gamma="scale", tol=0.1))
])
p = multiprocessing.Process(target=prockfold, args=(dfLematized, 'l_text', pipeline, numberOfFolds, 'Lematization', 'Unigram and Bigram'))
processes.append(p)
p.start()
for p in processes:
p.join()
TF-IDF represents a token by its importance within a given document. This helps SVC because two documents that share a token are separated along that token's dimension according to how important the token is in each document, rather than just how often it occurs. To evaluate this, we display the results of SVC trained with both vectorization techniques.
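As a small, self-contained illustration (toy documents, not our data; it assumes a scikit-learn version that provides get_feature_names_out), a token shared by every document is down-weighted by TF-IDF relative to the rarer, more discriminative tokens:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

docs = ["the food was great", "the food was awful", "the delivery was late"]
cv = CountVectorizer()
tv = TfidfVectorizer()
# raw counts: 'the' and 'was' look as important as 'great' or 'awful'
print(pd.DataFrame(cv.fit_transform(docs).toarray(), columns=cv.get_feature_names_out()))
# tf-idf: tokens shared by all documents are down-weighted relative to the discriminative ones
print(pd.DataFrame(tv.fit_transform(docs).toarray().round(2), columns=tv.get_feature_names_out()))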
numberOfFolds = 5
printmd("# CountVectorizer vs TfidfVectorizer")
# Load the models
model_1, cM_1 = loadModel("SVC", "CountVectorizer",
"Lematization", "Unigram", "models/data/SVC")
model_2, cM_2 = loadModel("SVC", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/SVC")
info = [
["SVC", "CountVectorizer",
"Lematization", "Unigram", numberOfFolds],
["SVC", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds]
]
models = [
['CountVectorizer', (model_1, cM_1)],
['TfidfVectorizer', (model_2, cM_2)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
| | Trained and Tested Model: SVC | Trained and Tested Model: SVC |
|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with CountVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of SVC across 5-folds = 59.43% | Average Accuracy of SVC across 5-folds = 70.83% |
| 4 | Average F1-Score of SVC across 5-folds = 27.69% | Average F1-Score of SVC across 5-folds = 46.33% |
Either option should not change the model much, since the same normalization is applied to the test data and SVC does not take context into account. Lemmatization does preserve meaning better, because it relies on an explicit, human-curated dictionary of known word forms rather than suffix-stripping rules. We still evaluated the model with 5-fold cross validation on both techniques to find the better one.
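For a concrete feel of the difference, here is a quick illustrative comparison; the exact pipeline inside our Normalizer may differ, and this sketch assumes spaCy's en_core_web_sm model is installed:
import spacy
from nltk.stem import PorterStemmer

nlp = spacy.load("en_core_web_sm")  # assumed model; any English spaCy pipeline works
stemmer = PorterStemmer()
sample = "the chips were tastier than expected and arrived quickly"
# rule-based suffix stripping can produce non-words (e.g. 'quickly' -> 'quickli')
print([stemmer.stem(token) for token in sample.split()])
# dictionary-based lemmatization maps to known word forms (e.g. 'were' -> 'be')
print([token.lemma_ for token in nlp(sample)])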
numberOfFolds = 5
printmd("# N-GRAMS")
# Load the models
model_1, cM_1 = loadModel("SVC", "TfidfVectorizer",
"Lematization", "Unigram", "models/data/SVC")
model_2, cM_2 = loadModel("SVC", "TfidfVectorizer",
"Lematization", "Bigram", "models/data/SVC")
model_3, cM_3 = loadModel("SVC", "TfidfVectorizer",
"Lematization", "Trigram", "models/data/SVC")
model_4, cM_4 = loadModel("SVC", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", "models/data/SVC")
info = [
["SVC", "TfidfVectorizer",
"Lematization", "Unigram", numberOfFolds],
["SVC", "TfidfVectorizer",
"Lematization", "Bigram", numberOfFolds],
["SVC", "TfidfVectorizer",
"Lematization", "Trigram", numberOfFolds],
["SVC", "TfidfVectorizer",
"Lematization", "Unigram and Bigram", numberOfFolds]
]
models = [
['Unigram', (model_1, cM_1)],
['Bigram', (model_2, cM_2)],
['Trigram', (model_3, cM_3)],
['Unigram and Bigram', (model_4, cM_4)]
]
plot_cm(
models,
info
)
models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['f1_score']] for i, x in enumerate(info)])
| | Trained and Tested Model: SVC | Trained and Tested Model: SVC | Trained and Tested Model: SVC | Trained and Tested Model: SVC |
|---|---|---|---|---|
| 0 | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization | using Lematization for tokenization |
| 1 | with TfidfVectorizer as a vectorizer taking Unigram as a single token | with TfidfVectorizer as a vectorizer taking Bigram as a single token | with TfidfVectorizer as a vectorizer taking Trigram as a single token | with TfidfVectorizer as a vectorizer taking Unigram and Bigram as a single token |
| 2 | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset | without stratification on an unbalanced dataset |
| 3 | Average Accuracy of SVC across 5-folds = 70.83% | Average Accuracy of SVC across 5-folds = 70.96% | Average Accuracy of SVC across 5-folds = 63.68% | Average Accuracy of SVC across 5-folds = 63.67% |
| 4 | Average F1-Score of SVC across 5-folds = 46.33% | Average F1-Score of SVC across 5-folds = 46.68% | Average F1-Score of SVC across 5-folds = 15.59% | Average F1-Score of SVC across 5-folds = 15.56% |
models_df = pd.DataFrame(models_info, columns=["model", "vectorizer",
"lemmatization", "ngram", "numberOfFolds", "accuracy", "f1_score"])
models_df.drop_duplicates(inplace=True)
models_df.sort_values(by=["f1_score"], ascending=False, inplace=True)
models_df.reset_index(drop=True, inplace=True)
printmd(models_df.to_markdown())
| | model | vectorizer | lemmatization | ngram | numberOfFolds | accuracy | f1_score |
|---|---|---|---|---|---|---|---|
| 0 | LogisticRegression | TfidfVectorizer | Lematization | Unigram and Bigram | 5 | 0.707732 | 0.480877 |
| 1 | SVC | TfidfVectorizer | Stemming | Unigram | 5 | 0.709647 | 0.466782 |
| 2 | SVC | TfidfVectorizer | Lematization | Bigram | 5 | 0.709647 | 0.466782 |
| 3 | SVC | TfidfVectorizer | Lematization | Unigram | 5 | 0.70825 | 0.463258 |
| 4 | LogisticRegression | TfidfVectorizer | Stemming | Unigram | 5 | 0.693271 | 0.456862 |
| 5 | MultinomialNB | CountVectorizer | Stemming | Unigram | 5 | 0.692042 | 0.447514 |
| 6 | LogisticRegression | TfidfVectorizer | Lematization | Unigram | 5 | 0.680888 | 0.446481 |
| 7 | LogisticRegression | CountVectorizer | Lematization | Unigram | 5 | 0.68069 | 0.44503 |
| 8 | MultinomialNB | CountVectorizer | Lematization | Unigram | 5 | 0.692405 | 0.440018 |
| 9 | LogisticRegression | TfidfVectorizer | Lematization | Bigram | 5 | 0.685066 | 0.438245 |
| 10 | LogisticRegression | TfidfVectorizer | Lematization | Trigram | 5 | 0.614943 | 0.343346 |
| 11 | MultinomialNB | CountVectorizer | Lematization | Bigram | 5 | 0.673075 | 0.289593 |
| 12 | SVC | CountVectorizer | Lematization | Unigram | 5 | 0.594314 | 0.27688 |
| 13 | MultinomialNB | CountVectorizer | Lematization | Trigram | 5 | 0.660923 | 0.275642 |
| 14 | MultinomialNB | CountVectorizer | Lematization | Unigram and Bigram | 5 | 0.668787 | 0.269072 |
| 15 | MultinomialNB | TfidfVectorizer | Lematization | Unigram | 5 | 0.64521 | 0.190771 |
| 16 | RandomForestClassifier | TfidfVectorizer | Stemming | Unigram | 5 | 0.637272 | 0.157958 |
| 17 | RandomForestClassifier | CountVectorizer | Lematization | Unigram | 5 | 0.637064 | 0.157027 |
| 18 | RandomForestClassifier | TfidfVectorizer | Lematization | Unigram | 5 | 0.636964 | 0.156613 |
| 19 | RandomForestClassifier | TfidfVectorizer | Lematization | Trigram | 5 | 0.636808 | 0.155927 |
| 20 | SVC | TfidfVectorizer | Lematization | Trigram | 5 | 0.636808 | 0.155927 |
| 21 | RandomForestClassifier | TfidfVectorizer | Lematization | Unigram and Bigram | 5 | 0.636743 | 0.15564 |
| 22 | SVC | TfidfVectorizer | Lematization | Unigram and Bigram | 5 | 0.636743 | 0.15564 |
| 23 | RandomForestClassifier | TfidfVectorizer | Lematization | Bigram | 5 | 0.636737 | 0.155611 |
# plot the results
def plot_accuracy(df, title):
plt.figure(figsize=(20, 10))
# make a bar chart using plt
sns.barplot(x="model", y="accuracy", data=df)
plt.title(title)
plt.show()
plot_accuracy(models_df,
"Accuracy of the best model")
def plot_f1_score(df, title):
plt.figure(figsize=(20, 10))
sns.barplot(x="model", y="f1_score", data=df)
plt.title(title)
plt.show()
plot_f1_score(models_df,
"F1-Score of the best model")
plt.Figure(figsize=(20, 10))
sns.catplot(x="model", y="accuracy",
data=models_df, kind="point", label='Models', size=5, aspect=4)
plt.title("Accuracy of the best model")
plt.show()
plt.Figure(figsize=(20, 10))
sns.catplot(x="model", y="f1_score",
data=models_df, kind="point", label='Models', size=5, aspect=4)
plt.title("F1-Score of the best model")
plt.show()
r = sns.kdeplot(models_df["accuracy"],
shade=True, label='Accuracy', color='r')
b = sns.kdeplot(models_df["f1_score"],
shade=True, label='F1-Score', color='b')
r.figure.set_size_inches(20, 10)
plt.title("F1-Scores and Accuracies of the best models")
plt.legend()
plt.show()
f = pd.read_csv('data/train.csv')
print(*set(re.findall(r'[^\x00-\x7F]', " ".join(f['Review_text'].tolist()))))
« Â · ¬ ¾ ë £ æ Ã ª Å ® © º Þ ¨ » µ ° ± ô Î ¼ ² ¢ ã ø ç â § ½ ¦ å × û ê
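One straightforward way to handle these characters (a sketch only; the actual cleaning step may differ) is to drop everything outside the ASCII range before tokenization:
# drop non-ASCII characters from the raw review text (illustrative, not the exact cleaning step used above)
ascii_only = f['Review_text'].astype(str).apply(
    lambda text: text.encode('ascii', errors='ignore').decode('ascii'))
print(ascii_only.head())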
class BiLSTM(nn.Module):
def __init__(self, le, embedding_matrix,max_features, embed_size):
super(BiLSTM, self).__init__()
self.hidden_size = 128
drp = 0.25
n_classes = len(le.classes_)
self.embedding = nn.Embedding(max_features, embed_size)
self.embedding.weight = nn.Parameter(
torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = False
self.lstm = nn.LSTM(embed_size, self.hidden_size,
bidirectional=True, batch_first=True)
self.linear = nn.Linear(self.hidden_size*4, 200)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(drp)
self.out = nn.Linear(200, n_classes)
def forward(self, x):
# rint(x.size())
h_embedding = self.embedding(x)
#_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
h_lstm, _ = self.lstm(h_embedding)
avg_pool = torch.mean(h_lstm, 1)
max_pool, _ = torch.max(h_lstm, 1)
conc = torch.cat((avg_pool, max_pool), 1)
conc = self.relu(self.linear(conc))
conc = self.dropout(conc)
out = self.out(conc)
return out
class LstmModelPytorch(BaseEstimator, TransformerMixin):
def __init__(self, max_features, n_epochs, batch_size, maxlen, embed_size, kFolds, debug, X, y):
# Reproducing same results
self.max_features = max_features
self.le = LabelEncoder()
self.tokenizer = Tokenizer(num_words=self.max_features)
self.n_epochs = n_epochs
self.loss_fn = nn.CrossEntropyLoss(reduction='mean')
self.batch_size = batch_size
self.maxlen = maxlen
self.embed_size = embed_size
self.kFolds = kFolds
self.debug = debug
self.X = X
self.y = y
def load_glove(self, word_index, embed_size):
EMBEDDING_FILE = 'glove.6B/glove.6B.50d.txt'
def get_coefs(word, *arr): return word, np.asarray(arr,
dtype='float32')[:300]
embeddings_index = dict(get_coefs(*o.split(" "))
for o in open(EMBEDDING_FILE, encoding="utf8"))
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = -0.005838499, 0.48782197
embed_size = all_embs.shape[1]
nb_words = min(self.max_features, len(word_index)+1)
embedding_matrix = np.random.normal(
emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
if i >= self.max_features:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
else:
embedding_vector = embeddings_index.get(word.capitalize())
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
def plot_graph(self, epochs, train_loss, val_loss):
fig = plt.figure(figsize=(12, 12))
plt.title("Train/Validation Loss")
plt.plot(list(np.arange(epochs) + 1), train_loss, label='train')
plt.plot(list(np.arange(epochs) + 1), val_loss, label='validation')
plt.xlabel('num_epochs', fontsize=12)
plt.ylabel('loss', fontsize=12)
plt.legend(loc='best')
plt.show()
def fit(self, X, y=None):
xDf = pd.DataFrame(X, columns=[self.X])
xDf[self.y] = y.to_list()
average_trainingLoss = []
average_validationLoss = []
totalFScore = 0
totalAccuracy = 0
totalConfusion_matrix = None
kfold = StratifiedKFold(n_splits=self.kFolds,
shuffle=True)
foldCounter = 0
bestModel = None
bestValidationF1 = 0
# # Do the test train split
# train_X, test_X, train_y, test_y = train_test_split(
# xDf[[self.X]], xDf[self.y], test_size=0.2, random_state=7,stratify=y)
for train_index, test_index in kfold.split(xDf[[self.X]], xDf[self.y]):
foldCounter += 1
train_X, test_X = xDf.iloc[train_index][[
self.X]], xDf.iloc[test_index][[self.X]]
train_y, test_y = xDf.iloc[train_index][self.y], xDf.iloc[test_index][self.y]
self.tokenizer.fit_on_texts(list(train_X[self.X]))
train_X = self.tokenizer.texts_to_sequences(train_X[self.X])
test_X = self.tokenizer.texts_to_sequences(test_X[self.X])
if self.debug:
self.embedding_matrix = np.random.randn(120000, 300)
else:
self.embedding_matrix = self.load_glove(
self.tokenizer.word_index, self.embed_size)
# Pad the sentences
train_X = pad_sequences(train_X, maxlen=self.maxlen)
test_X = pad_sequences(test_X, maxlen=self.maxlen)
train_y = self.le.fit_transform(train_y.values)
test_y = self.le.transform(test_y.values)
# Load train and test in CUDA Memory
x_train = torch.tensor(train_X, dtype=torch.long).cuda()
y_train = torch.tensor(train_y, dtype=torch.long).cuda()
x_cv = torch.tensor(test_X, dtype=torch.long).cuda()
y_cv = torch.tensor(test_y, dtype=torch.long).cuda()
# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)
self.model = BiLSTM(self.le, self.embedding_matrix,self.max_features,self.embed_size)
self.optimizer = torch.optim.Adam(
filter(lambda p: p.requires_grad, self.model.parameters()), lr=0.001)
self.model.cuda()
# Create Data Loaders
train_loader = torch.utils.data.DataLoader(
train, batch_size=self.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
valid, batch_size=self.batch_size, shuffle=False)
train_loss = []
valid_loss = []
for epoch in range(self.n_epochs):
start_time = time.time()
# Set model to train configuration
self.model.train()
avg_loss = 0.
for i, (x_batch, y_batch) in enumerate(train_loader):
# Predict/Forward Pass
y_pred = self.model(x_batch)
# Compute loss
self.loss = self.loss_fn(y_pred, y_batch)
self.optimizer.zero_grad()
self.loss.backward()
self.optimizer.step()
avg_loss += self.loss.item() / len(train_loader)
# Set model to validation configuration -Doesn't get trained here
self.model.eval()
avg_val_loss = 0.
val_preds = np.zeros((len(x_cv), len(self.le.classes_)))
for i, (x_batch, y_batch) in enumerate(valid_loader):
y_pred = self.model(x_batch).detach()
avg_val_loss += self.loss_fn(y_pred,
y_batch).item() / len(valid_loader)
# keep/store predictions
val_preds[i * self.batch_size:(i+1) *
self.batch_size] = F.softmax(y_pred).cpu().numpy()
# Check Accuracy
val_accuracy = sum(val_preds.argmax(
axis=1) == test_y)/len(test_y)
train_loss.append(avg_loss)
valid_loss.append(avg_val_loss)
elapsed_time = time.time() - start_time
print('Epoch {}/{} at {} fold: \t loss={:.4f} \t val_loss={:.4f} \t val_acc={:.4f} \t time={:.2f}s'.format(
epoch + 1, self.n_epochs, foldCounter, avg_loss, avg_val_loss, val_accuracy, elapsed_time))
average_trainingLoss.append(train_loss)
average_validationLoss.append(valid_loss)
y_true = [self.le.classes_[x] for x in test_y]
y_pred = [self.le.classes_[x] for x in val_preds.argmax(axis=1)]
fSc = f1_score(y_true, y_pred, average='weighted')
if fSc > bestValidationF1:
bestValidationF1 = fSc
bestModel = self.model
totalAccuracy += accuracy_score(y_true, y_pred)
totalFScore += fSc
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
y_true, y_pred) if totalConfusion_matrix is not None else confusion_matrix(y_true, y_pred)
torch.save(bestModel, 'bilstm_model')
torch.save(self.tokenizer, 'bilstm_model_tokenizer')
torch.save(self.le, 'bilstm_model_labelencoder')
gc.collect()
# Element wise sum the average training and validation loss
average_trainingLoss = np.array(average_trainingLoss).sum(axis=0)
average_validationLoss = np.array(average_validationLoss).sum(axis=0)
self.plot_graph(self.n_epochs, average_trainingLoss,
average_validationLoss)
totalAccuracy = totalAccuracy/self.kFolds
totalFScore = totalFScore/self.kFolds
totalConfusion_matrix = totalConfusion_matrix/self.kFolds
# Save the confusion matrix np save
with open(f"lstm_confusion_matrix.npy", "wb") as of:
np.save(of, totalConfusion_matrix)
# Save the file as json
with open(f"stats.json", 'w') as f:
json.dump({'accuracy': totalAccuracy, 'fScore': totalFScore}, f)
print("## Trained and Tested Model: BiLSTM" +
"\n\t - using lemmitization for tokenization" +
"\n\t - with Glove Embeddings for vectorization" +
f"\n\t - {'without stratification on an unbalanced dataset'if len(X)>2000 else 'on a statified balanced dataset'}")
print("--"*10+"Results" + "--"*10)
print(
# f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy/self.kFolds}")
f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy}")
print(
f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore}")
# f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore/self.kFolds}")
print(
f"- Average Confusion Matrix of BiLSTM across {self.kFolds}-folds:")
# sns.heatmap(totalConfusion_matrix/self.kFolds, annot=True)
sns.heatmap(totalConfusion_matrix, annot=True)
plt.show()
def predict(self, X):
self.model = torch.load('bilstm_model')
self.tokenizer = torch.load('bilstm_model_tokenizer')
self.le = torch.load('bilstm_model_labelencoder')
# generate list of zeroes only int same as the length of X
y = [1 for _ in range(len(X))]
test_X = self.tokenizer.texts_to_sequences(X[self.X])
test_X = pad_sequences(test_X, maxlen=self.maxlen)
test_y = self.le.transform(y)
x_cv = torch.tensor(test_X, dtype=torch.long).cuda()
y_cv = torch.tensor(test_y, dtype=torch.long).cuda()
valid = torch.utils.data.TensorDataset(x_cv, y_cv)
valid_loader = torch.utils.data.DataLoader(
valid, batch_size=self.batch_size, shuffle=False)
# Set model to validation configuration -Doesn't get trained here
self.model.eval()
val_preds = np.zeros((len(x_cv), len(self.le.classes_)))
for i, (x_batch, y_batch) in enumerate(valid_loader):
y_pred = self.model(x_batch).detach()
# keep/store predictions
val_preds[i * self.batch_size:(i+1) *
self.batch_size] = F.softmax(y_pred).cpu().numpy()
y_new = [self.le.classes_[x] for x in val_preds.argmax(axis=1)]
return y_new
embed_size = 50 # how big is each word vector
# how many unique words to use (i.e num rows in embedding vector)
max_features = 120000
# max number of words in a tweet to use
maxlen = 1000
# maxlen = int(df['cleaned_text'].str.split().str.len().max())
batch_size = 64 # how many samples to process at once
n_epochs = 1 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
debug = 0
lstmModel = LstmModelPytorch(max_features=max_features,
n_epochs=n_epochs, batch_size=batch_size, maxlen=maxlen, embed_size=embed_size, kFolds=n_splits, debug=debug, X="cleaned_text", y="Score")
df = pd.read_csv('/content/drive/MyDrive/colab/data/train_cleaned_new.csv', encoding='utf-8')
df['cleaned_text'] = df['cleaned_text'].astype(str)
# # df = df.groupby('Score').apply(lambda x: x.sample(n=df['Score'].value_counts().min()))
# df = df.groupby('Score').apply(lambda x: x.sample(n=100))
_ = lstmModel.fit(df[["cleaned_text"]], df["Score"])
dl_models_info = []
def plot_dl_cm(cm, options, only_table=False):
header = []
body = [[], []]
accuracy_score_list = []
f1_score_list = []
for i, option in enumerate(options):
modelAlgoName, mode, model, numberOfFolds = option
header.append(r"Trained and Tested Deep Leaning Model: " +
modelAlgoName + " using " + mode.capitalize())
body[0].append(
f"Average Accuracy of {modelAlgoName} across {numberOfFolds}-folds = {model['accuracy']*100:.2f}%")
accuracy_score_list.append(model['accuracy'])
body[1].append(
f"Average F1-Score of {modelAlgoName} across {numberOfFolds}-folds = {model['fScore']*100:.2f}%")
f1_score_list.append(model['fScore'])
table = pd.DataFrame(body, columns=header)
printmd(table.to_markdown())
if only_table==False:
f, axes = plt.subplots(1, len(cm), figsize=(30, 15), sharey='row')
for i, a in enumerate(cm):
key, n = a
model, cf_matrix = n
disp = ConfusionMatrixDisplay(cf_matrix,
display_labels=range(1, 6))
disp.plot(ax=axes[i], xticks_rotation=45)
disp.ax_.set_title(
key + f"\nAccuracy - {model['accuracy']*100:.2f}%\nF1-Score - {model['fScore']*100:.2f}%")
disp.im_.colorbar.remove()
disp.ax_.set_xlabel('')
if i != 0:
disp.ax_.set_ylabel('')
f.text(0.4, 0.1, 'Predicted label', ha='left')
plt.subplots_adjust(wspace=0.40, hspace=0.1)
f.colorbar(disp.im_, ax=axes)
plt.show()
x = [f[0] for f in cm]
# plot the accuracy and f1 score in the same graph
plt.figure(figsize=(30, 10))
X_axis = np.arange(len(x))
plt.bar(X_axis-0.2, accuracy_score_list, 0.4, label='Accuracy')
plt.bar(X_axis+0.2, f1_score_list, 0.4, label='F1 Score')
plt.xticks(X_axis, x, rotation=0)
plt.legend()
plt.show()
def loadDLModel(name, path):
# Save the confusion matrix np save
with open(f"{path}/{name}_confusion_matrix.npy", "rb") as of:
cM = np.load(of)
# Save the file as json
with open(f"{path}/{name}_stats.json", 'r') as f:
model = json.load(f)
return model, cM
def prettyPrintDLModels(model, cM, name,norm, numberOfFolds):
printmd("## Trained and Tested Model: " + name+
"\n \t -- with Normalization by "+ norm
)
printmd("--"*10+"Results" + "--"*10)
printmd(
f"- Average Accuracy of {name} across {numberOfFolds}-folds = {model['accuracy']}")
printmd(
f"- Average F1-Score of {name} across {numberOfFolds}-folds = {model['fScore']}")
printmd(
f"- Average Confusion Matrix of {name} across {numberOfFolds}-folds:")
sns.heatmap(cM, annot=True)
plt.show()
model_1, cM_1 = loadDLModel("lstm", "lstm/lstm_lema")
prettyPrintDLModels(model_1, cM_1, "Bi-LSTM","lemmatization", 5)
-- with Normalization by lemmatization
--------------------Results--------------------
model, cM = loadDLModel("lstm", "lstm/lstm_cleanedStopword")
prettyPrintDLModels(model, cM, "Bi-LSTM","cleaning plus stopword removal", 5)
-- with Normalization by cleaning plus stopword removal
--------------------Results--------------------
model, cM = loadDLModel("lstm", "lstm/lstm_cleaned")
prettyPrintDLModels(model, cM, "Bi-LSTM","only cleaning", 5)
-- with Normalization by only cleaning
--------------------Results--------------------
printmd("Comparing the results of the above models")
model_1, cM_1 = loadDLModel("lstm", "lstm/lstm_lema")
model_2, cM_2 = loadDLModel("lstm", "lstm/lstm_cleanedStopword")
model_3, cM_3 = loadDLModel("lstm", "lstm/lstm_cleaned")
info = [
["Bi-LSTM", "lemmatization", model_1, 5],
["Bi-LSTM", "cleaning plus stopword removal", model_2, 5],
["Bi-LSTM", "only cleaning", model_3, 5]
]
models = [
['Lemmatization', (model_1, cM_1)],
['Cleaning plus stopword removal', (model_2, cM_2)],
['Only cleaning', (model_3, cM_3)]
]
plot_dl_cm(
models,
info
)
dl_models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['fScore']] for i, x in enumerate(info)])
Comparing the results of the above models
| | Trained and Tested Deep Learning Model: Bi-LSTM using Lemmatization | Trained and Tested Deep Learning Model: Bi-LSTM using Cleaning plus stopword removal | Trained and Tested Deep Learning Model: Bi-LSTM using Only cleaning |
|---|---|---|---|
| 0 | Average Accuracy of Bi-LSTM across 5-folds = 73.22% | Average Accuracy of Bi-LSTM across 5-folds = 73.66% | Average Accuracy of Bi-LSTM across 5-folds = 79.06% |
| 1 | Average F1-Score of Bi-LSTM across 5-folds = 70.01% | Average F1-Score of Bi-LSTM across 5-folds = 70.21% | Average F1-Score of Bi-LSTM across 5-folds = 77.61% |
class TransformerBlock(Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = Sequential(
[Dense(ff_dim, activation="relu"),
Dense(embed_dim),]
)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.dropout1 = Dropout(rate)
self.dropout2 = Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
class TransformerTensorflow(BaseEstimator, TransformerMixin):
def __init__(self, vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, kFolds, debug, X, y):
# Reproducing same results
self.vocab_size = vocab_size
self.maxlen = maxlen
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
inp = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
tb = transformer_block(inp)
GA = GlobalAveragePooling1D()(tb)
GA = Dropout(0.25)(GA)
l1 = Dense(80, activation="relu")(GA)
l1 = Dropout(0.25)(l1)
l2 = Dense(40, activation="relu")(l1)
l2 = Dropout(0.25)(l2)
# Can add more layers here like this
l3 = Dense(20, activation="relu")(l2)
l3 = Dropout(0.25)(l3)
l4 = Dense(10, activation="relu")(l3)
l4 = Dropout(0.25)(l4)
# And can change the number of layers
outputs = Dense(5, activation="softmax")(l4)
self.model = Model(inputs=inputs, outputs=outputs)
self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
self.n_epochs = n_epochs
self.batch_size = batch_size
self.kFolds = kFolds
self.debug = debug
self.X = X
self.y = y
self.tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
self.le = LabelEncoder()
def plot1(self,history):
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
## Accuracy plot
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
## Loss plot
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
def plot2(self,history):
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
#plt.gca().set_ylim(0,1)
plt.show()
def fit(self, X, y=None):
xDf = pd.DataFrame(X, columns=[self.X])
xDf[self.y] = y.to_list()
totalFScore = 0
totalAccuracy = 0
totalConfusion_matrix = None
kfold = StratifiedKFold(n_splits=self.kFolds,
shuffle=True)
foldCounter = 0
bestModel = None
bestValidationF1 = 0
# Do the test train split
# train_X, test_X, train_y, test_y = train_test_split(
# xDf[[self.X]], xDf[self.y], test_size=0.2, random_state=7)
for train_index, test_index in kfold.split(xDf[[self.X]], xDf[self.y]):
foldCounter += 1
train_X, test_X = xDf.iloc[train_index][[
self.X]], xDf.iloc[test_index][[self.X]]
train_y, test_y = xDf.iloc[train_index][self.y], xDf.iloc[test_index][self.y]
self.tokenizer.fit_on_texts(list(train_X[self.X]))
train_X = self.tokenizer.texts_to_sequences(train_X[self.X])
test_X = self.tokenizer.texts_to_sequences(test_X[self.X])
# Pad the sentences
train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=self.maxlen)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
train_y = self.le.fit_transform(train_y)
test_y = self.le.transform(test_y)
history = self.model.fit(train_X, train_y, batch_size=self.batch_size, epochs=self.n_epochs, validation_data=(test_X, test_y) )
self.plot1(history)
self.plot2(history)
results = self.model.predict(test_X,verbose=2).argmax(axis=1)
fSc = f1_score(test_y, results, average='weighted')
# if fSc > bestValidationF1:
bestValidationF1 = fSc
bestModel = self.model
totalAccuracy += accuracy_score(test_y, results)
totalFScore += fSc
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
test_y, results) if totalConfusion_matrix is not None else confusion_matrix(test_y, results)
bestModel.save_weights("transformer_result/predict_class.h5")
# Save tokenizer
with open('transformer_result/tokenizer.pickle', 'wb') as handle:
pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Save label encoder
with open('transformer_result/label_encoder.pickle', 'wb') as handle:
pickle.dump(self.le, handle, protocol=pickle.HIGHEST_PROTOCOL)
gc.collect()
totalAccuracy = totalAccuracy/self.kFolds
totalFScore = totalFScore/self.kFolds
totalConfusion_matrix = totalConfusion_matrix/self.kFolds
# Save the confusion matrix np save
with open(f"transformer_result/transformer_confusion_matrix.npy","wb") as of:
np.save(of, totalConfusion_matrix)
# Save the file as json
with open(f"transformer_result/transformer_stats.json", 'w') as f:
json.dump({'accuracy':totalAccuracy ,'fScore':totalFScore}, f)
print("## Trained and Tested Model: Transformer" +
"\n\t - using lemmitization for tokenization" +
f"\n\t - {'without stratification on an unbalanced dataset'if len(X)>2000 else 'on a balanced dataset'}")
print("--"*10+"Results" + "--"*10)
print(
# f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy/self.kFolds}")
f"- Average Accuracy of Transformer across {self.kFolds}-folds = {totalAccuracy}")
print(
f"- Average F1-Score of Transformer across {self.kFolds}-folds = {totalFScore}")
# f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore/self.kFolds}")
print(
f"- Average Confusion Matrix of Transformer across {self.kFolds}-folds:")
# sns.heatmap(totalConfusion_matrix/self.kFolds, annot=True)
sns.heatmap(totalConfusion_matrix, annot=True)
plt.show()
def predict(self, X):
# Load the model
self.model.load_weights("transformer_result/predict_class.h5")
# Load the tokenizer
with open('transformer_result/tokenizer.pickle', 'rb') as handle:
self.tokenizer = pickle.load(handle)
# Load the label encoder
with open('transformer_result/label_encoder.pickle', 'rb') as handle:
self.le = pickle.load(handle)
test_X = self.tokenizer.texts_to_sequences(X[self.X])
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
predictions = self.model.predict(test_X, verbose=2)
results = predictions.argmax(axis=1)
results = self.le.inverse_transform(results)
return results
num_heads = 250 # Number of attention heads
ff_dim = 500 # Hidden layer size in feed forward network inside transformer
embed_dim = 100 # how big is each word vector
# how many unique words to use (i.e num rows in embedding vector)
vocab_size = 10000 # Only consider the top k words
maxlen = 50 # Only consider the first 50 words of each review
# maxlen = int(df['cleaned_text'].str.split().str.len().max())
batch_size = 100 # how many samples to process at once
n_epochs = 30 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
debug = 0
model = TransformerTensorflow(vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, n_splits, debug, X="cleaned_text", y="Score")
df = pd.read_csv('/content/drive/MyDrive/colab/data/train_cleaned_new.csv', encoding='utf-8')
df['cleaned_text'] = df['cleaned_text'].astype(str)
_=model.fit(df[['cleaned_text']],df['Score'])
model, cM = loadDLModel("Transformer", "transformer/transFormersLematized")
prettyPrintDLModels(model, cM, "Transformer","lemmatization", 5)
-- with Normalization by lemmatization
--------------------Results--------------------
model, cM = loadDLModel(
"Transformer", "transformer/transformerCleanAndStopwordRemoved")
prettyPrintDLModels(model, cM, "Transformer","cleaning plus stopword removal", 5)
-- with Normalization by cleaning plus stopword removal
--------------------Results--------------------
model, cM = loadDLModel("Transformer", "transformer/transformers_cleaning")
prettyPrintDLModels(model, cM, "Transformer","only cleaning", 5)
-- with Normalization by only cleaning
--------------------Results--------------------
printmd("Comparing the results of the above models")
model_1, cM_1 = loadDLModel("transformer", "transformer/transformersLematized")
model_2, cM_2 = loadDLModel("transformer", "transformer/transformerCleanAndStopwordRemoved")
model_3, cM_3 = loadDLModel("transformer", "transformer/transformers_cleaning")
info = [
["transformer", "lemmatization", model_1, 5],
["transformer", "cleaning plus stopword removal", model_2, 5],
["transformer", "only cleaning", model_3, 5]
]
models = [
['Lemmatization', (model_1, cM_1)],
['Cleaning plus stopword removal', (model_2, cM_2)],
['Only cleaning', (model_3, cM_3)]
]
plot_dl_cm(
models,
info
)
dl_models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
[1][0]['fScore']] for i, x in enumerate(info)])
Comparing the results of the above models
| | Trained and Tested Deep Learning Model: transformer using Lemmatization | Trained and Tested Deep Learning Model: transformer using Cleaning plus stopword removal | Trained and Tested Deep Learning Model: transformer using Only cleaning |
|---|---|---|---|
| 0 | Average Accuracy of transformer across 5-folds = 72.24% | Average Accuracy of transformer across 5-folds = 72.42% | Average Accuracy of transformer across 5-folds = 79.15% |
| 1 | Average F1-Score of transformer across 5-folds = 68.46% | Average F1-Score of transformer across 5-folds = 69.90% | Average F1-Score of transformer across 5-folds = 78.65% |
dl_models_df = pd.DataFrame(dl_models_info, columns=['model', 'Preprocessing', 'model_results', 'K-Fold' , 'accuracy', 'f1_score'])
dl_models_df.drop(columns=['model_results'], inplace=True)
dl_models_df
| | model | Preprocessing | K-Fold | accuracy | f1_score |
|---|---|---|---|---|---|
| 0 | Bi-LSTM | lemmatization | 5 | 0.732238 | 0.700080 |
| 1 | Bi-LSTM | only cleaning | 5 | 0.790611 | 0.776052 |
| 2 | Bi-LSTM | lemmatization | 5 | 0.732238 | 0.700080 |
| 3 | Bi-LSTM | cleaning plus stopword removal | 5 | 0.736620 | 0.702082 |
| 4 | Bi-LSTM | only cleaning | 5 | 0.790611 | 0.776052 |
| 5 | transformer | lemmatization | 5 | 0.722357 | 0.684567 |
| 6 | transformer | cleaning plus stopword removal | 5 | 0.724221 | 0.698964 |
| 7 | transformer | only cleaning | 5 | 0.791540 | 0.786524 |
# plot the results
def plot_accuracy(df, title):
plt.figure(figsize=(20, 10))
# make a bar chart using plt
sns.barplot(x="model", y="accuracy", data=df)
plt.title(title)
plt.show()
plot_accuracy(dl_models_df,
"Accuracy of the Deep Learning models")
def plot_f1_score(df, title):
plt.figure(figsize=(20, 10))
sns.barplot(x="model", y="f1_score", data=df)
plt.title(title)
plt.show()
plot_f1_score(dl_models_df,
"F1-Score of the Deep Learning models")
plt.Figure(figsize=(20, 10))
sns.catplot(x="model", y="accuracy",
data=dl_models_df, kind="point", label='Models', size=5, aspect=4)
plt.title("Accuracy of the best model")
plt.show()
plt.Figure(figsize=(20, 10))
sns.catplot(x="model", y="f1_score",
data=dl_models_df, kind="point", label='Models', size=5, aspect=4)
plt.title("F1-Score of the best model")
plt.show()
class TransformerBlock(Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = Sequential(
[Dense(ff_dim, activation="relu"),
Dense(embed_dim),]
)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.dropout1 = Dropout(rate)
self.dropout2 = Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
class TransformerTensorflow(BaseEstimator, TransformerMixin):
def __init__(self, vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, kFolds, debug, X, y):
# Reproducing same results
self.vocab_size = vocab_size
self.maxlen = maxlen
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
inp = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
tb = transformer_block(inp)
GA = GlobalAveragePooling1D()(tb)
GA = Dropout(0.25)(GA)
l1 = Dense(1024*2, activation="relu")(GA)
l1 = Dropout(0.25)(l1)
l2 = Dense(1024*3, activation="relu")(l1)
l2 = Dropout(0.25)(l2)
# Can add more layers here like this
l3 = Dense(1024*4, activation="relu")(l2)
l3 = Dropout(0.25)(l3)
l4 = Dense(1024*5, activation="relu")(l3)
l4 = Dropout(0.25)(l4)
l5 = Dense(1024*6, activation="relu")(l4)
l5 = Dropout(0.25)(l5)
# And can change the number of layers
outputs = Dense(5, activation="softmax")(l5)
self.model = Model(inputs=inputs, outputs=outputs)
self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
self.n_epochs = n_epochs
self.batch_size = batch_size
self.kFolds = kFolds
self.debug = debug
self.X = X
self.y = y
self.tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
self.le = LabelEncoder()
def plot1(self,history):
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
## Accuracy plot
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
## Loss plot
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
def plot2(self,history):
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
#plt.gca().set_ylim(0,1)
plt.show()
def fit(self, X, y=None):
xDf = pd.DataFrame(X, columns=[self.X])
xDf[self.y] = y.to_list()
totalFScore = 0
totalAccuracy = 0
totalConfusion_matrix = None
kfold = StratifiedKFold(n_splits=self.kFolds,
shuffle=True)
foldCounter = 0
bestModel = None
bestValidationF1 = 0
# Do the test train split
# train_X, test_X, train_y, test_y = train_test_split(
# xDf[[self.X]], xDf[self.y], test_size=0.2, random_state=7)
for train_index, test_index in kfold.split(xDf[[self.X]], xDf[self.y]):
foldCounter += 1
train_X, test_X = xDf.iloc[train_index][[
self.X]], xDf.iloc[test_index][[self.X]]
train_y, test_y = xDf.iloc[train_index][self.y], xDf.iloc[test_index][self.y]
self.tokenizer.fit_on_texts(list(train_X[self.X]))
train_X = self.tokenizer.texts_to_sequences(train_X[self.X])
test_X = self.tokenizer.texts_to_sequences(test_X[self.X])
# Pad the sentences
train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=self.maxlen)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
train_y = self.le.fit_transform(train_y)
test_y = self.le.transform(test_y)
history = self.model.fit(train_X, train_y, batch_size=self.batch_size, epochs=self.n_epochs, validation_data=(test_X, test_y) )
self.plot1(history)
self.plot2(history)
results = self.model.predict(test_X,verbose=2).argmax(axis=1)
fSc = f1_score(test_y, results, average='weighted')
# if fSc > bestValidationF1:
bestValidationF1 = fSc
bestModel = self.model
totalAccuracy += accuracy_score(test_y, results)
totalFScore += fSc
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
test_y, results) if totalConfusion_matrix is not None else confusion_matrix(test_y, results)
bestModel.save_weights("transformer_result/predict_class.h5")
# Save tokenizer
with open('transformer_result3Classes/tokenizer.pickle', 'wb') as handle:
pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Save label encoder
with open('transformer_result3Classes/label_encoder.pickle', 'wb') as handle:
pickle.dump(self.le, handle, protocol=pickle.HIGHEST_PROTOCOL)
gc.collect()
totalAccuracy = totalAccuracy/self.kFolds
totalFScore = totalFScore/self.kFolds
totalConfusion_matrix = totalConfusion_matrix/self.kFolds
# Save the confusion matrix np save
with open(f"transformer_result/transformer_confusion_matrix.npy","wb") as of:
np.save(of, totalConfusion_matrix)
# Save the file as json
with open(f"transformer_result/transformer_stats.json", 'w') as f:
json.dump({'accuracy':totalAccuracy ,'fScore':totalFScore}, f)
print("## Trained and Tested Model: Transformer" +
"\n\t - using lemmitization for tokenization" +
f"\n\t - {'without stratification on an unbalanced dataset'if len(X)>2000 else 'on a balanced dataset'}")
print("--"*10+"Results" + "--"*10)
print(
# f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy/self.kFolds}")
f"- Average Accuracy of Transformer across {self.kFolds}-folds = {totalAccuracy}")
print(
f"- Average F1-Score of Transformer across {self.kFolds}-folds = {totalFScore}")
# f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore/self.kFolds}")
print(
f"- Average Confusion Matrix of Transformer across {self.kFolds}-folds:")
# sns.heatmap(totalConfusion_matrix/self.kFolds, annot=True)
sns.heatmap(totalConfusion_matrix, annot=True)
plt.show()
def predict(self, X):
# Load the model
self.model.load_weights("transformer_result/predict_class.h5")
# Load the tokenizer
with open('transformer_result3Classes/tokenizer.pickle', 'rb') as handle:
self.tokenizer = pickle.load(handle)
# Load the label encoder
with open('transformer_result3Classes/label_encoder.pickle', 'rb') as handle:
self.le = pickle.load(handle)
test_X = self.tokenizer.texts_to_sequences(X[self.X])
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
predictions = self.model.predict(test_X, verbose=2)
results = predictions.argmax(axis=1)
results = self.le.inverse_transform(results)
return results
num_heads = 80 # Number of attention heads
ff_dim = 128 # Hidden layer size in feed forward network inside transformer
embed_dim = 100 # how big is each word vector
# how many unique words to use (i.e num rows in embedding vector)
vocab_size = 10000 # Only consider the top k words
maxlen = 200 # Only consider the first 200 words of each review
# maxlen = int(df['cleaned_text'].str.split().str.len().max())
batch_size = 512 # how many samples to process at once
n_epochs = 20 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
debug = 0
model = TransformerTensorflow(vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, n_splits, debug, X="cleaned_text", y="Score")
df = pd.read_csv('data/train_cleaned_new.csv', encoding='utf-8')
df['cleaned_text'] = df['cleaned_text'].astype(str)
df.loc[:, 'Score'] = df['Score'].map({5: 1, 4: 1, 1: -1, 2: -1, 3: 0})
'''
Uncomment the following line to fit the model on the training data.
'''
# _=model.fit(df[['cleaned_text']],df['Score'])
'\nUncomment the following line to fit the model on the training data.\n'
model_1, cM_1 = loadDLModel("Transformer", "transformer/transformer_result3Classes")
model_2, cM_2 = loadDLModel("Transformer", "transformer/transformers_cleaning")
info = [
["Transformer", "only cleaning (3 classes)", model_1, 5],
["Transformer", "only cleaning (5 classes)", model_2, 5]
]
models = [
['Only cleaning (3 classes)', (model_1, cM_1)],
['Only cleaning (5 classes)', (model_2, cM_2)]
]
plot_dl_cm(
models,
info,
only_table=True,
)
# dl_models_info.extend([x + [models[i][1][0]['accuracy'], models[i]
# [1][0]['fScore']] for i, x in enumerate(info)])
| | Trained and Tested Deep Learning Model: Transformer using Only cleaning (3 classes) | Trained and Tested Deep Learning Model: Transformer using Only cleaning (5 classes) |
|---|---|---|
| 0 | Average Accuracy of Transformer across 5-folds = 90.51% | Average Accuracy of Transformer across 5-folds = 79.15% |
| 1 | Average F1-Score of Transformer across 5-folds = 90.42% | Average F1-Score of Transformer across 5-folds = 78.65% |
From the results of the evaluation, the transformer clearly performs much better on 3 classes than on 5. This is because it is hard to distinguish between similar ratings such as 1 & 2 and 4 & 5, as the clustering experiment showed. Moreover, ratings 1 & 2 and 4 & 5 share many common words, as seen in the data visualization, which makes them difficult to separate.
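A quick sanity check of this in the numbers, assuming cM_2 loaded above is the fold-averaged 5x5 confusion matrix of the 5-class transformer:
import numpy as np

# share of misclassifications that land on an adjacent rating (1<->2, 2<->3, ...)
total_errors = cM_2.sum() - np.trace(cM_2)
adjacent_errors = sum(cM_2[i, i + 1] + cM_2[i + 1, i] for i in range(cM_2.shape[0] - 1))
print(f"{adjacent_errors / total_errors:.1%} of the 5-class errors fall on an adjacent rating")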
class BiLSTM(nn.Module):
def __init__(self, le, embedding_matrix,max_features, embed_size):
super(BiLSTM, self).__init__()
self.hidden_size = 128
drp = 0.25
n_classes = len(le.classes_)
self.embedding = nn.Embedding(max_features, embed_size)
self.embedding.weight = nn.Parameter(
torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = False
self.lstm = nn.LSTM(embed_size, self.hidden_size,
bidirectional=True, batch_first=True)
self.linear = nn.Linear(self.hidden_size*4, 200)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(drp)
self.out = nn.Linear(200, n_classes)
def forward(self, x):
# rint(x.size())
h_embedding = self.embedding(x)
#_embedding = torch.squeeze(torch.unsqueeze(h_embedding, 0))
h_lstm, _ = self.lstm(h_embedding)
avg_pool = torch.mean(h_lstm, 1)
max_pool, _ = torch.max(h_lstm, 1)
conc = torch.cat((avg_pool, max_pool), 1)
conc = self.relu(self.linear(conc))
conc = self.dropout(conc)
out = self.out(conc)
return out
class LstmModelPytorch(BaseEstimator, TransformerMixin):
def __init__(self, max_features, n_epochs, batch_size, maxlen, embed_size, kFolds, debug, X, y):
# Reproducing same results
self.max_features = max_features
self.le = LabelEncoder()
self.tokenizer = Tokenizer(num_words=self.max_features)
self.n_epochs = n_epochs
self.loss_fn = nn.CrossEntropyLoss(reduction='mean')
self.batch_size = batch_size
self.maxlen = maxlen
self.embed_size = embed_size
self.kFolds = kFolds
self.debug = debug
self.X = X
self.y = y
def load_glove(self, word_index, embed_size):
EMBEDDING_FILE = 'glove.6B/glove.6B.50d.txt'
def get_coefs(word, *arr): return word, np.asarray(arr,
dtype='float32')[:300]
embeddings_index = dict(get_coefs(*o.split(" "))
for o in open(EMBEDDING_FILE, encoding="utf8"))
all_embs = np.stack(embeddings_index.values())
emb_mean, emb_std = -0.005838499, 0.48782197
embed_size = all_embs.shape[1]
nb_words = min(self.max_features, len(word_index)+1)
embedding_matrix = np.random.normal(
emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
if i >= self.max_features:
continue
embedding_vector = embeddings_index.get(word)
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
else:
embedding_vector = embeddings_index.get(word.capitalize())
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
return embedding_matrix
def plot_graph(self, epochs, train_loss, val_loss):
fig = plt.figure(figsize=(12, 12))
plt.title("Train/Validation Loss")
plt.plot(list(np.arange(epochs) + 1), train_loss, label='train')
plt.plot(list(np.arange(epochs) + 1), val_loss, label='validation')
plt.xlabel('num_epochs', fontsize=12)
plt.ylabel('loss', fontsize=12)
plt.legend(loc='best')
plt.show()
def fit(self, X, y=None):
xDf = pd.DataFrame(X, columns=[self.X])
xDf[self.y] = y.to_list()
average_trainingLoss = []
average_validationLoss = []
totalFScore = 0
totalAccuracy = 0
totalConfusion_matrix = None
kfold = StratifiedKFold(n_splits=self.kFolds,
shuffle=True)
foldCounter = 0
bestModel = None
bestValidationF1 = 0
# # Do the test train split
# train_X, test_X, train_y, test_y = train_test_split(
# xDf[[self.X]], xDf[self.y], test_size=0.2, random_state=7,stratify=y)
for train_index, test_index in kfold.split(xDf[[self.X]], xDf[self.y]):
foldCounter += 1
train_X, test_X = xDf.iloc[train_index][[
self.X]], xDf.iloc[test_index][[self.X]]
train_y, test_y = xDf.iloc[train_index][self.y], xDf.iloc[test_index][self.y]
self.tokenizer.fit_on_texts(list(train_X[self.X]))
train_X = self.tokenizer.texts_to_sequences(train_X[self.X])
test_X = self.tokenizer.texts_to_sequences(test_X[self.X])
if self.debug:
self.embedding_matrix = np.random.randn(120000, 300)
else:
self.embedding_matrix = self.load_glove(
self.tokenizer.word_index, self.embed_size)
# Pad the sentences
train_X = pad_sequences(train_X, maxlen=self.maxlen)
test_X = pad_sequences(test_X, maxlen=self.maxlen)
train_y = self.le.fit_transform(train_y.values)
test_y = self.le.transform(test_y.values)
# Load train and test in CUDA Memory
x_train = torch.tensor(train_X, dtype=torch.long).cuda()
y_train = torch.tensor(train_y, dtype=torch.long).cuda()
x_cv = torch.tensor(test_X, dtype=torch.long).cuda()
y_cv = torch.tensor(test_y, dtype=torch.long).cuda()
# Create Torch datasets
train = torch.utils.data.TensorDataset(x_train, y_train)
valid = torch.utils.data.TensorDataset(x_cv, y_cv)
self.model = BiLSTM(self.le, self.embedding_matrix,self.max_features,self.embed_size)
self.optimizer = torch.optim.Adam(
filter(lambda p: p.requires_grad, self.model.parameters()), lr=0.001)
self.model.cuda()
# Create Data Loaders
train_loader = torch.utils.data.DataLoader(
train, batch_size=self.batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(
valid, batch_size=self.batch_size, shuffle=False)
train_loss = []
valid_loss = []
for epoch in range(self.n_epochs):
start_time = time.time()
# Set model to train configuration
self.model.train()
avg_loss = 0.
for i, (x_batch, y_batch) in enumerate(train_loader):
# Predict/Forward Pass
y_pred = self.model(x_batch)
# Compute loss
self.loss = self.loss_fn(y_pred, y_batch)
self.optimizer.zero_grad()
self.loss.backward()
self.optimizer.step()
avg_loss += self.loss.item() / len(train_loader)
# Set model to validation configuration -Doesn't get trained here
self.model.eval()
avg_val_loss = 0.
val_preds = np.zeros((len(x_cv), len(self.le.classes_)))
for i, (x_batch, y_batch) in enumerate(valid_loader):
y_pred = self.model(x_batch).detach()
avg_val_loss += self.loss_fn(y_pred,
y_batch).item() / len(valid_loader)
# keep/store predictions
val_preds[i * self.batch_size:(i+1) *
self.batch_size] = F.softmax(y_pred).cpu().numpy()
# Check Accuracy
val_accuracy = sum(val_preds.argmax(
axis=1) == test_y)/len(test_y)
train_loss.append(avg_loss)
valid_loss.append(avg_val_loss)
elapsed_time = time.time() - start_time
print('Epoch {}/{} at {} fold: \t loss={:.4f} \t val_loss={:.4f} \t val_acc={:.4f} \t time={:.2f}s'.format(
epoch + 1, self.n_epochs, foldCounter, avg_loss, avg_val_loss, val_accuracy, elapsed_time))
average_trainingLoss.append(train_loss)
average_validationLoss.append(valid_loss)
y_true = [self.le.classes_[x] for x in test_y]
y_pred = [self.le.classes_[x] for x in val_preds.argmax(axis=1)]
fSc = f1_score(y_true, y_pred, average='weighted')
if fSc > bestValidationF1:
bestValidationF1 = fSc
bestModel = self.model
totalAccuracy += accuracy_score(y_true, y_pred)
totalFScore += fSc
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
y_true, y_pred) if totalConfusion_matrix is not None else confusion_matrix(y_true, y_pred)
torch.save(bestModel, 'lstm3CLasses/bilstm_model')
torch.save(self.tokenizer, 'lstm3CLasses/bilstm_model_tokenizer')
torch.save(self.le, 'lstm3CLasses/bilstm_model_labelencoder')
gc.collect()
# Element wise sum the average training and validation loss
average_trainingLoss = np.array(average_trainingLoss).sum(axis=0)
average_validationLoss = np.array(average_validationLoss).sum(axis=0)
self.plot_graph(self.n_epochs, average_trainingLoss,
average_validationLoss)
totalAccuracy = totalAccuracy/self.kFolds
totalFScore = totalFScore/self.kFolds
totalConfusion_matrix = totalConfusion_matrix/self.kFolds
# Save the confusion matrix np save
with open(f"lstm_confusion_matrix.npy", "wb") as of:
np.save(of, totalConfusion_matrix)
# Save the file as json
with open(f"stats.json", 'w') as f:
json.dump({'accuracy': totalAccuracy, 'fScore': totalFScore}, f)
print("## Trained and Tested Model: BiLSTM" +
"\n\t - using lemmitization for tokenization" +
"\n\t - with Glove Embeddings for vectorization" +
f"\n\t - {'without stratification on an unbalanced dataset'if len(X)>2000 else 'on a statified balanced dataset'}")
print("--"*10+"Results" + "--"*10)
print(
# f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy/self.kFolds}")
f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy}")
print(
f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore}")
# f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore/self.kFolds}")
print(
f"- Average Confusion Matrix of BiLSTM across {self.kFolds}-folds:")
# sns.heatmap(totalConfusion_matrix/self.kFolds, annot=True)
sns.heatmap(totalConfusion_matrix, annot=True)
plt.show()
def predict(self, X):
self.model = torch.load('lstm3CLasses/bilstm_model')
self.tokenizer = torch.load('lstm3CLasses/bilstm_model_tokenizer')
self.le = torch.load('lstm3CLasses/bilstm_model_labelencoder')
# generate a list of placeholder labels (all 1s), same length as X
y = [1 for _ in range(len(X))]
test_X = self.tokenizer.texts_to_sequences(X[self.X])
test_X = pad_sequences(test_X, maxlen=self.maxlen)
test_y = self.le.transform(y)
x_cv = torch.tensor(test_X, dtype=torch.long).cuda()
y_cv = torch.tensor(test_y, dtype=torch.long).cuda()
valid = torch.utils.data.TensorDataset(x_cv, y_cv)
valid_loader = torch.utils.data.DataLoader(
valid, batch_size=self.batch_size, shuffle=False)
# Set model to validation configuration -Doesn't get trained here
self.model.eval()
val_preds = np.zeros((len(x_cv), len(self.le.classes_)))
for i, (x_batch, y_batch) in enumerate(valid_loader):
y_pred = self.model(x_batch).detach()
# keep/store predictions
val_preds[i * self.batch_size:(i+1) *
self.batch_size] = F.softmax(y_pred).cpu().numpy()
y_new = [self.le.classes_[x] for x in val_preds.argmax(axis=1)]
return y_new
embed_size = 50 # how big is each word vector
# how many unique words to use (i.e num rows in embedding vector)
max_features = 120000
# max number of words in a review to use
maxlen = 1000
# maxlen = int(df['cleaned_text'].str.split().str.len().max())
batch_size = 64 # how many samples to process at once
n_epochs = 1 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
debug = 0
lstmModel = LstmModelPytorch(max_features=max_features,
n_epochs=n_epochs, batch_size=batch_size, maxlen=maxlen, embed_size=embed_size, kFolds=n_splits, debug=debug, X="cleaned_text", y="Score")
df = pd.read_csv('data/train_cleaned_new.csv', encoding='utf-8')
df['cleaned_text'] = df['cleaned_text'].astype(str)
df.loc[:, 'Score'] = df['Score'].map({5: 1, 4: 1, 1: -1, 2: -1, 3: 0})
'''
Uncomment the following line to train the Bi-LSTM model.
'''
# _ = lstmModel.fit(df[["cleaned_text"]], df["Score"])
'\nUncomment the following line to train the Bi-LSTM model.\n'
model_1, cM_1 = loadDLModel("lstm", "lstm/lstm3Classes")
prettyPrintDLModels(model_1, cM_1, "Bi-Lstm","only cleaning", 5)
model_2, cM_2 = loadDLModel("Transformer", "transformer/transformers_cleaning")
# prettyPrintDLModels(model, cM, "Transformer","only cleaning", 5)
info = [
["Bi-Lstm", "only cleaning (3 classes)", model_1, 5],
["Bi-Lstm", "only cleaning (5 classes)", model_2, 5]
]
models = [
['Only cleaning (3 classes)', (model_1, cM_1)],
['Only cleaning (5 classes)', (model_2, cM_2)]
]
plot_dl_cm(
models,
info,
only_table=True,
)
-- with Normalization by only cleaning
--------------------Results--------------------
| | Trained and Tested Deep Learning Model: Bi-Lstm using Only cleaning (3 classes) | Trained and Tested Deep Learning Model: Bi-Lstm using Only cleaning (5 classes) |
|---|---|---|
| 0 | Average Accuracy of Bi-Lstm across 5-folds = 87.87% | Average Accuracy of Bi-Lstm across 5-folds = 79.15% |
| 1 | Average F1-Score of Bi-Lstm across 5-folds = 93.54% | Average F1-Score of Bi-Lstm across 5-folds = 78.65% |
From the results of the evaluation, and just like the transformer, the Bi-LSTM clearly performs much better on 3 classes than on 5. In fairness, even a human would find it hard to distinguish a rating of 1 from 2, or 4 from 5.
from gensim import corpora
from gensim.models import LsiModel
from gensim.models.coherencemodel import CoherenceModel
import re
import math
tfidf_vectorizer = TfidfVectorizer(strip_accents = 'unicode',
stop_words = 'english',
lowercase = True,
token_pattern = r'\b[a-zA-Z]{3,}\b',
max_df = 0.5,
min_df = 10)
tfidf_score1 = tfidf_vectorizer.fit_transform(train[train['Score'] == 1]['Normalized_Review_text'].astype('str').values)
lda_tf_score1 = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tf_score1.fit(tfidf_score1)
LatentDirichletAllocation(n_components=15, random_state=0)
pyLDAvis.sklearn.prepare(lda_tf_score1, tfidf_score1, tfidf_vectorizer)
tfidf_score5 = tfidf_vectorizer.fit_transform(train[train['Score'] == 5]['Normalized_Review_text'].astype('str').values)
lda_tf_score5 = LatentDirichletAllocation(n_components=15, random_state=0)
lda_tf_score5.fit(tfidf_score5)
pyLDAvis.sklearn.prepare(lda_tf_score5, tfidf_score5, tfidf_vectorizer)
The LDA models for both scores surface largely the same topics. The main difference is that the Score = 1 LDA reflects a negative view of those topics, whereas the Score = 5 LDA reflects a positive view, as expected.
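Beyond the interactive pyLDAvis views, the top terms per topic can also be printed directly from a fitted sklearn LDA model. A minimal sketch (the helper name and the choice of 10 terms are arbitrary; note that tfidf_vectorizer was last fitted on the Score = 5 subset, so it pairs with lda_tf_score5):
def print_top_terms(lda_model, vectorizer, n_terms=10):
    # components_ holds one weight vector per topic over the vocabulary
    terms = (vectorizer.get_feature_names_out()
             if hasattr(vectorizer, 'get_feature_names_out')
             else vectorizer.get_feature_names())
    for topic_idx, weights in enumerate(lda_model.components_):
        top = weights.argsort()[::-1][:n_terms]
        print(f"Topic {topic_idx + 1}: " + ", ".join(terms[i] for i in top))
print_top_terms(lda_tf_score5, tfidf_vectorizer)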
def prepare_corpus(corpus):
# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary([str(doc).split() for doc in corpus['Normalized_Review_text']])
# Converting the list of documents (corpus) into a Document-Term Matrix using the dictionary prepared above
doc_term_matrix = [dictionary.doc2bow(str(doc).split()) for doc in corpus['Normalized_Review_text']]
return [dictionary, doc_term_matrix]
lsa_doc_score1 = train[train['Score'] == 1]
lsa_doc_score5 = train[train['Score'] == 5]
corpus1 = prepare_corpus(lsa_doc_score1)
corpus5 = prepare_corpus(lsa_doc_score5)
lsamodel1 = LsiModel(corpus1[1], num_topics=15, id2word = corpus1[0])
lsamodel5 = LsiModel(corpus5[1], num_topics=15, id2word = corpus5[0])
def print_topic_model(lsamodel):
topic_dfs = []
topics = lsamodel.print_topics(num_topics=15, num_words=15)
for topic in topics:
line = topic[1]
values = re.findall("0\.[0-9]+(?=\*)", line, re.DOTALL)
terms = re.findall("[a-z]+", line)
obj = {'Terms': terms, "Values": values}
topic_dfs.append(pd.DataFrame(data =obj))
return topic_dfs
def visualize_topic_model(model, rows):
topic_dfs = print_topic_model(model)
cols = math.ceil(len(topic_dfs)/rows)
fig, ax = plt.subplots(nrows=rows, ncols=cols, figsize=(15,15))
for i in range(len(topic_dfs)):
axes = ax[math.floor(i/cols)][i%cols]
sns.barplot(x='Values', y='Terms', data=topic_dfs[i], ax=axes )
axes.set(xlim=(0,0.8))
axes.title.set_text('Topic' + str(i+1))
fig.tight_layout()
visualize_topic_model(lsamodel1, 5)
visualize_topic_model(lsamodel5, 5)
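The CoherenceModel imported above is not used in the cells so far; a minimal sketch of how it could be used to sanity-check the choice of 15 topics for the two LSI models, reusing the dataframes and dictionaries built with prepare_corpus (the helper name is illustrative):
def lsa_coherence(model, corpus_df, dictionary):
    # c_v coherence needs the tokenised texts, not just the bag-of-words corpus
    texts = [str(doc).split() for doc in corpus_df['Normalized_Review_text']]
    cm = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()
print('Coherence (Score = 1):', lsa_coherence(lsamodel1, lsa_doc_score1, corpus1[0]))
print('Coherence (Score = 5):', lsa_coherence(lsamodel5, lsa_doc_score5, corpus5[0]))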
After evaluating the transformer using 5-fold cross-validation we found that it performs better on text that was only cleaned. Moreover, we tested different parameter settings for the transformer with 5-fold cross-validation on a TPU in Google Colab to find the optimal parameters (a sketch of such a search is shown after the transformer class below). We then trained the transformer with the optimal parameters on the full given train dataset to get the best result.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
import numpy as np
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, precision_score, recall_score
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import pickle
from sklearn.preprocessing import OneHotEncoder
import gc
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
import json
class TransformerBlock(Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
super(TransformerBlock, self).__init__()
self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = Sequential(
[Dense(ff_dim, activation="relu"),
Dense(embed_dim),]
)
self.layernorm1 = LayerNormalization(epsilon=1e-6)
self.layernorm2 = LayerNormalization(epsilon=1e-6)
self.dropout1 = Dropout(rate)
self.dropout2 = Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class TokenAndPositionEmbedding(Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
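As a quick sanity check of the two custom layers above, a toy batch can be pushed through them to confirm the output shapes (the sizes below are arbitrary):
# Toy batch: 2 sequences of 8 token ids drawn from a vocabulary of 100 words
dummy_ids = tf.random.uniform((2, 8), minval=0, maxval=100, dtype=tf.int32)
emb = TokenAndPositionEmbedding(maxlen=8, vocab_size=100, embed_dim=16)(dummy_ids)
out = TransformerBlock(embed_dim=16, num_heads=2, ff_dim=32)(emb, training=False)
print(emb.shape, out.shape)  # both (2, 8, 16): the block preserves the sequence shape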
class TransformerTensorflow(BaseEstimator, TransformerMixin):
def __init__(self, vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, kFolds, debug, X, y):
# Build the transformer classification model and store the training configuration
self.vocab_size = vocab_size
self.maxlen = maxlen
inputs = Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
inp = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
tb = transformer_block(inp)
GA = GlobalAveragePooling1D()(tb)
GA = Dropout(0.25)(GA)
l1 = Dense(100, activation="relu")(GA)
l1 = Dropout(0.015)(l1)
l2 = Dense(200, activation="relu")(l1)
l2 = Dropout(0.055)(l2)
# The number and size of these dense layers can be adjusted
outputs = Dense(5, activation="softmax")(l2)
self.model = Model(inputs=inputs, outputs=outputs)
self.model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
self.n_epochs = n_epochs
self.batch_size = batch_size
self.kFolds = kFolds
self.debug = debug
self.X = X
self.y = y
self.tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
self.le = LabelEncoder()
def plot1(self,history):
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
## Accuracy plot
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
## Loss plot
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
def plot2(self,history):
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
#plt.gca().set_ylim(0,1)
plt.show()
def fit(self, X, y=None):
xDf = pd.DataFrame(X, columns=[self.X])
xDf[self.y] = y.to_list()
totalFScore = 0
totalAccuracy = 0
totalConfusion_matrix = None
kfold = StratifiedKFold(n_splits=self.kFolds,
shuffle=True, random_state=7)
foldCounter = 0
bestModel = None
bestValidationF1 = 0
# Do the test train split
train_X, test_X, train_y, test_y = train_test_split(xDf[[self.X]], xDf[self.y], test_size=5/xDf.shape[0])
self.tokenizer.fit_on_texts(list(train_X[self.X]))
train_X = self.tokenizer.texts_to_sequences(train_X[self.X])
test_X = self.tokenizer.texts_to_sequences(test_X[self.X])
# Pad the sentences
train_X = keras.preprocessing.sequence.pad_sequences(train_X, maxlen=self.maxlen)
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
train_y = self.le.fit_transform(train_y)
test_y = self.le.transform(test_y)
history = self.model.fit(train_X, train_y, batch_size=self.batch_size, epochs=self.n_epochs, validation_data=(test_X, test_y) )
self.plot1(history)
self.plot2(history)
results = self.model.predict(test_X,verbose=2).argmax(axis=1)
fSc = f1_score(test_y, results, average='weighted')
# if fSc > bestValidationF1:
bestValidationF1 = fSc
bestModel = self.model
totalAccuracy += accuracy_score(test_y, results)
totalFScore += fSc
totalConfusion_matrix = totalConfusion_matrix + confusion_matrix(
test_y, results) if totalConfusion_matrix is not None else confusion_matrix(test_y, results)
bestModel.save_weights("predict_class.h5")
# Save tokenizer
with open('tokenizer.pickle', 'wb') as handle:
pickle.dump(self.tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Save label encoder
with open('label_encoder.pickle', 'wb') as handle:
pickle.dump(self.le, handle, protocol=pickle.HIGHEST_PROTOCOL)
# totalAccuracy = totalAccuracy/self.k_fold
# totalFScore = totalFScore/self.k_fold
# totalConfusion_matrix = totalConfusion_matrix/self.k_fold
# Save the confusion matrix as a .npy file
with open("lstm_confusion_matrix.npy", "wb") as of:
np.save(of, totalConfusion_matrix)
# Save the accuracy and F1 statistics as json
with open("stats.json", 'w') as f:
json.dump({'accuracy': totalAccuracy, 'fScore': totalFScore}, f)
print("## Trained and Tested Model: Transformer" +
"\n\t - using lemmitization for tokenization" +
f"\n\t - {'without stratification on an unbalanced dataset'if len(X)>2000 else 'on a balanced dataset'}")
print("--"*10+"Results" + "--"*10)
print(
# f"- Average Accuracy of BiLSTM across {self.kFolds}-folds = {totalAccuracy/self.kFolds}")
f"- Average Accuracy of Transformer across {self.kFolds}-folds = {totalAccuracy}")
print(
f"- Average F1-Score of Transformer across {self.kFolds}-folds = {totalFScore}")
# f"- Average F1-Score of BiLSTM across {self.kFolds}-folds = {totalFScore/self.kFolds}")
print(
f"- Average Confustion Matrix of Transformer across {self.kFolds}-folds:")
# sns.heatmap(totalConfusion_matrix/self.kFolds, annot=True)
sns.heatmap(totalConfusion_matrix, annot=True)
plt.show()
def predict(self, X):
# Load the model
self.model.load_weights("kaggle/predict_class.h5")
# Load the tokenizer
with open('kaggle/tokenizer.pickle', 'rb') as handle:
self.tokenizer = pickle.load(handle)
# Load the label encoder
with open('kaggle/label_encoder.pickle', 'rb') as handle:
self.le = pickle.load(handle)
test_X = self.tokenizer.texts_to_sequences(X[self.X])
test_X = keras.preprocessing.sequence.pad_sequences(test_X, maxlen=self.maxlen)
predictions = self.model.predict(test_X, verbose=2)
results = predictions.argmax(axis=1)
results = self.le.inverse_transform(results)
return results
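The hyperparameter search mentioned earlier is not reproduced in this notebook. The sketch below shows what such a loop could look like, reusing the TransformerBlock and TokenAndPositionEmbedding layers defined above; the helper cross_validate_transformer and the candidate grid are illustrative only, not the exact configuration that was run on Colab.
from itertools import product
def cross_validate_transformer(texts, labels, num_heads, ff_dim, n_splits=5):
    # Hypothetical helper: train one small transformer per fold and
    # return the mean weighted F1 across folds.
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=7)
    scores = []
    for train_idx, val_idx in skf.split(texts, labels):
        tok = keras.preprocessing.text.Tokenizer(num_words=1000)
        tok.fit_on_texts(texts.iloc[train_idx])
        x_tr = keras.preprocessing.sequence.pad_sequences(
            tok.texts_to_sequences(texts.iloc[train_idx]), maxlen=50)
        x_va = keras.preprocessing.sequence.pad_sequences(
            tok.texts_to_sequences(texts.iloc[val_idx]), maxlen=50)
        le = LabelEncoder()
        y_tr = le.fit_transform(labels.iloc[train_idx])
        y_va = le.transform(labels.iloc[val_idx])
        inputs = Input(shape=(50,))
        x = TokenAndPositionEmbedding(50, 1000, 50)(inputs)
        x = TransformerBlock(50, num_heads, ff_dim)(x)
        x = GlobalAveragePooling1D()(x)
        outputs = Dense(len(le.classes_), activation="softmax")(x)
        fold_model = Model(inputs=inputs, outputs=outputs)
        fold_model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        fold_model.fit(x_tr, y_tr, batch_size=1000, epochs=5, verbose=0)
        preds = fold_model.predict(x_va).argmax(axis=1)
        scores.append(f1_score(y_va, preds, average="weighted"))
    return float(np.mean(scores))
# Illustrative grid; uncomment to run (slow without a GPU/TPU)
# search_df = pd.read_csv('data/train_cleaned_new.csv', encoding='utf-8')
# search_df['cleaned_text'] = search_df['cleaned_text'].astype(str)
# for heads, ff in product([2, 4, 8], [32, 64, 128]):
#     print(heads, ff, cross_validate_transformer(search_df['cleaned_text'], search_df['Score'], heads, ff))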
def main():
# tqdm.pandas(desc='Progress')
warnings.filterwarnings("ignore")
df = pd.read_csv('data/train_cleaned_new.csv', encoding='utf-8')
df['cleaned_text'] = df['cleaned_text'].astype(str)
num_heads = 100 # Number of attention heads
ff_dim = 50 # Hidden layer size in feed forward network inside transformer
embed_dim = 50 # how big is each word vector
# how many unique words to use (i.e num rows in embedding vector)
vocab_size = 1000 # Only consider the top vocab_size most frequent words
maxlen = 50 # Only consider the first 50 words of each review
# maxlen = int(df['cleaned_text'].str.split().str.len().max())
batch_size = 1000 # how many samples to process at once
n_epochs = 200 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
debug = 0
model = TransformerTensorflow(vocab_size, maxlen, embed_dim , num_heads,ff_dim, n_epochs, batch_size, n_splits, debug, X="cleaned_text", y="Score")
_=model.fit(df[['cleaned_text']],df['Score'])
# load and Take the index from the csv file
new_df = pd.read_csv('data/test_cleaned_new.csv', encoding='utf-8')
new_df['cleaned_text'] = new_df['cleaned_text'].astype(str)
gc.collect()
y_pred = model.predict(new_df[["cleaned_text"]])
# save only y_pred to csv
new_df['Score'] = y_pred
new_df[['Id','Score']].to_csv('data/predicted_test_data.csv', index=False)
'''
Uncomment the following line to run main(), which trains the transformer and saves its predictions.
'''
# main()