# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts
# Load the IMDB review dataset from the current working directory.
data = pd.read_csv("IMDB-Dataset.csv")

# Quick inspection of the frame. These are bare expressions: they display a
# result only in an interactive session (notebook / REPL); when the file is
# run as a plain script their values are computed and discarded.
data.head(5)                       # first five rows
data.shape                         # (number of rows, number of columns)
data.columns                       # column labels
data["sentiment"].value_counts()   # how many reviews carry each label
data.head(10)                      # first ten rows
data.tail()                        # last five rows
# Patterns for the first cleaning pass, compiled once at import time.
# Raw strings are used so that "\[", "\w" and "\d" reach the regex engine
# untouched instead of being (invalid) Python string escapes.
_BRACKETED_RE = re.compile(r"\[.*?\]")
_PUNCTUATION_RE = re.compile("[%s]" % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r"\w*\d\w*")


def clean_text1(text):
    """Return *text* after the first round of normalisation.

    Steps, applied in this order:

    1. lower-case the whole string;
    2. drop every ``[...]`` span (shortest match, confined to a single line
       because ``.`` does not match a newline);
    3. drop every character listed in ``string.punctuation``;
    4. drop every run of word characters that contains at least one digit.

    Whitespace is left as-is, so removed tokens may leave double spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = _BRACKETED_RE.sub("", text)
    text = _PUNCTUATION_RE.sub("", text)
    text = _WORD_WITH_DIGIT_RE.sub("", text)
    return text
def cleaned1(x):
    """Apply the first cleaning pass (``clean_text1``) to a single review."""
    return clean_text1(x)


# Run the first cleaning pass over every review and overwrite the column.
# Series.apply already returns a Series aligned with the frame's index, so it
# can be assigned directly without wrapping it in a DataFrame.
data["review"] = data["review"].apply(cleaned1)
data.head()
# second round of cleaning
# Characters removed by the second cleaning pass: typographic single and
# double quotes, the ellipsis character, straight quotes, commas and newlines.
# The previous pattern was written as '[''"",,,]', which Python reads as two
# adjacent string literals; the resulting character class only contained the
# double quote and the comma, so apostrophes and curly quotes survived.
_SECOND_PASS_TABLE = str.maketrans(
    "", "", "\u2018\u2019\u201c\u201d\u2026'\",\n"
)


def clean_text2(text):
    """Return *text* with leftover quote marks, commas and newlines removed.

    Newlines are deleted rather than replaced by a space, so words separated
    only by a line break are joined together.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # A translation table deletes all unwanted characters in a single pass.
    return text.translate(_SECOND_PASS_TABLE)
def cleaned2(x):
    """Apply the second cleaning pass (``clean_text2``) to a single review."""
    return clean_text2(x)


# Run the second cleaning pass over every review and overwrite the column.
# Series.apply already returns a Series aligned with the frame's index, so it
# can be assigned directly without wrapping it in a DataFrame.
data["review"] = data["review"].apply(cleaned2)
data.head()
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Features are the cleaned review texts, targets are the sentiment labels.
# The columns are selected by name (the same names used earlier in this file)
# rather than by position, so the result does not depend on the column order
# of the CSV.
x = data["review"].values
y = data["sentiment"].values

# Hold out 26% of the rows for evaluation; the fixed random_state keeps the
# split reproducible between runs.
xtrain, xtest, ytrain, ytest = tts(x, y, test_size=0.26, random_state=5)

# TF-IDF features feeding a logistic-regression classifier. Bundling both in
# a Pipeline means the vectorizer is fitted on the training texts only and
# the same transformation is applied automatically at prediction time.
tf = TfidfVectorizer()
classifier = LogisticRegression()
model = Pipeline([("vectorizer", tf), ("classifier", classifier)])
model.fit(xtrain, ytrain)

# Predicted labels for the held-out reviews.
ypred = model.predict(xtest)
# Accuracy on the held-out set. scikit-learn's signature is
# accuracy_score(y_true, y_pred); the value is stored and printed so it is
# visible when the file runs as a script, not only in an interactive session.
accuracy = accuracy_score(ytest, ypred)
print(accuracy)

# Confusion matrix: rows are the true labels, columns the predicted labels,
# both in sorted label order.
A = confusion_matrix(ytest, ypred)
print(A)

# F1 score for the class at index 0 of the matrix (the first label in sorted
# order).
# precision = correct predictions of that class / everything predicted as it,
#             i.e. A[0][0] over the sum of column 0.
# recall    = correct predictions of that class / everything truly in it,
#             i.e. A[0][0] over the sum of row 0.
precision = A[0][0] / (A[0][0] + A[1][0])
recall = A[0][0] / (A[0][0] + A[0][1])
F1 = 2 * recall * precision / (recall + precision)
print(F1)