NLTK also offers a few built-in classifiers that are suitable for various types of analyses, including sentiment analysis. The trick is to figure out which properties of our dataset are useful for classifying each piece of data into our desired categories. Let's start with a warm-up.
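As a quick reminder of the interface, NLTK's NaiveBayesClassifier trains on (feature dictionary, label) pairs, where each feature is typically a word-presence flag. Here is a minimal sketch with made-up data, purely to show the expected format:

import nltk

# Toy training pairs: each is (feature dict, label). The words and labels
# here are invented only to illustrate the input format.
toy_train = [
    ({"great": True, "awful": False}, True),
    ({"great": False, "awful": True}, False),
]
toy_clf = nltk.NaiveBayesClassifier.train(toy_train)
print(toy_clf.classify({"great": True, "awful": False}))  # True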
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle
import random
from nltk.tokenize import word_tokenize
import numpy as np
nltk.download(["stopwords", "vader_lexicon", "punkt"])
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanyuchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yanyuchen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yanyuchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
True
url = 'https://raw.githubusercontent.com/yanyuchen/sentiment-analysis/main/data/all_reviews.csv'
df = pd.read_csv(url)[['review', 'rating']].dropna()
df = df.reset_index(drop=True)
df.head()
| | review | rating |
| --- | --- | --- |
| 0 | My family took the tour ( BUY TICKETS IN ADVAN... | 5.0 |
| 1 | This is a must stop if you are in San Fran!!! ... | 5.0 |
| 2 | I did not expect to enjoy the tour as much as ... | 5.0 |
| 3 | San Francisco is completely unsafe. We bought ... | 1.0 |
| 4 | I had a 13-hour layover in San Francisco And I... | 4.0 |
Common words, called stop words, may have a negative effect on our analysis because they occur so often in the text, so we drop them before fitting our models.
stopwords = set(nltk.corpus.stopwords.words("english"))
stopwords.update(nltk.corpus.stopwords.words("spanish"))
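For instance, filtering a short made-up sentence against this stop-word list keeps only the content words:

sample = "The tour was amazing and the guide was very friendly"
tokens = [w for w in word_tokenize(sample) if w.lower() not in stopwords]
print(tokens)  # ['tour', 'amazing', 'guide', 'friendly']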
act = df.rating > 3  # label: True for ratings above 3 (positive), False otherwise
features = [(df.review[i], act[i]) for i in range(len(act))]
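Each entry in features pairs a raw review string with a boolean label, where True marks a positive review (rating above 3). For example:

text, label = features[0]
print(label)     # True, since the first review has a 5.0 rating
print(text[:50]) # the first 50 characters of the review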
Training the classifier involves randomly splitting the feature set so that one portion can be used for training and the other for evaluation. For reproducibility, we fix our random seed.
random.seed(220)
We split each review string into individual words and fit our model on the words that are not in the stop-word list.
train_count = round(len(features) * 0.8)
shuffle(features)
train = features[:train_count]
all_words = set(word.lower() for passage in train for word in word_tokenize(passage[0]) if word.lower() not in stopwords)
# t = [({word: (word in word_tokenize(x[0])) for word in all_words}, x[1]) for x in train]
# Faster equivalent of the line above: initialize every vocabulary word to
# False, then mark the words that actually appear in the review as True.
t = [({word: False for word in all_words}
      | {word: True for word in word_tokenize(passage[0]) if word.lower() not in stopwords},
      label) for passage, label in train]
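Note that the dict-union operator | used above requires Python 3.9 or newer. On older interpreters, an equivalent construction (a sketch of the same logic) is:

t = [({**{word: False for word in all_words},
       **{word: True for word in word_tokenize(passage[0]) if word.lower() not in stopwords}},
      label) for passage, label in train]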
classifier = nltk.NaiveBayesClassifier.train(t)
classifier.show_most_informative_features(15)
Most Informative Features
                       9 = True            False : True   =     11.0 : 1.0
                       $ = True            False : True   =      6.6 : 1.0
                       4 = True            False : True   =      6.6 : 1.0
                       8 = True            False : True   =      6.6 : 1.0
                      `` = True            False : True   =      6.6 : 1.0
                       Н = True            False : True   =      6.6 : 1.0
                       ア = True            False : True   =      6.6 : 1.0
                       子 = True            False : True   =      6.6 : 1.0
                       전 = True            False : True   =      6.6 : 1.0
                       È = True            False : True   =      4.7 : 1.0
                       c = True            False : True   =      4.0 : 1.0
                       Е = True            False : True   =      4.0 : 1.0
                       П = True            False : True   =      4.0 : 1.0
                       샌 = True            False : True   =      3.7 : 1.0
                       v = True            False : True   =      3.0 : 1.0
nltk.classify.accuracy(classifier, t)
0.8696296296296296
The result shows that our fitted model performs better than the previous naive approach on the training set. However, it is not obvious why these particular tokens end up as the most informative features. Next, let's see the model's performance on the testing set.
test = features[train_count:]
s = [{word: False for word in all_words} | {word: True for word in word_tokenize(passage[0]) if word.lower() not in stopwords}
for passage, _ in test]
act = [label for _, label in test]  # true labels of the testing set
pred = classifier.classify_many(s)
def accuracy(pred, act):
    return sum([pred[i] == act[i] for i in range(len(act))]) / len(act)
accuracy(pred, act)
0.875
The performance on the testing set is similar to that on the training set, which indicates that no overfitting occurred and suggests that the model's ability to generalize also outperforms the naive approach. Before we end this section, let's make predictions on the whole dataset.
review = [review for review in df.review] # preserve the order
all_w = [{word: False for word in all_words} | {word: True for word in word_tokenize(passage) if word.lower() not in stopwords}
for passage in review]
pred_all = classifier.classify_many(all_w)
pd.DataFrame(pred_all).to_csv('nltk.NaiveBayesClassifier_pred.csv', index=False)
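Before moving on to scikit-learn, note that the trained classifier can also score a single unseen review. The helper below (make_features, a name introduced here for illustration) mirrors the feature construction used for training, and the review text is made up:

def make_features(text):
    # Start with every vocabulary word set to False, then mark the words
    # that appear in the text as True (the same scheme used for training).
    present = {word: True for word in word_tokenize(text) if word.lower() not in stopwords}
    return {word: False for word in all_words} | present

print(classifier.classify(make_features("The tour was wonderful and worth every penny")))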
NLTK provides a class that can use most classifiers from the popular machine learning framework, scikit-learn. Many of the classifiers that scikit-learn provides can be instantiated quickly since they have defaults that often work well. Since NLTK allows us to integrate scikit-learn classifiers directly into its own classifier class, the training and classification processes will use the same methods we’ve already seen.
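As a minimal sketch of the wrapper pattern, using the feature sets t built earlier:

from sklearn.naive_bayes import BernoulliNB
from nltk.classify import SklearnClassifier

wrapped = SklearnClassifier(BernoulliNB())
wrapped.train(t)  # same train/classify interface as nltk.NaiveBayesClassifier
print(nltk.classify.accuracy(wrapped, t))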
from sklearn.naive_bayes import (BernoulliNB, ComplementNB, MultinomialNB,)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
classifiers = {
"BernoulliNB": BernoulliNB(),
"ComplementNB": ComplementNB(),
"MultinomialNB": MultinomialNB(),
"LogisticRegression": LogisticRegression(),
"DecisionTreeClassifier": DecisionTreeClassifier(),
"RandomForestClassifier": RandomForestClassifier(),
}
accuracy_list = []
pred_list = []
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(t)
    # training set
    accur = nltk.classify.accuracy(classifier, t)
    accuracy_list.append(accur)
    # testing set
    pred = classifier.classify_many(s)
    pred_list.append(accuracy(pred, act))
    # whole dataset
    pred_all = classifier.classify_many(all_w)
    pd.DataFrame(pred_all).to_csv(f'{name}_pred.csv', index=False)
    print(f"{name} finish")
BernoulliNB finish
ComplementNB finish
MultinomialNB finish
LogisticRegression finish
DecisionTreeClassifier finish
RandomForestClassifier finish
pd.DataFrame(accuracy_list, columns = ['accuracy'], index = list(classifiers.keys())).T
| | BernoulliNB | ComplementNB | MultinomialNB | LogisticRegression | DecisionTreeClassifier | RandomForestClassifier |
| --- | --- | --- | --- | --- | --- | --- |
| accuracy | 0.868418 | 0.468687 | 0.868418 | 0.868418 | 0.86963 | 0.86963 |
The result shows that all scikit-learn classifiers perform similarly except for ComplementNB, and their accuracy is comparable to that of NLTK's NaiveBayesClassifier.
pd.DataFrame(pred_list, columns = ['accuracy'], index = list(classifiers.keys())).T
| | BernoulliNB | ComplementNB | MultinomialNB | LogisticRegression | DecisionTreeClassifier | RandomForestClassifier |
| --- | --- | --- | --- | --- | --- | --- |
| accuracy | 0.875 | 0.473599 | 0.875 | 0.875 | 0.873922 | 0.875539 |
The performance on the testing set is also similar to that on the training set, indicating that no overfitting occurred and suggesting that, with the exception of ComplementNB, these classifiers generalize as well as NLTK's NaiveBayesClassifier.
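As a side note, the two accuracy lists can be combined into a single frame for an at-a-glance comparison of training and testing performance (a small sketch):

summary = pd.DataFrame(
    {"train": accuracy_list, "test": pred_list},
    index=list(classifiers.keys()),
)
print(summary)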