Using NLTK’s Pre-Trained Sentiment Analyzer¶

Natural Language Toolkit (NLTK) is a leading platform for building Python programs to work with human language data. It already has a built-in, pretrained sentiment analyzer called VADER (Valence Aware Dictionary and sEntiment Reasoner).

Since VADER is pretrained, we can get preliminary results more quickly than with many other analyzers. However, VADER is best suited to the language used on social media: short sentences with slang and abbreviations. It is less accurate when rating longer, structured sentences, but it is often a good starting point. Let's see how VADER performs in our case.

Loading Libraries and Data¶

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle

nltk.download(["stopwords", "vader_lexicon", "punkt"])
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yanyuchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yanyuchen/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yanyuchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[1]:
True
In [12]:
url = 'https://raw.githubusercontent.com/yanyuchen/sentiment-analysis/main/data/all_reviews.csv'
df = pd.read_csv(url)[['review', 'rating']].dropna()
df = df.reset_index(drop=True)
df.head()
Out[12]:
review rating
0 My family took the tour ( BUY TICKETS IN ADVAN... 5.0
1 This is a must stop if you are in San Fran!!! ... 5.0
2 I did not expect to enjoy the tour as much as ... 5.0
3 San Francisco is completely unsafe. We bought ... 1.0
4 I had a 13-hour layover in San Francisco And I... 4.0

Create an Instance of the Pre-trained VADER¶

Since VADER has already been trained, we can evaluate its performance on the whole dataset. A naive model then classifies a review as positive or negative based on VADER's compound score.

In [13]:
sia = SentimentIntensityAnalyzer()
def is_positive(txt: str) -> bool:
    return sia.polarity_scores(txt)["compound"] > 0
In [14]:
pred = [is_positive(txt) for txt in df.review]
pd.DataFrame(pred).to_csv('naive_pred.csv', index=False)
In [15]:
def accuracy(pred, act):
    return sum([pred[i] == act[i] for i in range(len(act))])/len(act)
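For larger datasets, the same computation can be written as a single vectorized pandas expression; a small standalone sketch with made-up labels (named `toy_pred`/`toy_act` to avoid clashing with the notebook's variables):

```python
import pandas as pd

# Accuracy as the mean of an elementwise equality check,
# assuming the two sequences are aligned and equal-length.
toy_pred = pd.Series([True, False, True, True])
toy_act = pd.Series([True, True, True, False])
print((toy_pred == toy_act).mean())  # fraction of matching labels
```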
In [17]:
act = df.rating > 3

accuracy(pred, act)
Out[17]:
0.7284775347484107

It turns out that this naive approach has some ability to distinguish positive from negative reviews but does not perform well. We therefore turn to scikit-learn classifiers with the goal of finding a more desirable model.
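As a rough preview of that direction (toy reviews and labels, not the project's final model), a minimal scikit-learn pipeline might look like the following sketch:

```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Made-up reviews standing in for df.review; True = positive label.
toy_reviews = ["great tour, loved every minute",
               "awful experience, total waste of money",
               "fantastic views and friendly staff",
               "terrible, would not recommend"]
toy_labels = [True, False, True, False]

# Bag-of-words features followed by a logistic-regression classifier.
model = make_pipeline(CountVectorizer(), LogisticRegression())
model.fit(toy_reviews, toy_labels)
print(model.predict(["what a fantastic experience"]))
```

Unlike VADER, such a model is trained on our own labels, so it can adapt to the vocabulary of these reviews.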

Deeper Insight from the VADER¶

Before we move on to scikit-learn classifiers, let us take a closer look at which words in a review lead VADER to classify it as positive or negative.

In [ ]:
positive_review = [review for review in df.review if is_positive(review)]
negative_review = [review for review in df.review if not is_positive(review)]
positive_words = [word.lower() for review in positive_review for word in review.split()]
negative_words = [word.lower() for review in negative_review for word in review.split()]

Since many words are present in both the positive and negative sets, we remove the words common to both from the frequency distributions.

In [ ]:
positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

Positive Words from the VADER¶

In [ ]:
plt.figure(figsize=(12,8))
positive_fd.plot(15)
plt.show()
In [ ]:
positive_fd
Out[ ]:
FreqDist({'fantastic': 143, 'perfect': 114, 'free.': 76, 'it!': 71, 'fabulous': 55, 'popular': 53, 'tasting': 49, 'grounds': 48, 'cute': 48, 'exploring': 48, ...})

From the first few words, we see many commendatory terms. It makes sense that VADER classifies a review as positive based on them.

Negative Words from the VADER¶

In [ ]:
plt.figure(figsize=(12,8))
negative_fd.plot(15)
plt.show()
In [ ]:
negative_fd
Out[ ]:
FreqDist({'nicht': 36, 'cada': 24, 'pela': 24, 'ich': 22, 'nur': 21, 'sind': 20, '2006': 19, 'centro': 18, '2010': 17, 'dove': 16, ...})

Some of these words are truly derogatory, but not all of them, and there is no clear pattern like the one for positive reviews. In fact, many of the most frequent "negative" words (e.g., 'nicht', 'ich', 'cada') are German or Portuguese function words, which suggests that reviews written in languages outside VADER's English lexicon receive a compound score of zero and therefore fall into the negative bucket under our threshold. We should also point out that our dataset contains far fewer negative reviews than positive ones; this imbalance may make classification more difficult.
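To quantify that imbalance, we can check the label proportions directly. A small sketch using made-up ratings in place of `df.rating`:

```python
import pandas as pd

# Hypothetical ratings standing in for df.rating; with the real data,
# replace `ratings` by df.rating.
ratings = pd.Series([5.0, 5.0, 5.0, 1.0, 4.0, 5.0, 4.0, 2.0])
labels = ratings > 3  # same positive/negative rule as before

# Proportion of each class; a strongly skewed split signals imbalance.
print(labels.value_counts(normalize=True))
```

A heavily skewed split like this one is worth keeping in mind when we evaluate classifiers, since plain accuracy can look good even for a model that mostly predicts the majority class.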