We have three data sources for our sentiment analysis of reviews: 1) TripAdvisor, 2) Yelp, and 3) Google Maps. Since TripAdvisor and Google Maps do not offer an affordable documented API, we use web scraping to obtain review data from them. Our reviews cover attractions in San Francisco, CA.
We use Selenium to scrape TripAdvisor reviews of attractions in San Francisco, CA.
import os
import re
import time
from pprint import pprint

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# Selenium 4 removed the positional executable-path argument from
# webdriver.Chrome(); the driver path must now be wrapped in a Service object.
driver = webdriver.Chrome(service=Service("/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/chromedriver"))
def scrape_review(name):
    """Scrape up to 25 pages of TripAdvisor reviews from the page currently
    loaded in the module-global Selenium ``driver``.

    Parameters
    ----------
    name : str
        Attraction name, repeated into the ``attraction`` column.

    Returns
    -------
    pandas.DataFrame
        One row per review with columns: attraction, username, city, country,
        contribution, title, month, year, review, rating.
    """
    username, city, country, contribution = [], [], [], []
    title, month, year, review_texts, ratings = [], [], [], [], []
    # Matches "City, Country123 contributions" blobs; compiled once, not per chunk.
    location_re = re.compile(r'([\w\s]+),\s*([\w\s]+)\s*(\d+)\s*contributions')
    for _page in range(1, 26):
        review_container = driver.find_element(By.CLASS_NAME, 'LbPSX')
        # Each review ends with the same Tripadvisor disclaimer; split on it.
        chunk_list = review_container.text.split('\nThis review is the subjective opinion of a Tripadvisor member and not of Tripadvisor LLC. Tripadvisor performs checks on reviews.\n')
        chunk_list.pop(-1)  # trailing fragment after the last disclaimer is not a review
        for chunk in chunk_list:
            fields = chunk.split('\n')
            try:
                # Parse every field BEFORE appending so a short/malformed chunk
                # cannot leave the parallel lists with mismatched lengths
                # (the original appended as it went, so an IndexError midway
                # desynchronized the columns and broke the DataFrame build).
                user = fields[0].strip()
                mix_string = fields[1]
                match = location_re.match(mix_string)
                if match:
                    row_city, row_country, row_contrib = match.groups()
                else:
                    # No "City, Country" prefix: only a contribution count.
                    row_city, row_country = 'None', 'None'
                    row_contrib = mix_string.replace('contributions', '').strip()
                row_title = fields[3].strip()
                # e.g. "Mar 2023 • Family" -> keep only the date part.
                date_words = fields[4].split(' • ')[0].split()
                if len(date_words) >= 2:
                    row_month, row_year = date_words[0], date_words[1]
                else:
                    row_month, row_year = 'None', 'None'
                row_review = fields[5].strip()
            except IndexError:
                continue  # malformed chunk: skip without appending anything
            username.append(user)
            city.append(row_city)
            country.append(row_country)
            contribution.append(row_contrib)
            title.append(row_title)
            month.append(row_month)
            year.append(row_year)
            review_texts.append(row_review)
        # Ratings come from the bubble icons' aria-label, e.g. "5.0 of 5 bubbles".
        for bubble in review_container.find_elements(By.CLASS_NAME, 'UctUV.d.H0'):
            rating_string = bubble.get_attribute("aria-label")
            if rating_string:
                ratings.append(rating_string.split()[0])
        # Advance to the next page of reviews.
        driver.find_element(By.XPATH, '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/div[11]/div[1]/div/div[1]/div[2]/div/a').click()
        time.sleep(3)
    review_dict = {'attraction': [name] * len(username),
                   'username': username,
                   'city': city,
                   'country': country,
                   'contribution': contribution,
                   'title': title,
                   'month': month,
                   'year': year,
                   'review': review_texts,
                   'rating': ratings}
    return pd.DataFrame(review_dict)
place = pd.read_csv('SF_places.csv')
place_rating = []
# Collect one frame per attraction and concatenate ONCE at the end:
# pd.concat inside the loop is quadratic and stacks duplicate indices.
review_frames = []
for i in range(place.shape[0]):
    try:
        driver.get(place['url'][i])
        driver.implicitly_wait(5)
        driver.execute_script("window.scrollBy(0, 2000);")
        time.sleep(10)
        # Switch the review-language filter to "All languages".
        language_button = '//*[@id="tab-data-qa-reviews-0"]/div/div[1]/div/div/div[2]/div/div/div[2]/div/div/div/button'
        all_language_button = '//*[@id="menu-item-all"]'
        driver.find_element(By.XPATH, language_button).click()
        time.sleep(5)
        driver.find_element(By.XPATH, all_language_button).click()
        time.sleep(5)
        # Overall attraction rating shown in the page header.
        place_rating.append(driver.find_element(By.CLASS_NAME, 'biGQs._P.fiohW.hzzSG.uuBRH').text)
        review_frames.append(scrape_review(name=place['name'][i]))
    except Exception as e:
        # Best-effort scraping: log the failure and move to the next attraction.
        print(f"An error occurred while processing row {i}: {e}")
        continue
if review_frames:
    all_reviews = pd.concat(review_frames, ignore_index=True)
else:
    all_reviews = pd.DataFrame(columns=['attraction', 'username', 'city', 'country',
                                        'contribution', 'title', 'month', 'year', 'review', 'rating'])
all_reviews.head(5)
attraction | username | city | country | contribution | title | month | year | review | rating | |
---|---|---|---|---|---|---|---|---|---|---|
0 | Alcatraz Island | Amber N W | None | None | 5 | FUN FOR ALL AGES | Mar | 2023 | My family took the tour ( BUY TICKETS IN ADVAN... | 5.0 |
1 | Alcatraz Island | Gord P | None | None | 16 | great tour | Feb | 2023 | This is a must stop if you are in San Fran!!! ... | 5.0 |
2 | Alcatraz Island | Jess | None | None | 4 | Must See | Mar | 2023 | I did not expect to enjoy the tour as much as ... | 5.0 |
3 | Alcatraz Island | Christine J | None | None | 13 | Unsafe area DO NOT GO! | Mar | 2023 | San Francisco is completely unsafe. We bought ... | 1.0 |
4 | Alcatraz Island | Yankeehoya | Doha | Qatar25 | 8 | No Additional Tour Guide Needed | Mar | 2023 | I had a 13-hour layover in San Francisco And I... | 4.0 |
Yelp provides a documented API (Yelp Fusion): https://docs.developer.yelp.com/docs/fusion-intro. From it we obtain two dataframes for San Francisco, CA: the businesses around there and the reviews for all of those businesses.
import pandas as pd
import time
import requests
import json
# Yelp Fusion limits: n_review <= 3 per business, n_business <= 50 per search,
# and review text is truncated to an excerpt.
def get_yelp(key, loc = "San Francisco, CA", t_sleep = 0.1, n_business = 50, n_review = 3):
    """Fetch businesses near *loc* and their newest reviews from Yelp Fusion.

    Parameters
    ----------
    key : str
        Yelp Fusion API key (sent as a Bearer token).
    loc : str
        Location string passed to the business search.
    t_sleep : float
        Delay between per-business review requests (simple rate limiting).
    n_business, n_review : int
        Result limits, capped by the API at 50 and 3 respectively.

    Returns
    -------
    list[pandas.DataFrame]
        ``[businesses_df, reviews_df]``; reviews carry a ``business_id`` column.
    """
    headers = {"Authorization": f"Bearer {key}"}
    search_url = "https://api.yelp.com/v3/businesses/search"
    response = requests.get(search_url, headers=headers,
                            params={"limit": n_business, "location": loc})
    response.raise_for_status()  # surface auth/quota problems instead of a KeyError
    # The API does not return businesses without any reviews.
    businesses = response.json()["businesses"]
    businesses_pd = pd.DataFrame(businesses)
    # Append to a plain list: the original pre-sized a [None]*n_business list,
    # which left None entries (and broke pd.concat) whenever the API returned
    # fewer than n_business results.
    review_frames = []
    for business in businesses:
        reviews_url = f"https://api.yelp.com/v3/businesses/{business['id']}/reviews"
        time.sleep(t_sleep)
        reviews_response = requests.get(reviews_url, headers=headers,
                                        params={"limit": n_review, "sort_by": "newest"})
        reviews_response.raise_for_status()
        reviews_pd = pd.DataFrame(reviews_response.json()["reviews"])
        reviews_pd['business_id'] = business['id']
        review_frames.append(reviews_pd)
    return [businesses_pd, pd.concat(review_frames, ignore_index=True)]
# NOTE(review): hard-coded API key committed to source — rotate this key and
# load it from an environment variable or an untracked config file instead.
api_key = "scWgvjtjmz1UMUb9LD1q6C8qDEZkOrNjv6ZVrf9jFU4GurLk9QlA8CC3-Ac1GWEAUEvG7weRAOp-Uo1ay-kMtOPLsM7UFlY4FDlpurtYwrVPNen-j9WMsjHw7o4ZZHYx"
businesses, reviews = get_yelp(key = api_key)
# Notebook display of the first business rows.
businesses.head()
id | alias | name | image_url | is_closed | url | review_count | categories | rating | coordinates | transactions | price | location | phone | display_phone | distance | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | wGl_DyNxSv8KUtYgiuLhmA | bi-rite-creamery-san-francisco | Bi-Rite Creamery | https://s3-media3.fl.yelpcdn.com/bphoto/ZFLw9l... | False | https://www.yelp.com/biz/bi-rite-creamery-san-... | 10031 | [{'alias': 'icecream', 'title': 'Ice Cream & F... | 4.5 | {'latitude': 37.761591, 'longitude': -122.425717} | ['delivery'] | $$ | {'address1': '3692 18th St', 'address2': None,... | 1.415627e+10 | (415) 626-5600 | 946.386739 |
1 | lJAGnYzku5zSaLnQ_T6_GQ | brendas-french-soul-food-san-francisco-6 | Brenda's French Soul Food | https://s3-media4.fl.yelpcdn.com/bphoto/VJ865E... | False | https://www.yelp.com/biz/brendas-french-soul-f... | 11992 | [{'alias': 'breakfast_brunch', 'title': 'Break... | 4.0 | {'latitude': 37.78291531984934, 'longitude': -... | ['delivery'] | $$ | {'address1': '652 Polk St', 'address2': '', 'a... | 1.415346e+10 | (415) 345-8100 | 2893.406622 |
2 | WavvLdfdP6g8aZTtbBQHTw | gary-danko-san-francisco | Gary Danko | https://s3-media3.fl.yelpcdn.com/bphoto/eyYUz3... | False | https://www.yelp.com/biz/gary-danko-san-franci... | 5828 | [{'alias': 'newamerican', 'title': 'American (... | 4.5 | {'latitude': 37.80587, 'longitude': -122.42058} | [] | $$$$ | {'address1': '800 N Point St', 'address2': '',... | 1.415749e+10 | (415) 749-2060 | 5191.341803 |
3 | 76smcUUGRvq3k1MVPUXbnA | mitchells-ice-cream-san-francisco | Mitchells Ice Cream | https://s3-media2.fl.yelpcdn.com/bphoto/f4lzrs... | False | https://www.yelp.com/biz/mitchells-ice-cream-s... | 4690 | [{'alias': 'icecream', 'title': 'Ice Cream & F... | 4.5 | {'latitude': 37.744221, 'longitude': -122.422791} | ['pickup', 'delivery'] | $ | {'address1': '688 San Jose Ave', 'address2': '... | 1.415648e+10 | (415) 648-2300 | 2209.260424 |
4 | ri7UUYmx21AgSpRsf4-9QA | tartine-bakery-san-francisco-3 | Tartine Bakery | https://s3-media4.fl.yelpcdn.com/bphoto/QRbC0T... | False | https://www.yelp.com/biz/tartine-bakery-san-fr... | 8715 | [{'alias': 'bakeries', 'title': 'Bakeries'}, {... | 4.0 | {'latitude': 37.76131, 'longitude': -122.42431} | ['delivery'] | $$ | {'address1': '600 Guerrero St', 'address2': ''... | 1.415487e+10 | (415) 487-2600 | 1087.638933 |
reviews.head(5)
id | url | text | rating | time_created | user | business_id | |
---|---|---|---|---|---|---|---|
0 | XeBZmAzdhswpWiQYe6sTcw | https://www.yelp.com/biz/bi-rite-creamery-san-... | Got the Earl Gray + Pina colada and it was SO ... | 5 | 2023-03-12 11:26:29 | {'id': 'J_iJFYQp5y8fwcz-BdUEDQ', 'profile_url'... | wGl_DyNxSv8KUtYgiuLhmA |
1 | FuF8jvLV0olpgWbNQ2jHZg | https://www.yelp.com/biz/bi-rite-creamery-san-... | My review is not on the ice cream. The ice cre... | 1 | 2023-03-19 16:50:01 | {'id': 'T_yEM-V-vmbODnGqUGi--g', 'profile_url'... | wGl_DyNxSv8KUtYgiuLhmA |
2 | ruuEFbw8S0wQKZ9UN0-8tg | https://www.yelp.com/biz/bi-rite-creamery-san-... | 2/25/2023 - A delicious end to our date night ... | 5 | 2023-02-26 01:34:44 | {'id': '84oPkNdCcisrtOmYK_ACwA', 'profile_url'... | wGl_DyNxSv8KUtYgiuLhmA |
3 | eBDHqdq65uP6JAMs24q3tQ | https://www.yelp.com/biz/brendas-french-soul-f... | Many French toast shops do not handle it prope... | 5 | 2023-03-03 22:55:42 | {'id': '5TpUy6HRIDhH3JvQlm8LBA', 'profile_url'... | lJAGnYzku5zSaLnQ_T6_GQ |
4 | IXYJkV13UIjG21CMMMy8kQ | https://www.yelp.com/biz/brendas-french-soul-f... | First and foremost, let's talk about Brenda's ... | 5 | 2023-03-09 18:43:35 | {'id': 'qizLZcjOtLwk9v-pjMn2sg', 'profile_url'... | lJAGnYzku5zSaLnQ_T6_GQ |
We obtain the Google Maps reviews for San Francisco, CA. These data were collected through other web widgets; here we read them into our working environment and make some adjustments for our analysis.
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
place = pd.read_csv('SF_places.csv')
# `!ls` is an IPython-only shell magic (a SyntaxError in plain Python);
# os.listdir is the portable equivalent. Sorted to mirror ls's ordering,
# and dotfiles excluded as ls excludes them by default.
csv_name = sorted(f for f in os.listdir('new_google_review') if not f.startswith('.'))
def load_data(filename, base_dir='/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/new_google_review'):
    """Load one widget-exported Google-review CSV and tag its attraction name.

    Bug fix: the original f-string contained a literal placeholder instead of
    interpolating *filename*, so it always targeted one non-existent path.
    The directory is now a parameter (defaulting to the original location) so
    the loader also works outside the author's machine.

    Parameters
    ----------
    filename : str
        CSV name shaped like 'google_Some_Attraction.csv'.
    base_dir : str, optional
        Directory holding the exported review CSVs.

    Returns
    -------
    pandas.DataFrame
        File contents plus an 'attraction' column derived from *filename*.
    """
    data = pd.read_csv(f'{base_dir}/{filename}')
    # 'google_Adventure_Playground.csv' -> 'Adventure Playground':
    # drop the source prefix, drop the extension, then de-underscore.
    attraction = filename.split('_', 1)[1].rsplit('.', 1)[0].replace('_', ' ')
    data['attraction'] = attraction
    return data
# Seed the frame with the first attraction's file, then fold in the rest.
data = load_data('google_Adventure_Playground.csv')
for i in range(1,len(csv_name)): #place.shape[0]
    filename = csv_name[i]
    #name = place['name'][i]
    reviews = load_data(filename=filename)
    # NOTE(review): an outer merge on all shared columns also drops exact
    # duplicate rows; pd.concat would be a plain append — confirm intended.
    data = pd.merge(data, reviews, how='outer')
# Map the widget's obfuscated Google Maps CSS class names to readable columns.
data = data.rename(columns={'d4r55': 'username',
                            'RfnDt 2': 'contributions',
                            'rsqaWe': 'time',
                            'wiI7pd': 'review',
                            'kyuRq 2': 'language'
                            })
# Star ratings arrive as five icon-URL columns: a filled-star image URL means
# the star is lit (1), the empty-star URL means unlit (0). The first star
# column is overwritten with a constant 1 — presumably every review has at
# least one star; TODO confirm against the widget export.
data['hCCjke src'] = [1]*data.shape[0]
data['hCCjke src 2'] = data['hCCjke src 2'].replace({'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
                                                     'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0})
data['hCCjke src 3'] = data['hCCjke src 3'].replace({'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
                                                     'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0})
data['hCCjke src 4'] = data['hCCjke src 4'].replace({'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
                                                     'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0})
data['hCCjke src 5'] = data['hCCjke src 5'].replace({'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
                                                     'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0})
# Numeric rating = count of lit stars across the five indicator columns.
data['rating'] = data.apply(lambda row: row['hCCjke src'] + row['hCCjke src 2'] + row['hCCjke src 3'] + row['hCCjke src 4']+ row['hCCjke src 5'], axis=1)
# Reviews without a detected language tag default to English.
data['language'] = data['language'].replace(np.nan, 'English')
# Strip the "· N reviews" decoration down to a bare contribution count.
data['contributions'] = [str(i).replace(' reviews','').strip() for i in data['contributions']]
data['contributions'] = [i.replace('· ','').strip() for i in data['contributions']]
def convert_date(date_str):
    """Convert a relative Google timestamp like '3 weeks ago' to 'Mon YYYY'.

    Strings such as 'a year ago' have no leading integer; the count defaults
    to 1. Months and years are approximated as 30 and 365 days. Anything that
    is not a string ending in 'ago' (or lacks a recognized unit) yields NaN.

    Parameters
    ----------
    date_str : Any
        Raw value from the scraped 'time' column.

    Returns
    -------
    str or float
        '%b %Y'-formatted month, or ``np.nan`` when unparseable.
    """
    if isinstance(date_str, str) and date_str.endswith('ago'):
        try:
            num = int(date_str.split()[0])
        except ValueError:  # original bare except also swallowed e.g. KeyboardInterrupt
            num = 1  # 'a day ago', 'a month ago', ...
        if 'day' in date_str:
            return (datetime.today() - timedelta(days=num)).strftime('%b %Y')
        if 'week' in date_str:
            return (datetime.today() - timedelta(weeks=num)).strftime('%b %Y')
        if 'month' in date_str:
            return (datetime.today() - timedelta(days=num*30)).strftime('%b %Y')
        if 'year' in date_str:
            return (datetime.today() - timedelta(days=num*365)).strftime('%b %Y')
    return np.nan
# Normalize relative timestamps to 'Mon YYYY'.
data['time'] = [convert_date(i) for i in data['time']]
keep_columns = ['attraction','username','contributions', 'time','review','rating'] #,'language'
# Keep only the specified columns; .copy() makes an explicit new frame so the
# in-place dropna below does not trigger pandas' SettingWithCopyWarning.
google_reviews = data.loc[:, keep_columns].copy()
# Rows without review text are useless for sentiment analysis.
google_reviews.dropna(subset=['review'], inplace=True)
google_reviews.head(5)
attraction | username | contributions | time | review | rating | |
---|---|---|---|---|---|---|
0 | Adventure Playground | jackie | 117 | Dec 2022 | Wasn’t able to go in, however the structure is... | 4.0 |
1 | Adventure Playground | Julia Gidwani | 125 | Feb 2023 | Keep an eye on your kids and let them go wild ... | 5.0 |
2 | Adventure Playground | Nathan Pierce | 72 | Feb 2023 | When I was a kid we somehow had wood around an... | 5.0 |
3 | Adventure Playground | Harry Jung | 444 | Jan 2023 | Great playground for age 5+. Kids can be explo... | 4.0 |
4 | Adventure Playground | V Tancredi | 227 | Dec 2022 | Very creative and unique free play area at the... | 5.0 |