Data Sources¶

We have three data sources for our sentiment analysis of reviews: 1) TripAdvisor, 2) Yelp, and 3) Google Maps. Since TripAdvisor and Google Reviews do not offer an affordable, documented API, we utilize web scraping to obtain review-related data. Our reviews are mainly about the attractions in San Francisco, CA.

TripAdvisor¶

We use selenium to do web scraping on TripAdvisor for reviews in San Francisco, CA.

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import os
import time
from pprint import pprint
In [ ]:
# Launch Chrome with a local chromedriver binary; the driver is shared as a
# module-level global by scrape_review() and the scraping loop below.
# NOTE(review): Selenium 4 deprecates passing the driver path positionally —
# prefer webdriver.Chrome(service=Service(path)); confirm installed version.
driver = webdriver.Chrome("/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/chromedriver")
In [ ]:
def scrape_review(name):
    """Scrape up to 25 pages of TripAdvisor reviews for the attraction
    currently loaded in the module-global Selenium ``driver``.

    Parameters
    ----------
    name : str
        Attraction name, repeated into the 'attraction' column.

    Returns
    -------
    pandas.DataFrame
        Columns: attraction, username, city, country, contribution,
        title, month, year, review, rating.
    """
    username, city, country, contribution = [], [], [], []
    title, month, year, review, ratings = [], [], [], [], []
    # Reviewer byline: "<city>, <country><n> contributions" (hoisted out of the loop).
    byline_pattern = re.compile(r'([\w\s]+),\s*([\w\s]+)\s*(\d+)\s*contributions')

    for _page in range(1, 26):
        review_chuncks = driver.find_element(By.CLASS_NAME, 'LbPSX')

        # Individual reviews are separated by this disclaimer text.
        chunck_list = review_chuncks.text.split('\nThis review is the subjective opinion of a Tripadvisor member and not of Tripadvisor LLC. Tripadvisor performs checks on reviews.\n')
        chunck_list.pop(-1)  # trailing fragment after the last disclaimer

        for chunck in chunck_list:
            try:
                review_element = chunck.split('\n')

                # Parse EVERY field before appending anything, so a short or
                # malformed chunk (IndexError) cannot leave the column lists
                # misaligned — the original appended as it went and could
                # desynchronize the lists on a mid-chunk failure.
                user_val = review_element[0].strip()

                mix_string = review_element[1]
                match = byline_pattern.match(mix_string)
                if match:
                    city_val = match.group(1)
                    country_val = match.group(2)
                    contribution_val = match.group(3)
                else:
                    # No "<city>, <country>" part — only a contribution count.
                    city_val = 'None'
                    country_val = 'None'
                    contribution_val = mix_string.replace('contributions', '').strip()

                title_val = review_element[3].strip()

                # e.g. "Mar 2023 • Family" -> ("Mar", "2023")
                date_tokens = review_element[4].split(' • ')[0].split()
                if len(date_tokens) >= 2:
                    month_val, year_val = date_tokens[0], date_tokens[1]
                else:
                    month_val, year_val = 'None', 'None'

                review_val = review_element[5].strip()
            except IndexError:
                continue  # skip malformed chunk entirely

            username.append(user_val)
            city.append(city_val)
            country.append(country_val)
            contribution.append(contribution_val)
            title.append(title_val)
            month.append(month_val)
            year.append(year_val)
            review.append(review_val)

        # Star ratings live in separate elements; aria-label looks like "5.0 of 5 bubbles".
        for rating_element in review_chuncks.find_elements(By.CLASS_NAME, 'UctUV.d.H0'):
            rating_string = rating_element.get_attribute("aria-label")
            if rating_string:  # guard: a missing aria-label returns None
                ratings.append(rating_string.split()[0])

        # Advance to the next page of reviews and give it time to load.
        driver.find_element(By.XPATH, '//*[@id="tab-data-qa-reviews-0"]/div/div[5]/div/div[11]/div[1]/div/div[1]/div[2]/div/a').click()
        time.sleep(3)

    # NOTE(review): ratings are collected per page rather than per parsed
    # chunk, so len(ratings) can differ from len(username) if any chunk was
    # skipped; pd.DataFrame then raises — same caveat as the original code.
    review_dict = {'attraction': [name] * len(username),
                   'username': username,
                   'city': city,
                   'country': country,
                   'contribution': contribution,
                   'title': title,
                   'month': month,
                   'year': year,
                   'review': review,
                   'rating': ratings}
    return pd.DataFrame(review_dict)
In [ ]:
# Drive the browser through every attraction URL in SF_places.csv, switch the
# review language filter to "All languages", and collect reviews per place.
place = pd.read_csv('SF_places.csv')
place_rating = []  # overall attraction rating text, one entry per place
all_reviews = pd.DataFrame(columns=['attraction', 'username', 'city', 'country', 
                           'contribution', 'title', 'month', 'year', 'review', 'rating'])
for i in range(place.shape[0]): #place.shape[0]
    try:
        url = place['url'][i]
        driver.get(url)
        driver.implicitly_wait(5)
        # Scroll down so the reviews tab is rendered before interacting.
        driver.execute_script("window.scrollBy(0, 2000);")
        time.sleep(10)

        # Open the language dropdown and select "All languages" so non-English
        # reviews are included in the scrape.
        language_button = '//*[@id="tab-data-qa-reviews-0"]/div/div[1]/div/div/div[2]/div/div/div[2]/div/div/div/button'
        all_language_button = '//*[@id="menu-item-all"]'
        driver.find_element(By.XPATH, language_button).click()
        time.sleep(5)
        driver.find_element(By.XPATH, all_language_button).click()
        time.sleep(5)

        # Attraction-level aggregate rating shown in the page header.
        place_rating.append(driver.find_element(By.CLASS_NAME, 'biGQs._P.fiohW.hzzSG.uuBRH').text)
        
        name = place['name'][i]
        reviews = scrape_review(name=name)
        
        all_reviews = pd.concat([all_reviews, reviews])
    
    except Exception as e:
        # Best-effort scraping: log the failing row and move on rather than
        # aborting the whole run on one broken page/XPath.
        print(f"An error occurred while processing row {i}: {e}")
        continue
In [3]:
all_reviews.head(5)
Out[3]:
attraction username city country contribution title month year review rating
0 Alcatraz Island Amber N W None None 5 FUN FOR ALL AGES Mar 2023 My family took the tour ( BUY TICKETS IN ADVAN... 5.0
1 Alcatraz Island Gord P None None 16 great tour Feb 2023 This is a must stop if you are in San Fran!!! ... 5.0
2 Alcatraz Island Jess None None 4 Must See Mar 2023 I did not expect to enjoy the tour as much as ... 5.0
3 Alcatraz Island Christine J None None 13 Unsafe area DO NOT GO! Mar 2023 San Francisco is completely unsafe. We bought ... 1.0
4 Alcatraz Island Yankeehoya Doha Qatar25 8 No Additional Tour Guide Needed Mar 2023 I had a 13-hour layover in San Francisco And I... 4.0

Yelp¶

Yelp provides a documented API here: https://docs.developer.yelp.com/docs/fusion-intro. We obtain two dataframes for San Francisco, CA from it: the businesses around the area and the reviews for each of those businesses.

In [ ]:
import pandas as pd
import time
import requests
import json
In [ ]:
# Yelp Fusion API limits: at most 3 reviews per business, at most 50
# businesses per search, and review text is truncated to a snippet.
def get_yelp(key, loc = "San Francisco, CA", t_sleep = 0.1, n_business = 50, n_review = 3):
    """Fetch businesses and their newest reviews from the Yelp Fusion API.

    Parameters
    ----------
    key : str
        Yelp Fusion API key (sent as a Bearer token).
    loc : str, optional
        Search location passed to the business-search endpoint.
    t_sleep : float, optional
        Delay in seconds between per-business review requests (rate limiting).
    n_business : int, optional
        Number of businesses to fetch (API max: 50).
    n_review : int, optional
        Reviews per business (API max: 3).

    Returns
    -------
    list
        ``[businesses_pd, reviews_pd]`` — one DataFrame of businesses and one
        of all their reviews, with a 'business_id' column linking the two.

    Raises
    ------
    requests.HTTPError
        If either endpoint returns an error status (bad key, quota, ...).
    """
    headers = {"Authorization": "Bearer %s" % key}
    url = "https://api.yelp.com/v3/businesses/search"
    params={"limit": n_business, "location": loc}
    response = requests.get(url, headers=headers, params=params)
    response.raise_for_status()  # fail fast instead of a confusing KeyError
    # The API does not return businesses without any reviews.
    businesses = response.json()["businesses"]
    businesses_pd = pd.DataFrame(businesses)

    appended_data = []
    for business in businesses:
        review_url = "https://api.yelp.com/v3/businesses/" + business['id'] + '/reviews'
        review_params = {"limit": n_review, "sort_by": "newest"}
        time.sleep(t_sleep)  # throttle so we stay under the API rate limit
        review_response = requests.get(review_url, headers=headers, params=review_params)
        review_response.raise_for_status()
        reviews_pd = pd.DataFrame(review_response.json()["reviews"])
        reviews_pd['business_id'] = business['id']
        appended_data.append(reviews_pd)
    return [businesses_pd, pd.concat(appended_data, ignore_index=True)]
In [ ]:
api_key = "scWgvjtjmz1UMUb9LD1q6C8qDEZkOrNjv6ZVrf9jFU4GurLk9QlA8CC3-Ac1GWEAUEvG7weRAOp-Uo1ay-kMtOPLsM7UFlY4FDlpurtYwrVPNen-j9WMsjHw7o4ZZHYx"
businesses, reviews = get_yelp(key = api_key)
In [5]:
businesses.head()
Out[5]:
id alias name image_url is_closed url review_count categories rating coordinates transactions price location phone display_phone distance
0 wGl_DyNxSv8KUtYgiuLhmA bi-rite-creamery-san-francisco Bi-Rite Creamery https://s3-media3.fl.yelpcdn.com/bphoto/ZFLw9l... False https://www.yelp.com/biz/bi-rite-creamery-san-... 10031 [{'alias': 'icecream', 'title': 'Ice Cream & F... 4.5 {'latitude': 37.761591, 'longitude': -122.425717} ['delivery'] $$ {'address1': '3692 18th St', 'address2': None,... 1.415627e+10 (415) 626-5600 946.386739
1 lJAGnYzku5zSaLnQ_T6_GQ brendas-french-soul-food-san-francisco-6 Brenda's French Soul Food https://s3-media4.fl.yelpcdn.com/bphoto/VJ865E... False https://www.yelp.com/biz/brendas-french-soul-f... 11992 [{'alias': 'breakfast_brunch', 'title': 'Break... 4.0 {'latitude': 37.78291531984934, 'longitude': -... ['delivery'] $$ {'address1': '652 Polk St', 'address2': '', 'a... 1.415346e+10 (415) 345-8100 2893.406622
2 WavvLdfdP6g8aZTtbBQHTw gary-danko-san-francisco Gary Danko https://s3-media3.fl.yelpcdn.com/bphoto/eyYUz3... False https://www.yelp.com/biz/gary-danko-san-franci... 5828 [{'alias': 'newamerican', 'title': 'American (... 4.5 {'latitude': 37.80587, 'longitude': -122.42058} [] $$$$ {'address1': '800 N Point St', 'address2': '',... 1.415749e+10 (415) 749-2060 5191.341803
3 76smcUUGRvq3k1MVPUXbnA mitchells-ice-cream-san-francisco Mitchells Ice Cream https://s3-media2.fl.yelpcdn.com/bphoto/f4lzrs... False https://www.yelp.com/biz/mitchells-ice-cream-s... 4690 [{'alias': 'icecream', 'title': 'Ice Cream & F... 4.5 {'latitude': 37.744221, 'longitude': -122.422791} ['pickup', 'delivery'] $ {'address1': '688 San Jose Ave', 'address2': '... 1.415648e+10 (415) 648-2300 2209.260424
4 ri7UUYmx21AgSpRsf4-9QA tartine-bakery-san-francisco-3 Tartine Bakery https://s3-media4.fl.yelpcdn.com/bphoto/QRbC0T... False https://www.yelp.com/biz/tartine-bakery-san-fr... 8715 [{'alias': 'bakeries', 'title': 'Bakeries'}, {... 4.0 {'latitude': 37.76131, 'longitude': -122.42431} ['delivery'] $$ {'address1': '600 Guerrero St', 'address2': ''... 1.415487e+10 (415) 487-2600 1087.638933
In [6]:
reviews.head(5)
Out[6]:
id url text rating time_created user business_id
0 XeBZmAzdhswpWiQYe6sTcw https://www.yelp.com/biz/bi-rite-creamery-san-... Got the Earl Gray + Pina colada and it was SO ... 5 2023-03-12 11:26:29 {'id': 'J_iJFYQp5y8fwcz-BdUEDQ', 'profile_url'... wGl_DyNxSv8KUtYgiuLhmA
1 FuF8jvLV0olpgWbNQ2jHZg https://www.yelp.com/biz/bi-rite-creamery-san-... My review is not on the ice cream. The ice cre... 1 2023-03-19 16:50:01 {'id': 'T_yEM-V-vmbODnGqUGi--g', 'profile_url'... wGl_DyNxSv8KUtYgiuLhmA
2 ruuEFbw8S0wQKZ9UN0-8tg https://www.yelp.com/biz/bi-rite-creamery-san-... 2/25/2023 - A delicious end to our date night ... 5 2023-02-26 01:34:44 {'id': '84oPkNdCcisrtOmYK_ACwA', 'profile_url'... wGl_DyNxSv8KUtYgiuLhmA
3 eBDHqdq65uP6JAMs24q3tQ https://www.yelp.com/biz/brendas-french-soul-f... Many French toast shops do not handle it prope... 5 2023-03-03 22:55:42 {'id': '5TpUy6HRIDhH3JvQlm8LBA', 'profile_url'... lJAGnYzku5zSaLnQ_T6_GQ
4 IXYJkV13UIjG21CMMMy8kQ https://www.yelp.com/biz/brendas-french-soul-f... First and foremost, let's talk about Brenda's ... 5 2023-03-09 18:43:35 {'id': 'qizLZcjOtLwk9v-pjMn2sg', 'profile_url'... lJAGnYzku5zSaLnQ_T6_GQ

Google Maps¶

We obtain the reviews on Google Maps in San Francisco, CA. These data were collected through other web widgets; we read them into our working environment and make some adjustments for our analysis.

In [ ]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
In [ ]:
place = pd.read_csv('SF_places.csv')
# IPython "!" shell magic (notebook-only syntax): collect the file names of
# the scraped Google-review CSVs into a list.
csv_name = !ls new_google_review
In [ ]:
def load_data(filename, base_dir='/Users/alliewu/Desktop/DataScience_Projects/SF_Top_Attractions/new_google_review'):
    """Load one scraped Google-review CSV and tag its rows with the attraction.

    The attraction name is derived from the file name: drop the prefix up to
    the first underscore, drop the extension, and replace underscores with
    spaces ('google_Adventure_Playground.csv' -> 'Adventure Playground').

    Parameters
    ----------
    filename : str
        CSV file name inside ``base_dir``, e.g. 'google_Adventure_Playground.csv'.
    base_dir : str, optional
        Directory containing the CSVs; defaults to the original hard-coded
        project path so existing calls behave unchanged.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus an 'attraction' column.
    """
    # BUG FIX: the read path previously ended in a garbled literal instead of
    # interpolating the file name, so every call loaded the same (broken) path.
    data = pd.read_csv(os.path.join(base_dir, filename))

    name = filename.split('_', 1)[1]   # remove the prefix before the underscore
    name = name.rsplit('.', 1)[0]      # remove the suffix after the dot
    name = name.replace('_', ' ')      # replace underscores with spaces
    data['attraction'] = name          # broadcast onto every row
    return data
In [ ]:
# Seed the combined frame with the first attraction's reviews, then
# outer-merge every remaining CSV on top of it.
data = load_data('google_Adventure_Playground.csv')
for fname in csv_name[1:]:
    data = pd.merge(data, load_data(filename=fname), how='outer')
In [ ]:
# Map the scraper's obfuscated CSS-class column names to readable ones.
column_map = {
    'd4r55': 'username',
    'RfnDt 2': 'contributions',
    'rsqaWe': 'time',
    'wiI7pd': 'review',
    'kyuRq 2': 'language',
}
data = data.rename(columns=column_map)
In [ ]:
# --- Derive a 1-5 star rating from the five star-icon image columns. ---
# The first star is always filled; each remaining column holds the icon URL,
# which maps to 1 (filled star) or 0 (empty star).
star_map = {'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_14.png': 1,
            'https://maps.gstatic.com/consumer/images/icons/2x/ic_star_rate_empty_14.png': 0}
star_cols = ['hCCjke src', 'hCCjke src 2', 'hCCjke src 3', 'hCCjke src 4', 'hCCjke src 5']

data['hCCjke src'] = [1]*data.shape[0]
# One loop replaces the four copy-pasted .replace() blocks of the original.
for col in star_cols[1:]:
    data[col] = data[col].replace(star_map)

# Vectorized row sum replaces the slow row-wise apply(); skipna=False keeps
# NaN when any star column is missing, matching the original's `+` semantics.
data['rating'] = data[star_cols].sum(axis=1, skipna=False)

# A missing language tag means the review was already in English.
data['language'] = data['language'].replace(np.nan, 'English')
# Normalize contribution counts: '· 117 reviews' -> '117'.
data['contributions'] = [str(c).replace(' reviews', '').strip() for c in data['contributions']]
data['contributions'] = [c.replace('· ', '').strip() for c in data['contributions']]
In [ ]:
def convert_date(date_str):
    """Convert a relative Google timestamp (e.g. '3 weeks ago') to 'Mon YYYY'.

    Strings without a leading number ('a year ago') count as one unit.
    Months and years are approximated as 30 and 365 days respectively.
    Non-strings, strings not ending in 'ago', and unrecognized units all
    yield ``np.nan``.
    """
    if isinstance(date_str, str) and date_str.endswith('ago'):
        try:
            num = int(date_str.split()[0])
        except ValueError:  # e.g. 'a week ago' — narrow catch, not bare except
            num = 1
        if 'day' in date_str:
            delta = timedelta(days=num)
        elif 'week' in date_str:
            delta = timedelta(weeks=num)
        elif 'month' in date_str:
            delta = timedelta(days=num * 30)   # approximate month length
        elif 'year' in date_str:
            delta = timedelta(days=num * 365)  # approximate year length
        else:
            return np.nan
        return (datetime.today() - delta).strftime('%b %Y')
    return np.nan

# Normalize every relative timestamp (e.g. '3 weeks ago') to a 'Mon YYYY' string.
data['time'] = [convert_date(i) for i in data['time']]
In [ ]:
# Columns kept for analysis ('language' was considered but dropped).
keep_columns = ['attraction','username','contributions', 'time','review','rating']

# Take an explicit copy so the in-place dropna below operates on an
# independent frame rather than a view of `data` (avoids pandas'
# SettingWithCopyWarning / silently ineffective mutation).
google_reviews = data.loc[:, keep_columns].copy()
google_reviews.dropna(subset=['review'], inplace=True)
In [8]:
google_reviews.head(5)
Out[8]:
attraction username contributions time review rating
0 Adventure Playground jackie 117 Dec 2022 Wasn’t able to go in, however the structure is... 4.0
1 Adventure Playground Julia Gidwani 125 Feb 2023 Keep an eye on your kids and let them go wild ... 5.0
2 Adventure Playground Nathan Pierce 72 Feb 2023 When I was a kid we somehow had wood around an... 5.0
3 Adventure Playground Harry Jung 444 Jan 2023 Great playground for age 5+. Kids can be explo... 4.0
4 Adventure Playground V Tancredi 227 Dec 2022 Very creative and unique free play area at the... 5.0