-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieve_reviews_Yelp.py
63 lines (49 loc) · 2.01 KB
/
retrieve_reviews_Yelp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
## Extract reviews by scraping Yelp restaurant pages (HTML fetched with requests, parsed with BeautifulSoup)
import io, time, json
import requests
from bs4 import BeautifulSoup
def retrieve_html(url, timeout=10):
    """
    Fetch a URL with an HTTP GET request.

    Args:
        url (string): address of the page to download
        timeout (float): seconds to wait for the server before giving up;
            passed straight through to requests.get

    Returns:
        tuple(int, string): the response's status code and its body text
    """
    # Without an explicit timeout, requests waits indefinitely on a server
    # that never answers, which would stall the whole scrape.
    response = requests.get(url, timeout=timeout)
    return (response.status_code, response.text)
def parse_page(html):
    """
    Extract the reviews and the next-page link from one Yelp restaurant page.

    Args:
        html (string): HTML source of a Yelp restaurant review page

    Returns:
        tuple(list, string): a pair (reviews, next_url)
            reviews: one dictionary per <div itemprop="review"> element, with
                the keys 'author', 'rating' (converted to float), 'date' and
                'description'
            next_url: href of the page's <link rel="next"> element, or None
                when the page has no such element (i.e. it is the last page)
    """
    page = BeautifulSoup(html, 'html.parser')

    # Pagination is advertised through a <link rel="next"> element.
    next_link = page.find('link', rel='next')
    url_next = next_link.get('href') if next_link else None

    def meta_content(node, prop):
        # Author, rating and date are stored in the "content" attribute of
        # <meta itemprop=...> tags inside each review element.
        return node.find('meta', itemprop=prop).get("content")

    # HINT: print the review elements to see which HTML tags hold each field.
    reviews_list = [
        {
            'author': meta_content(entry, 'author'),
            'rating': float(meta_content(entry, 'ratingValue')),
            'date': meta_content(entry, 'datePublished'),
            'description': entry.find('p', itemprop='description').getText(),
        }
        for entry in page.find_all('div', itemprop="review")
    ]
    return reviews_list, url_next
def extract_reviews(url, html_fetcher, delay=0.5):
    """
    Collect the reviews from every page of a listing by following "next" links.

    Args:
        url (string): URL of the first review page to fetch
        html_fetcher (callable): called as html_fetcher(url); must return a
            (status_code, html) tuple
        delay (float): seconds to sleep between consecutive page fetches
            (default 0.5, the value that used to be hard-coded)

    Returns:
        list: the review dictionaries produced by parse_page for every
            visited page, in the order the pages were visited
    """
    import warnings

    collected = []
    visited = set()
    page_url = url
    while True:
        visited.add(page_url)
        status, html = html_fetcher(page_url)
        if status != 200:
            # The page is still parsed (as before), but a failed request is
            # no longer silent: the result is probably incomplete.
            warnings.warn(
                "fetching %s returned status %s; reviews may be incomplete"
                % (page_url, status)
            )
        page_reviews, page_url = parse_page(html)
        # extend() appends in place instead of copying the whole list per page.
        collected.extend(page_reviews)
        # Stop on the last page, or if a "next" link points back to a page
        # that was already visited (which would otherwise loop forever).
        if not page_url or page_url in visited:
            break
        # Pause between requests to avoid hammering the server.
        time.sleep(delay)
    return collected
# Example run: crawl every review page reachable from the URL below.
data = extract_reviews(
    'https://www.yelp.com/biz/the-jibarito-stop-chicago-2?start=220',
    html_fetcher=retrieve_html,
)
# Number of reviews collected; the original author recorded 40 for this URL.
print(len(data))
# First review. Sample output recorded by the original author (the rating is
# a float because parse_page converts it):
# {'author': 'Betsy F.', 'rating': 5.0, 'date': '2016-10-01', 'description': "Authentic, incredible ... " }
print(data[0])