-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbs_scrape.py
103 lines (76 loc) · 4.52 KB
/
bs_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
from models import connect_to_db, db, Reviews, Restaurants
from runserver import app
from datetime import datetime
import os, random, re, time, urllib2
from bs4 import BeautifulSoup
def grab_review_objects(search_url, restaurant_name):
"""
Grabs all reviews from given restaurant's yelp page.
"""
rest_time = random.randint(5000, 60000)/1000.0 # before scraping the page, script will rest for between 5 to 60 secs
print "rest time is", rest_time # print statements to see scraping progress
time.sleep(rest_time)
print "url is now", search_url
contents = urllib2.urlopen(search_url).read()
soup = BeautifulSoup(contents, "html.parser")
biz_id = soup.find('a', attrs={'class':'edit-category', 'href':True}).attrs['href'] # grab the yelp biz id
biz_id = biz_id.replace("/biz_attribute?biz_id=", "") # lop off the prefix
if not restaurant_info: # only need to grab this stuff the first time through review scraping
restaurant_info['biz_id'] = biz_id
name = (soup.find('h1', attrs={'itemprop':'name'}).get_text()).strip()
restaurant_info['name'] = name
restaurant_info['yelp_name'] = restaurant_name
restaurant_info["avg_rating"] = float(soup.find('meta', {'itemprop':'ratingValue', 'content':True}).attrs['content'])
restaurant_info["location"] = soup.find('span', attrs={'itemprop':'addressLocality'}).get_text() + ", " + soup.find('span', attrs={'itemprop':'addressRegion'}).get_text()
restaurant_info["stars"] = 3
load_restaurant_info_to_db(restaurant_info)
next = soup.find('a', attrs={'class':'next'}) # look for the "next" page link
review_objs_list = []
for review in soup.findAll('div', class_='review') :
review_objs_list.append(review)
review_objs_list = review_objs_list[1:] # first item is dummy
next_page_reviews = []
if next:
next_page_reviews, _ = grab_review_objects(next.attrs['href'], restaurant_name)
return review_objs_list + next_page_reviews, biz_id
def load_restaurant_info_to_db(restaurant_info):
biz_id = restaurant_info["biz_id"]
name = restaurant_info["name"]
yelp_name = restaurant_info["yelp_name"]
avg_rating = float(restaurant_info["avg_rating"])
location = restaurant_info["location"]
stars = int(restaurant_info["stars"])
temp_rest_obj = Restaurants(biz_id=biz_id, common_name=name, yelp_name=yelp_name, location=location, avg_yelp_rating=avg_rating, michelin_stars=stars)
db.session.add(temp_rest_obj)
db.session.commit()
print "restaurant info added!"
def load_reviews_to_db(restaurant):
search_url = search_base_url + restaurant + search_suffix
review_objs_list, biz_id = grab_review_objects(search_url, restaurant)
for review in review_objs_list:
review_id = review.attrs['data-review-id']
date = review.find('meta', attrs={'itemprop':'datePublished', 'content':True}).attrs['content']
date = datetime.strptime(date, "%Y-%m-%d")
rating = float(review.find('meta', {'itemprop':'ratingValue', 'content':True}).attrs['content'])
location = review.find('li', attrs={'class':'user-location'}).get_text().replace("\n","")
text = re.sub(ur'\u00a0','',review.find('p', attrs={'itemprop':'description'}).get_text(),re.UNICODE) # get rid of unicode
r_obj = Restaurants.query.filter_by(biz_id=biz_id).first()
r_name = r_obj.common_name
temp_review_obj = Reviews(review_id=review_id, restaurant_name=r_name, biz_id=biz_id, rating=rating, text=text, location=location, date=date)
db.session.add(temp_review_obj)
db.session.commit()
print "reviews added!"
def main(restaurant):
load_reviews_to_db(restaurant)
if __name__ == "__main__":
connect_to_db(app)
# needed to construct URLs for scraping
search_base_url = "https://www.yelp.com/biz/"
search_suffix = "?sort_by=date_desc"
starting_url = "https://www.yelp.com"
restaurant_info = {}
# the 3 star, US Michelin restaurants to gather data from yelp & dump to db
restaurant_list = ["jean-georges-new-york", "chefs-table-at-brooklyn-fare-brooklyn-3", "per-se-new-york", "masa-new-york", "alinea-chicago", "grace-chicago-3", "le-bernardin-new-york", "eleven-madison-park-new-york", "benu-san-francisco-4", "the-french-laundry-yountville-2", "the-restaurant-at-meadowood-st-helena", "manresa-los-gatos-2", "saison-san-francisco-2"]
for item in restaurant_list:
restaurant_info = {} #clear the restaurant info dict for each new restaurant scraped
main(item)