forked from rohansadale/YelpWebScraping
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeDate.py
47 lines (37 loc) · 1.24 KB
/
scrapeDate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import sys
from bs4 import BeautifulSoup
import requests
from pprint import pprint
import time
import argparse
# Python 2-only idiom: reload the sys module and then switch the
# interpreter-wide default string encoding to UTF-8.
# NOTE(review): presumably needed so non-ASCII text scraped from the pages can
# be concatenated with byte strings and written to the output file without an
# explicit encode step -- confirm before porting to Python 3, where neither
# call exists.
reload(sys)
sys.setdefaultencoding("utf-8")
def scrapeDate(restaurantIDs, timeout=30):
    """Collect the publication date of the first listed review per business.

    For every ID the business page is requested with ``sort_by=date_asc`` and
    the ``datePublished`` meta tag of the first ``review-content`` block is
    read.

    Args:
        restaurantIDs: iterable of Yelp business ID strings. Blank or
            whitespace-only entries are skipped.
        timeout: seconds to wait for each HTTP request before requests raises
            a timeout error, so a stalled connection cannot hang the run.

    Returns:
        A list of ``'<restaurantID>|<date>'`` strings in input order. The date
        is ``'99999'`` when the page has no review block or the block carries
        no usable ``datePublished`` meta tag.

    Raises:
        requests.exceptions.RequestException: if a page cannot be fetched.
    """
    noDate = '99999'  # sentinel written when no review date can be found
    reviewStartDates = []
    fetched = 0
    for restaurantID in restaurantIDs:
        # A blank line in the ID file is not a business; requesting the bare
        # /biz/ URL would only produce a meaningless '|<date>' row.
        if not restaurantID.strip():
            continue

        url = 'http://www.yelp.com/biz/' + restaurantID + '?sort_by=date_asc'
        html = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(html, 'html.parser')

        # Only the first review matters, so stop at the first match instead of
        # collecting every review block on the page.
        firstReview = soup.find('div', {'class': 'review-content'})
        dateTag = None
        if firstReview is not None:
            dateTag = firstReview.find('meta', itemprop='datePublished')

        # The meta tag can be absent even when a review block exists; fall
        # back to the sentinel instead of crashing and losing earlier results.
        if dateTag is not None and dateTag.has_attr('content'):
            date = dateTag['content']
        else:
            date = noDate

        reviewStartDates.append(restaurantID + '|' + date)
        fetched += 1
        print(restaurantID)  # progress output; valid on Python 2 and 3

        # Pause after every 20 requests to avoid hammering the site.
        if fetched % 20 == 0:
            time.sleep(4)
    return reviewStartDates
if __name__ == '__main__':
    # Script entry point: read business IDs (one per line) from the file given
    # with -f/--fileName, scrape each first-review date, and write the
    # '<id>|<date>' rows to output/firstReviewDate.csv.
    import os  # local import: only the script entry point needs it

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--fileName', dest='fileName', type=str,
                        help='input file with restaurantIDs', required=True)
    inputValues = parser.parse_args()

    outputPath = 'output/firstReviewDate.csv'
    outputDir = os.path.dirname(outputPath)
    # Create the output directory before scraping: discovering it is missing
    # only after the whole (slow) scrape would throw away every result.
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    with open(inputValues.fileName) as f:
        restaurantIDs = f.read().splitlines()

    reviewStartDates = scrapeDate(restaurantIDs)

    with open(outputPath, 'wb') as f:
        for item in reviewStartDates:
            f.write(item + '\n')