-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
53 lines (44 loc) · 1.75 KB
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from lxml import html
import requests
class AppCrawler:
    """Breadth-first crawler that follows the related-app links of app pages.

    Starting from ``starting_url``, every page is turned into an ``App`` and
    the links found on it are queued for the next depth level.

    Attributes:
        starting_url: URL of the first page to fetch.
        depth: Number of link levels to follow below the starting page.
        timeout: Seconds to wait for each HTTP request.
        current_depth: Number of levels fully processed so far.
        depth_links: ``depth_links[i]`` holds the links collected for level i.
        apps: Every ``App`` fetched so far, in fetch order.
    """

    def __init__(self, starting_url, depth, timeout=10):
        """Store the crawl configuration and initialise empty result lists.

        Args:
            starting_url: URL of the page the crawl starts from.
            depth: How many levels of links to follow (0 fetches only the
                starting page).
            timeout: Seconds passed to ``requests.get`` for every request, so
                an unresponsive server cannot block the crawl indefinitely.
        """
        self.starting_url = starting_url
        self.depth = depth
        self.timeout = timeout
        self.current_depth = 0
        self.depth_links = []
        self.apps = []

    def crawl(self):
        """Fetch the starting page, then follow links level by level.

        Results accumulate in ``self.apps`` and ``self.depth_links``. Each
        distinct link is fetched at most once per call: pages frequently link
        back to each other, and re-fetching them would duplicate entries in
        ``self.apps`` and multiply the number of requests at every level.
        """
        visited = set([self.starting_url])

        app = self.get_app_from_link(self.starting_url)
        self.apps.append(app)
        self.depth_links.append(app.links)

        while self.current_depth < self.depth:
            current_links = []
            for link in self.depth_links[self.current_depth]:
                if link in visited:
                    continue
                visited.add(link)
                current_app = self.get_app_from_link(link)
                current_links.extend(current_app.links)
                self.apps.append(current_app)
            self.current_depth += 1
            self.depth_links.append(current_links)

    def get_app_from_link(self, link):
        """Download ``link`` and build an ``App`` from the page contents.

        Args:
            link: URL of the page to fetch.

        Returns:
            An ``App`` holding the name, developer and price text found on
            the page, plus the list of link hrefs to follow next.

        Raises:
            IndexError: If the page has no name, developer or price element.
        """
        start_page = requests.get(link, timeout=self.timeout)
        tree = html.fromstring(start_page.text)

        name = self._first_match(tree, '//h1[@itemprop="name"]/text()', link)
        developer = self._first_match(
            tree, '//div[@class="left"]/h2/text()', link)
        price = self._first_match(
            tree, '//div[@itemprop="price"]/text()', link)
        links = tree.xpath(
            '//div[@class="center-stack"]//*/a[@class="name"]/@href')

        return App(name, developer, price, links)

    @staticmethod
    def _first_match(tree, xpath, link):
        """Return the first result of ``xpath`` on ``tree``.

        Raises IndexError (the same type a bare ``[0]`` would raise) with the
        XPath and page URL in the message, so a failure deep inside a crawl
        identifies the page and the field that was missing.
        """
        matches = tree.xpath(xpath)
        if not matches:
            raise IndexError("no match for %r on %s" % (xpath, link))
        return matches[0]
class App:
    """Plain record describing one crawled app page.

    Attributes:
        name: App name text.
        developer: Developer text.
        price: Price text.
        links: Hrefs found on the page, followed by the crawler.
    """

    def __init__(self, name, developer, price, links):
        """Store the given field values unchanged."""
        self.name = name
        self.developer = developer
        self.price = price
        self.links = links

    def __str__(self):
        """Return a three-line, CRLF-terminated summary of the app.

        The text is assembled as unicode and encoded to UTF-8 only where
        ``str`` is a byte string (Python 2). Concatenating ``str`` with the
        result of ``.encode()`` raises TypeError on Python 3, where
        ``__str__`` must return text.
        """
        # NOTE: the "Name" label has no colon, unlike the other two labels;
        # it is kept as-is so the emitted text stays unchanged.
        text = u"Name %s\r\nDeveloper: %s\r\nPrice: %s\r\n" % (
            self.name, self.developer, self.price)
        if str is bytes:
            return text.encode('UTF-8')
        return text
# Crawl one level of links starting from a single app page and print every
# app that was fetched. These statements run at module level, i.e. also when
# the file is imported.
crawler = AppCrawler('https://itunes.apple.com/in/app/candy-crush-saga/id553834731', 1)
crawler.crawl()

for app in crawler.apps:
    # Function-call form: a syntax error-free equivalent of the Python 2
    # print statement that behaves the same on Python 2 and Python 3.
    print(app)