-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape1.py
34 lines (28 loc) · 1.1 KB
/
scrape1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from lxml import html
import requests
# Collect the URLs of individual blog posts from every paginated index page
# and write the unique ones, in first-seen order, to data/urls.txt.
#
# NOTE(review): nesting below (filter + dedup inside the per-page loop, file
# write and final print after it) is the intended structure -- confirm.

_INDEX_PAGE_PREFIX = 'http://www.patheos.com/blogs/unequallyyoked/posts/page/'
_FIRST_PAGE = 1
_LAST_PAGE = 230

# A link is kept only if it contains this substring ...
_REQUIRED_FRAGMENT = 'patheos.com/blogs/unequally'
# ... contains none of these substrings ...
_EXCLUDED_FRAGMENTS = ('#', 'author', 'tag', 'category', 'files', 'posts')
# ... and contains at least one of these year strings.
_YEAR_FRAGMENTS = ('2010', '2011', '2012', '2013', '2014', '2015')

# Seconds to wait on a single HTTP request before giving up, so one stalled
# connection cannot hang the whole run.
_REQUEST_TIMEOUT = 30


def _is_post_link(href):
    """Return True when *href* passes every substring rule for a post URL."""
    if _REQUIRED_FRAGMENT not in href:
        return False
    if any(fragment in href for fragment in _EXCLUDED_FRAGMENTS):
        return False
    return any(year in href for year in _YEAR_FRAGMENTS)


pages = [_INDEX_PAGE_PREFIX + str(x)
         for x in range(_FIRST_PAGE, _LAST_PAGE + 1)]

url = []      # unique post URLs, in the order they were first seen
seen = set()  # same URLs as `url`, kept for O(1) duplicate checks

for base_page in pages:
    page = requests.get(base_page, timeout=_REQUEST_TIMEOUT)
    tree = html.fromstring(page.text)
    # Index 2 of each iterlinks() item is the link target that gets filtered.
    for link in tree.iterlinks():
        href = link[2]
        if _is_post_link(href) and href not in seen:
            seen.add(href)
            url.append(href)

# `with` guarantees the file is closed even if a write fails part-way.
with open('data/urls.txt', 'w') as f:
    for m in url:
        f.write(m)
        f.write('\n')

# Prints the parsed tree object of the last page fetched.
print(tree)