#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Sources for this example script:
# http://docs.python-guide.org/en/latest/scenarios/scrape/
# http://docs.ckan.org/en/latest/api/index.html#example-importing-datasets-with-the-ckan-api
# BROKEN: HTTP error 409
# [Wed Dec 03 17:26:04 2014] [error] 2014-12-03 17:26:04,746 ERROR [ckan.controllers.api]
# Validation error: "{'__type': 'Validation Error', 'name': [u'Url must be purely lowercase alphanumeric (ascii) characters and these symbols: -_']}"
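"""Scrape the GOBLET training portal at http://www.mygoblet.org/ and register
each course with a CKAN instance. `Tutorial` and `CKANUploader` are assumed to
come from the local `training` module imported below."""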
from bs4 import BeautifulSoup
from training import *
import datetime
import re
import pprint
import urllib2
root_url = 'http://www.mygoblet.org/'
owner_org = 'goblet'
lessons = {}
def parse_data(page):
    """Scrape one listing page of the GOBLET training portal into `lessons`."""
    topic_match = re.compile('topic-tags')
    audience_match = re.compile('audience-tags')
    portal_match = re.compile('training-portal')
    response = urllib2.urlopen(root_url + page)
    html = response.read()
    #pprint.pprint(html)
    #with open("goblet.html", "r") as myfile:
    #    data = myfile.read().replace('\n', '')
    tree = BeautifulSoup(html)  # or BeautifulSoup(data), if reading locally
    rows = tree.find_all('tbody')[0].find_all('tr')
    for row in rows:
        key = None
        name = None
        topics = []
        audience = []
        links = row.find_all('a')
        if not links:
            continue  # skip rows that carry no links at all
        for link in links:
            href = link.get('href')
            text = link.contents[0].encode('utf8', 'ignore')
            if topic_match.search(href):
                topics.append(text)
            elif audience_match.search(href):
                audience.append(text)
            elif portal_match.search(href):
                key = href
                name = text
        # The first cell holds the course link plus a relative date; strip
        # the link text so only the date phrase is left for return_date().
        cells = row.find_all('td')
        stuff = cells[0].get_text().encode('utf8', 'ignore').strip()
        reldate = stuff.replace(links[0].contents[0].encode('utf8', 'ignore'), '')
        date_modified = return_date(reldate)
        if key:  # only keep rows that actually link into the training portal
            lessons[key] = {'audience': audience, 'topics': topics,
                            'last_modified': date_modified, 'name': name}
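# After all pages are parsed, each `lessons` entry is keyed by the course's
# href within the portal. Illustrative shape (hypothetical values, not real
# scraped data):
#   lessons['training-portal/some-course'] = {
#       'audience': ['PhD students'],
#       'topics': ['Proteomics'],
#       'last_modified': datetime.date(2014, 10, 1),
#       'name': 'Some course'}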
# This monstrosity would not be required if we had a proper feed
# with the actual date in it.
def return_date(datestring):
    """Turn a relative date string into an approximate datetime.date.

    The string is assumed to start with a leading word (parts[0]), followed
    by one amount/unit pair (parts[1], parts[2]) and optionally a second
    (parts[3], parts[4]).
    """
    parts = datestring.split()
    today = datetime.date.today()
    years = 0
    months = 0
    weeks = 0
    days = 0
    year_match = re.compile('year')
    month_match = re.compile('month')
    week_match = re.compile('week')
    day_match = re.compile('day')
    if month_match.search(parts[2]):
        months = int(parts[1])
    elif year_match.search(parts[2]):
        years = int(parts[1])
    elif week_match.search(parts[2]):
        weeks = int(parts[1])
    elif day_match.search(parts[2]):
        days = int(parts[1])
    if len(parts) > 4:  # the second amount/unit pair is optional
        if month_match.search(parts[4]):
            months = int(parts[3])
        elif week_match.search(parts[4]):
            weeks = int(parts[3])
        elif day_match.search(parts[4]):
            days = int(parts[3])
    # Approximate months as 30 days and years as 365 days.
    diff = days + (weeks * 7) + (months * 30) + (years * 365)
    delta = datetime.timedelta(days=diff)
    earlier = today - delta
    return earlier
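# Rough sketch of the conversion return_date() performs, assuming the portal
# renders relative dates with a leading word before the amount/unit pairs:
#   return_date('Updated 1 year 2 months ago')
#   -> datetime.date.today() - datetime.timedelta(days=1 * 365 + 2 * 30)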
##################################################
# Main body of the script below, functions above #
##################################################
pages = ['0', '1', '2']
for p in pages:
    parse_data('training-portal?page=' + p)

#uploader = CKANUploader(None)
# each individual tutorial
for key in lessons:
    course = Tutorial()
    course.url = root_url + key
    course.owning_org = owner_org
    course.title = lessons[key]['name']
    course.set_name(owner_org, lessons[key]['name'])
    course.last_modified = str(lessons[key]['last_modified'])
    course.created = str(lessons[key]['last_modified'])
    course.audience = lessons[key]['audience']
    course.keywords = lessons[key]['topics']
    course.format = 'html'
    # Before attempting to create anything we need to check if the
    # resource/dataset already exists, updating it as and where necessary.
    CKANUploader.create_or_update(course)