9.py (forked from stanfordjournalism/search-script-scrape)
# The number of roll call votes that were rejected by a margin of less than 5 votes, in the first session of the U.S. Senate in the 114th Congress
# Note: this example shows how to scrape the Senate webpage, which is
# the WRONG thing to do in practice. Use the XML instead:
# http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_114_1.xml
# via https://twitter.com/octonion/status/611296541941321731
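# (an XML-based sketch is appended at the bottom of this file)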
from lxml import html
import requests
import re
congress_num = 114
session_num = 1
url = ('http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%s.htm'
% (congress_num, session_num))
doc = html.fromstring(requests.get(url).text)
# unnecessarily convoluted xpath statement, which I do here
# just so I can practice xpath statements
# http://stackoverflow.com/questions/1457638/xpath-get-nodes-where-child-node-contains-an-attribute
xstr = "//*[@id='contentArea']//table/tr[td[2][contains(text(), 'Rejected')]]"
# i.e. find all tr elements that have a 2nd td child with text that contains "Rejected"
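# for comparison, a plainer equivalent (a sketch, kept commented out):
# select every row, then apply the same td[2] test one row at a time:
#   rejected_rows = [r for r in doc.xpath("//*[@id='contentArea']//table/tr")
#                    if r.xpath("td[2][contains(text(), 'Rejected')]")]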
xcount = 0
for r in doc.xpath(xstr):
    # grab the yea-nay tally (the first "digits-digits" pattern) from the row's first <td>
    yeas, nays = re.search(r'(\d+)-(\d+)', r.find('td').text_content()).groups()
    if int(nays) - int(yeas) < 5:
        xcount += 1
print(xcount)
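

# --- Alternative: use the XML feed recommended in the note at the top ---
# A minimal sketch, assuming the feed's usual layout: each <vote> element
# carries a <result> child and a <vote_tally> child with <yeas>/<nays>
# children. The element names and the helper below are illustrative, not
# verified against the live feed.
def count_close_rejections_from_xml(congress_num, session_num):
    from lxml import etree
    xml_url = ('http://www.senate.gov/legislative/LIS/roll_call_lists/'
               'vote_menu_%s_%s.xml' % (congress_num, session_num))
    # parse the raw bytes so lxml handles the XML encoding declaration itself
    xdoc = etree.fromstring(requests.get(xml_url).content)
    count = 0
    for vote in xdoc.xpath("//vote[result='Rejected']"):
        yeas = int(vote.findtext('vote_tally/yeas'))
        nays = int(vote.findtext('vote_tally/nays'))
        if nays - yeas < 5:
            count += 1
    return count

# e.g.: print(count_close_rejections_from_xml(congress_num, session_num))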