# 63.py - forked from stanfordjournalism/search-script-scrape
# In the current dataset behind Medicare's Nursing Home Compare website, the total amount of fines received by penalized nursing homes
# landing page:
# https://data.medicare.gov/data/nursing-home-compare
import csv
import os
import requests
from lxml import html
from shutil import unpack_archive
from urllib.parse import urljoin, urlparse, parse_qs
LOCAL_DATADIR = "/tmp/nursinghomes"
CSV_NAME = os.path.join(LOCAL_DATADIR, 'Penalties_Download.csv')
LANDING_URL = 'https://data.medicare.gov/data/nursing-home-compare'
# Seconds to wait on the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 60

#
# The zip URL looks like this:
# https://data.medicare.gov/views/bg9k-emty/files/AsD4-xSfJuwZKwb_gMosljIKMST...
# TZ1PmBSoRGqivFmo?filename=DMG_CSV_DOWNLOAD20150501.zip&content_type=application%2Fzip%3B%20charset%3Dbinary
# We assume that the zip file URL changes frequently and can't be hardcoded,
# so we determine it from the landing page on every run.


def find_zip_href(hrefs, base_url):
    """Return the absolute URL of the first href containing 'CSV_DOWNLOAD'.

    hrefs may contain None (anchors without an href); those are skipped.
    Relative links are resolved against base_url.
    Raises IndexError when no matching link exists.
    """
    match = next((h for h in hrefs if h and 'CSV_DOWNLOAD' in h), None)
    if match is None:
        raise IndexError("no 'CSV_DOWNLOAD' link found on " + base_url)
    return urljoin(base_url, match)


def zip_basename(zipurl):
    """Return the local file name taken from the URL's 'filename' parameter.

    Only the final path component is kept, because the value comes from a
    remote page and must not be able to point outside LOCAL_DATADIR.
    Raises KeyError when the parameter is missing or has no usable name.
    """
    names = parse_qs(urlparse(zipurl).query).get('filename')
    name = os.path.basename(names[0]) if names else ''
    if not name:
        raise KeyError("no usable 'filename' query parameter in " + zipurl)
    return name


def sum_fines(rows):
    """Return the sum of the 'fine_amt' column over rows.

    rows is an iterable of dict-like records. Missing, empty or
    whitespace-only amounts are skipped.
    """
    amounts = (row['fine_amt'] for row in rows)
    return sum(float(amt) for amt in amounts if amt and amt.strip())


def fetch_archive(zipurl, zname):
    """Download zipurl to the path zname.

    The body is written to a temporary '.part' file and renamed only after a
    successful response, so an HTTP error or an interrupted transfer never
    leaves a bad file at zname that later runs would mistake for the archive.
    """
    resp = requests.get(zipurl, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    partial = zname + '.part'
    with open(partial, 'wb') as f:
        f.write(resp.content)
    os.replace(partial, zname)


def main():
    """Fetch the Nursing Home Compare data if needed and print total fines."""
    os.makedirs(LOCAL_DATADIR, exist_ok=True)

    resp = requests.get(LANDING_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    doc = html.fromstring(resp.text)
    hrefs = [a.attrib.get('href') for a in doc.cssselect('a')]
    zipurl = find_zip_href(hrefs, LANDING_URL)
    zname = os.path.join(LOCAL_DATADIR, zip_basename(zipurl))

    need_download = not os.path.exists(zname)
    if need_download:
        print("Downloading", zipurl, 'to', zname)
        fetch_archive(zipurl, zname)
    # Also unpack when the archive is cached but the CSV is missing, e.g. an
    # earlier run was interrupted between download and extraction.
    if need_download or not os.path.exists(CSV_NAME):
        print('Unzipping', zname, 'to', LOCAL_DATADIR)
        unpack_archive(zname, LOCAL_DATADIR)

    # newline='' is what the csv module expects for files it parses.
    with open(CSV_NAME, encoding='ISO-8859-1', newline='') as f:
        print(sum_fines(csv.DictReader(f)))


if __name__ == "__main__":
    main()