forked from stanfordjournalism/search-script-scrape
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path12.py
51 lines (49 loc) · 2.33 KB
/
12.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# In 2010, the year-over-year change in enplanements at America's busiest airport
# The landing page for this data is:
# http://www.faa.gov/airports/planning_capacity/passenger_allcargo_stats/passenger/
# For each year, there's a separate page with a table of links, including
# XLS format:
# e.g. ./passenger/media/cy10_primary_enplanements.xls
import csv
import requests
# we can't be sure that the XLS has the same naming convention year over year
# so let's do a little HTML parsing
from lxml import html
from os.path import basename
from urllib.parse import urljoin
from xlrd import open_workbook
BASE_URL = "http://www.faa.gov/airports/planning_capacity/passenger_allcargo_stats/passenger/"
YEAR = 2010
resp = requests.get(BASE_URL, params = {'year': YEAR})
doc = html.fromstring(resp.text)
# There are several spreadsheets and conventions over the years. I'm going to
# be lazy and just pick the first spreadsheet with "enplanements" and assume it's the primary
# doc
xls_url = doc.xpath("//a[contains(@href, 'enplanements') and contains(@href, 'xls')]/@href")[0]
print("Downloading", xls_url)
xresp = requests.get(urljoin(BASE_URL, xls_url))
# save to disk
fn = "/tmp/" + basename(xls_url)
with open(fn, "wb") as f:
f.write(xresp.content)
# open with xlrd
book = open_workbook(fn)
sheet = book.sheets()[0]
# Format looks like this:
# | Airport | CY 10 | CY 09 |
# | Name | Enplanements | Enplanements |
# |--------------------------------------------|--------------|--------------|
# | Hartsfield - Jackson Atlanta International | 43,130,585 | 42,280,868 |
# | Chicago O'Hare International | 32,171,831 | 31,135,732 |
# | Los Angeles International | 28,857,755 | 27,439,897 |
headers = sheet.row_values(0)
# get all the data rows
rows = [sheet.row_values(i) for i in range(1, sheet.nrows)]
# make them into dicts
drows = [dict(zip(headers, row)) for row in rows]
# remove rows without 'CY 10 Enplanements' as a float
drows = [d for d in drows if isinstance(d['CY 10 Enplanements'], float)]
# get biggest airport
airport = max(drows, key = lambda r: r['CY 10 Enplanements'])
print("%s: %i" % (airport['Airport Name'], airport['CY 10 Enplanements'] - airport['CY 09 Enplanements']))
# Hartsfield - Jackson Atlanta International: 849717