-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpublic_data_matching.py
106 lines (90 loc) · 3.42 KB
/
public_data_matching.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
'''A script to match library data taken from MuckRock to public library/district
data from the original exports.
by @peteyreplies
'''
import csv
from collections import OrderedDict
#files and folders
base_path = '../RESOURCES/geocen/'
#start from an empty output file; the write loop further down opens it again
#itself, so this handle is closed right away instead of being left dangling
f = open('../DATADUMP/GeoCen/merged_public_data.csv','w')
f.close()


def _load_rows(filename):
    '''Return every row of the csv at base_path + filename as a list of dicts.

    Each dict is one row as produced by csv.DictReader. The input file is
    closed as soon as it has been read.
    '''
    #'rU' (universal newlines) is kept from the original python 2 script
    with open(base_path + filename, 'rU') as source:
        return list(csv.DictReader(source))


#first, load csvs into lists of dicts
muckrock_data = _load_rows('muckrock_data.csv')
library_data = _load_rows('Alabama Public Libraries(IMLS).csv')
school_data = _load_rows('Alabama Public School Districts(NCES).csv')
#translation table: locale code (as found in the LOCALE / ULOCAL columns)
#mapped to the readable label stored in the merged output
locales = dict((
    #cities
    ('11', 'City, Large'),
    ('12', 'City, Midsize'),
    ('13', 'City, Small'),
    #suburbs
    ('21', 'Suburb, Large'),
    ('22', 'Suburb, Midsize'),
    ('23', 'Suburb, Small'),
    #towns
    ('31', 'Town, Fringe'),
    ('32', 'Town, Distant'),
    ('33', 'Town, Remote'),
    #rural areas
    ('41', 'Rural, Fringe'),
    ('42', 'Rural, Distant'),
    ('43', 'Rural, Remote'),
    #non-numeric codes
    ('M', 'Missing'),
    ('N', 'Not Applicable'),
))
#loop thru muckrock list: attach the matching library / school district record
#to each agency and write every agency out as one row of the merged csv
def _find_match(records, zip_field, name_field, entity):
    '''Return the row of records that matches entity, or None if there is none.

    records    -- list of row dicts to search (library_data or school_data)
    zip_field  -- column of records compared against the agency ZIP
    name_field -- column of records compared against the agency name
    entity     -- muckrock row; its 'agency_zip' and 'agency_name' are read

    A single ZIP hit is accepted without looking at the name. Otherwise the
    first ZIP hit whose name equals the agency name (ignoring case) is used.
    '''
    zip5 = entity['agency_zip'][:5] #only the first five characters are compared
    candidates = [r for r in records if r[zip_field] == zip5]
    if len(candidates) == 1:
        return candidates[0]
    wanted = entity['agency_name'].lower()
    for row in candidates:
        if row[name_field].lower() == wanted:
            return row
    return None


#open the output once for the whole run: the header is written exactly once
#and the handle is closed (and flushed) when the loop ends or fails
with open('../DATADUMP/GeoCen/merged_public_data.csv','w') as f:
    out = csv.writer(f)
    header_written = False
    for d in muckrock_data:
        #specify common fields up front so every row carries the same columns
        d['lat'] = '' #latitude
        d['long'] = '' #longitude
        d['locale'] = '' #federally defined locale type
        d['cdcode'] = '' #congressional district code
        d['conum'] = '' #ANSI County Code
        d['fedID'] = '' #unique ID in associated dataset
        d['total_librarians'] = '' #num of total librarians; only available for libraries
        d['mls-ala'] = '' #num of mls-ala credentialed librarians
        d['computer_users'] = '' #users of public computers per year; only available for libraries
        #if it's a library, look in the library data
        if d['agency_type'] == 'Library':
            #find matches based on zip and name
            match = _find_match(library_data, 'ZIP_M', 'LIBNAME', d)
            if match is not None:
                d['lat'] = match['LATITUDE']
                d['long'] = match['LONGITUD']
                d['locale'] = locales[match['LOCALE']]
                d['cdcode'] = match['CDCODE']
                #NOTE(review): the column name suggests a FIPS place code rather
                #than a county code -- confirm against the library data dictionary
                d['conum'] = match['FIPSPLAC']
                d['fedID'] = match['FSCSKEY']
                d['total_librarians'] = match['LIBRARIA']
                d['mls-ala'] = match['MASTER']
                d['computer_users'] = match['PITUSR']
        #otherwise, look in the school data
        else:
            match = _find_match(school_data, 'MZIP', 'NAME', d)
            if match is not None:
                d['lat'] = match['LATCOD']
                d['long'] = match['LONCOD']
                d['locale'] = locales[match['ULOCAL']]
                d['cdcode'] = match['CDCODE']
                d['conum'] = match['CONUM']
                d['fedID'] = match['LEAID']
        #an agency with no matching record used to abort the whole run with an
        #IndexError; it is now reported and written with its extra fields blank
        if match is None:
            print('no match found for entity ' + d['agency_name'] + '; extra fields left blank')
        #columns are emitted in sorted key order, the header row comes first
        columns, values = zip(*sorted(d.items()))
        if not header_written:
            out.writerow(columns)
            header_written = True
        out.writerow(values)
        #the name is printed as-is: forcing it through .encode('ascii') crashed
        #on any non-ASCII agency name
        print('written data for entity ' + d['agency_name'])