-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrecode.py
executable file
·154 lines (144 loc) · 7.11 KB
/
recode.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#As a work of the United States government, this project is in the public
#domain within the United States. Additionally, we waive copyright and related
#rights in the work worldwide through the CC0 1.0 Universal public domain
#dedication (https://creativecommons.org/publicdomain/zero/1.0/)
''' Convert Recon/Reid format data back into format needed for metrics.
Mainly this consists of recoding the individual race variables into
the 63 category version. '''
from itertools import product
from operator import itemgetter
import pandas as pd
#DTYPES for rhdf file
DTYPES = {'geoid_block': 'str',
'sex': 'str',
'age': 'int',
'white': 'str',
'black': 'str',
'aian': 'str',
'asian': 'str',
'nhopi': 'str',
'sor': 'str',
'hisp': 'str'}
#Dictionary of race code sets
RACE_DICT = {
'WHITE': {'01', '07', '08', '09', '10', '11', '22', '23', '24', '25', '26',
'27', '28', '29', '30', '31', '42', '43', '44', '45', '46', '47', '48',
'49', '50', '51', '57', '58', '59', '60', '61', '63'},
'BLACK': {'02', '07', '12', '13', '14', '15', '22', '23', '24', '25', '32',
'33', '34', '35', '36', '37', '42', '43', '44', '45', '46', '47', '52',
'53', '54', '55', '57', '58', '59', '60', '62', '63'},
'AIAN': {'03', '08', '12', '16', '17', '18', '22', '26', '27', '28', '32', '33',
'34', '38', '39', '40', '42', '43', '44', '48', '49', '50', '52', '53',
'54', '56', '57', '58', '59', '61', '62', '63'},
'ASIAN': {'04', '09', '13', '16', '19', '20', '23', '26', '29', '30', '32',
'35', '36', '38', '39', '41', '42', '45', '46', '48', '49', '51', '52',
'53', '55', '56', '57', '58', '60', '61', '62', '63'},
'NHOPI': {'05', '10', '14', '17', '19', '21', '24', '27', '29', '31', '33',
'35', '37', '38', '40', '41', '43', '45', '47', '48', '50', '51', '52',
'54', '55', '56', '57', '59', '60', '61', '62', '63'},
'SOR': {'06', '11', '15', '18', '20', '21', '25', '28', '30', '31', '34', '36',
'37', '39', '40', '41', '44', '46', '47', '49', '50', '51', '53', '54',
'55', '56', '58', '59', '60', '61', '62', '63'}
}
RACE_ORDER = ['WHITE', 'BLACK', 'AIAN', 'ASIAN', 'NHOPI', 'SOR']
def make_cenrace(race_dict, race_order):
''' function to define map of rhdf race categories back into cenrace values
e.g. YNNNNN -> 01 #white only
returns: a dictionary of form {'XXXXXX': 'YY', ...} giving the mapping
'''
ret = {}
possibles = [''.join(x) for x in list(product('YN', repeat=6))]
for possible in possibles:
#location of yesses
Ylocs = [pos for pos, char in enumerate(possible) if char=='Y']
#location of noes
Nlocs = [pos for pos, char in enumerate(possible) if char=='N']
Yraces = [race_order[x] for x in Ylocs]
Nraces = [race_order[x] for x in Nlocs]
Yintersection = set()
Nunion = set()
if(len(Yraces) > 0):
Ysets = itemgetter(*Yraces)(race_dict)
Yintersection = Ysets
if(len(Yraces) > 1):
Yintersection = set.intersection(*Ysets)
if(len(Nraces) > 0):
Nsets = itemgetter(*Nraces)(race_dict)
Nunion = Nsets
if(len(Nraces) > 1):
Nunion = set.union(*Nsets)
#Category we need must satisfy all the Y values and none of the N values
rcat = list(Yintersection - Nunion)
if(len(rcat) > 0):
ret[possible] = int(rcat[0])
return(ret)
RACE_MAP = make_cenrace(RACE_DICT, RACE_ORDER)
def process_file(rhdf_path, outname, isCEF=False):
''' Function to process an input reconstruction-format file
into the metrics format. Certain variables expected in the
metrics file, such as relationship, do not exist in the
reconstructed files. These are filled in with stub values
to allow for processing of metrics on existing variables'''
print(f'Reading {rhdf_path}')
rHDF = pd.read_csv(rhdf_path, dtype=DTYPES, usecols=DTYPES.keys())
print('Formatting Data')
#column manipulation for r&r cef file
if(isCEF):
print('Formatting CEF')
print('Formatting white')
rHDF.white.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting black')
rHDF.black.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting aian')
rHDF.aian.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting asian')
rHDF.asian.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting nhopi')
rHDF.nhopi.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting sor')
rHDF.sor.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting hisp')
rHDF.hisp.replace({'0': 'N', '1': 'Y'}, inplace=True)
print('Formatting sex')
rHDF.sex.replace({'0': 'female', '1': 'male'}, inplace=True)
print('Formatting race')
rHDF['race'] = rHDF['white'] + rHDF['black'] + rHDF['aian'] + rHDF['asian'] + rHDF['nhopi'] + rHDF['sor']
print('Formatting CENRACE')
rHDF['CENRACE'] = rHDF.race.replace(RACE_MAP)
print('Formatting TABBLKST')
rHDF['TABBLKST'] = rHDF.geoid_block.str[:2]
print('Formatting TABBLKCOU')
rHDF['TABBLKCOU'] = rHDF.geoid_block.str[2:5]
print('Formatting TABTRACTCE')
rHDF['TABTRACTCE'] = rHDF.geoid_block.str[5:11]
print('Formatting TABLBLKGRPCE')
rHDF['TABBLKGRPCE'] = rHDF.geoid_block.str[11:12]
print('Formatting TABBLK')
rHDF['TABBLK'] = rHDF.geoid_block.str[11:]
print('Formatting QAGE')
rHDF['QAGE'] = rHDF.age
print('Formatting CENHISP')
rHDF['CENHISP'] = rHDF.hisp.replace({'N': '1', 'Y': '2'})
print('Formatting QSEX')
rHDF['QSEX'] = rHDF.sex.replace({'male': '1', 'female': '2'})
print('Formatting RELSHIP')
rHDF['RELSHIP'] = 999
print('Formatting RTYPE')
rHDF['RTYPE'] = 'STUB'
print('Formatting GQTYPE')
rHDF['GQTYPE'] = 999
OUTCOLS = ['CENRACE', 'TABBLKST', 'RELSHIP', 'RTYPE', 'CENHISP', 'TABBLKGRPCE', 'QSEX', 'TABBLK', 'QAGE', 'TABTRACTCE', 'GQTYPE', 'TABBLKCOU']
print(f'Outputting {outname}')
if(isCEF):
rHDF[OUTCOLS].rename({'relship': 'qrel'}, axis='columns').to_csv(outname, index=False)
else:
rHDF[OUTCOLS].to_csv(outname, index=False)
return(True)
if __name__ == '__main__':
from argparse import ArgumentParser
parser = ArgumentParser(description='Converts files in reconstructed data format into format needed for accuracy metric calculations')
parser.add_argument('--infile', type=str, help='File to convert')
parser.add_argument('--outfile', type=str, help='File to save')
parser.add_argument('--cef', action='store_true', help='CSV is the CEF file, which has different naming conventions for metrics')
args = parser.parse_args()
process_file(args.infile, args.outfile, isCEF=args.cef)