-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetrics.py
executable file
·2592 lines (2256 loc) · 301 KB
/
metrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding: utf-8
#As a work of the United States government, this project is in the public
#domain within the United States. Additionally, we waive copyright and related
#rights in the work worldwide through the CC0 1.0 Universal public domain
#dedication (https://creativecommons.org/publicdomain/zero/1.0/)
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("-c","--config", help="pick config file", required=True)
args = parser.parse_args()
import yaml
# safe_load is equivalent to load(..., Loader=yaml.SafeLoader); the `with`
# block closes the config file handle (previously left open for the GC).
with open(f'./{args.config}') as _cfgfh:
    config = yaml.safe_load(_cfgfh)
# Required settings: a missing key here should fail loudly (KeyError).
HDFDATADIR = config['METRICS_CONFIG']['HDFDATADIR']
OUTPUTDIR = config['METRICS_CONFIG']['OUTPUTDIR']
MDFPERFILEPATH = config['METRICS_CONFIG']['MDFPERFILEPATH']
MDFPERPRFILEPATH = config['METRICS_CONFIG']['MDFPERPRFILEPATH']
separatePR = config['METRICS_CONFIG']['separatePR']
# Optional flags default to False when absent.  The original bare `except:`
# clauses are narrowed to the exceptions a missing/odd key can actually raise.
try:
    runAGEBYRACE = config['METRICS_CONFIG']['runAGEBYRACE']
except (KeyError, TypeError):
    runAGEBYRACE = False
try:
    runPRhere = config['METRICS_CONFIG']['runPRhere']
except (KeyError, TypeError):
    runPRhere = False
mergeValidation = None # or "1:1" to have pandas check merge cardinality
try:
    GETMISSIONNAMES = config['METRICS_CONFIG']['GETMISSIONNAMES']
except (KeyError, TypeError):
    GETMISSIONNAMES = False
print("runPRhere: ",runPRhere)
from datetime import datetime
print("{} Start".format(datetime.now()))
if GETMISSIONNAMES:
    import re

    def _mission_name(path):
        # Scan the first 10 header lines of an MDF file for the first
        # "<MISSION-NAME>.log" reference; returns the re.Match (or None).
        with open(path, 'r') as fh:
            header = " ".join(next(fh) for _ in range(10))
        # BUGFIX: the original class [A-Z|\-|\_]+ also matched a literal '|'
        # ('|' has no alternation meaning inside a character class).
        return re.search(r"([A-Z][A-Z_\-]+)\.log", header)

    perp = _mission_name(MDFPERFILEPATH)
    if runPRhere:
        perpr = _mission_name(MDFPERPRFILEPATH)
    else:
        perpr = ""
    # Record when this metrics run started and which mission file(s) fed it.
    with open(f'{OUTPUTDIR}/filemissionnames_personmetrics.csv','w') as f:
        if runPRhere:
            f.write("MetricsRunStarted,PersonMissionName,PersonPRMissionName"+'\n')
            f.write(f"{datetime.now()},{perp.groups()[0]},{perpr.groups()[0]}")
        else:
            f.write("MetricsRunStarted,PersonMissionName"+'\n')
            f.write(f"{datetime.now()},{perp.groups()[0]}")
import pandas as pd
import numpy as np
from collections import OrderedDict
from statistics import median_grouped
print("{} Libraries Imported".format(datetime.now()))
# 2010 state FIPS code -> USPS abbreviation; includes '72' = Puerto Rico.
statedict = OrderedDict({'02': 'AK', '01': 'AL', '05': 'AR', '04': 'AZ', '06': 'CA', '08': 'CO', '09': 'CT', '11': 'DC', '10': 'DE', '12': 'FL', '13': 'GA', '15': 'HI', '19': 'IA', '16': 'ID', '17': 'IL', '18': 'IN', '20': 'KS', '21': 'KY', '22': 'LA', '25': 'MA', '24': 'MD', '23': 'ME', '26': 'MI', '27': 'MN', '29': 'MO', '28': 'MS', '30': 'MT', '37': 'NC', '38': 'ND', '31': 'NE', '33': 'NH', '34': 'NJ', '35': 'NM', '32': 'NV', '36': 'NY', '39': 'OH', '40': 'OK', '41': 'OR', '42': 'PA', '44': 'RI', '45': 'SC', '46': 'SD', '47': 'TN', '48': 'TX', '49': 'UT', '51': 'VA', '50': 'VT', '53': 'WA', '55': 'WI', '54': 'WV', '56': 'WY', '72':'PR'})
# Collapsed 7-category "race alone" code -> display label.
racealonedict = {1: 'White Alone', 2: 'Black Alone', 3: 'AIAN Alone', 4: 'Asian Alone', 5: 'NHOPI Alone', 6: 'Some Other Race Alone', 7: 'Two Or More Races'}
# Indicator-column name -> display label for "alone or in combination" recodes.
raceincombdict = {'whitealone-or-incomb':'White Alone or In Combination', 'blackalone-or-incomb':'Black Alone or In Combination', 'aianalone-or-incomb':'AIAN Alone or In Combination', 'asianalone-or-incomb':'Asian Alone or In Combination', 'nhopialone-or-incomb': 'NHOPI Alone or In Combination', 'soralone-or-incomb':'Some Other Race Alone or In Combination'}
# CENHISP code -> label.
hispdict = {'1': 'Not Hispanic', '2':'Hispanic'}
# QSEX code -> label.
sexdict = {'1': 'Male', '2':'Female'}
def calc_perc(da):
    """Return the relative difference for one row.

    Normally Diff / HDF_Population.  When the HDF count is zero the ratio is
    taken against the HDF/MDF average instead (and defined as 0 when both
    populations agree at zero).
    """
    hdf_pop = da['HDF_Population']
    diff = da['Diff']
    if hdf_pop != 0:
        return diff / hdf_pop
    if diff == 0:
        return 0
    # HDF is zero but MDF is not: fall back to the midpoint denominator.
    return diff / ((hdf_pop + da['MDF_Population']) / 2)
def calculate_stats(data):
    """Add per-cell comparison columns to a merged HDF/MDF population table.

    Expects 'HDF_Population' and 'MDF_Population' columns (NaNs from an outer
    merge are treated as 0) and adds Diff, AbsDiff, PercDiff (percent, via
    calc_perc) and AbsPercDiff.  The input frame is not modified.
    """
    out = data.copy().fillna({'HDF_Population': 0, 'MDF_Population': 0})
    out['Diff'] = out['MDF_Population'] - out['HDF_Population']
    out['AbsDiff'] = out['Diff'].abs()
    out['PercDiff'] = 100 * out.apply(calc_perc, axis='columns')
    out['AbsPercDiff'] = out['PercDiff'].abs()
    return out
def calculate_ss(data, geography, sizecategory, characteristic):
    """Summarize one slice of per-cell comparison stats into a one-row frame.

    `data` must carry the columns produced by calculate_stats (Diff, AbsDiff,
    PercDiff, AbsPercDiff, HDF_Population).  An empty slice yields a row of
    zeros so the output table keeps a record for every geography/size cell.
    """
    # Zero defaults, in the exact column order of the summary table.
    metrics = {
        'MinDiff': 0,
        'MeanDiff': 0,
        'MedianDiff': 0,
        'MaxDiff': 0,
        'MeanAbsDiff': 0,
        'MedianAbsDiff': 0,
        'AbsDiff90th': 0,
        'AbsDiff95th': 0,
        'MinPercDiff': 0,
        'MeanPercDiff': 0,
        'MedianPercDiff': 0,
        'MaxPercDiff': 0,
        'PercDiffNAs': 0,
        'MeanAbsPercDiff': 0,
        'MedianAbsPercDiff': 0,
        'AbsPercDiff90th': 0,
        'AbsPercDiff95th': 0,
        'AbsPercDiffMax': 0,
        'AbsPercDiffNAs': 0,
        'RMSE': 0,
        'CV': 0,
        'MeanHDFPop': 0,
        'NumCells': 0,
        'NumberBtw2Perc5Perc': 0,
        'NumberGreater5Perc': 0,
        'NumberGreater200': 0,
        'NumberGreater10Perc': 0,
    }
    if len(data) > 0:
        diff = data['Diff']
        absdiff = data['AbsDiff']
        perc = data['PercDiff']
        absperc = data['AbsPercDiff']
        hdf_mean = np.nanmean(data['HDF_Population'])
        rmse = np.sqrt((diff ** 2).mean())
        metrics.update({
            'MinDiff': np.nanmin(diff),
            'MeanDiff': np.nanmean(diff),
            'MedianDiff': np.nanmedian(diff),
            'MaxDiff': np.nanmax(diff),
            'MeanAbsDiff': np.nanmean(absdiff),
            'MedianAbsDiff': np.nanmedian(absdiff),
            'AbsDiff90th': np.nanpercentile(absdiff, 90),
            'AbsDiff95th': np.nanpercentile(absdiff, 95),
            'MinPercDiff': np.nanmin(perc),
            'MeanPercDiff': np.nanmean(perc),
            'MedianPercDiff': np.nanmedian(perc),
            'MaxPercDiff': np.nanmax(perc),
            'PercDiffNAs': np.sum(perc.isnull()),
            'MeanAbsPercDiff': np.nanmean(absperc),
            'MedianAbsPercDiff': np.nanmedian(absperc),
            'AbsPercDiff90th': np.nanpercentile(absperc, 90),
            'AbsPercDiff95th': np.nanpercentile(absperc, 95),
            'AbsPercDiffMax': np.nanmax(absperc),
            'AbsPercDiffNAs': np.sum(absperc.isnull()),
            'RMSE': rmse,
            # Coefficient of variation is undefined for a zero mean.
            'CV': 100 * (rmse / hdf_mean) if hdf_mean != 0 else np.nan,
            'MeanHDFPop': hdf_mean,
            'NumCells': len(data),
            'NumberBtw2Perc5Perc': len(data[(absperc >= 2) & (absperc <= 5)]),
            'NumberGreater5Perc': len(data[absperc > 5]),
            'NumberGreater200': len(data[absdiff > 200]),
            'NumberGreater10Perc': len(data[absperc > 10]),
        })
    row = {
        'Geography': geography,
        'Size_Category': sizecategory,
        'Characteristic': characteristic,
    }
    row.update(metrics)
    return pd.DataFrame(row, index=[0])
# Output Data Frame: one summary row per geography/size/characteristic cell,
# appended to throughout the rest of the script.
outputdf = pd.DataFrame(data=None, columns = ['Geography','Size_Category','Characteristic','MinDiff','MeanDiff','MedianDiff','MaxDiff','MeanAbsDiff','MedianAbsDiff', 'AbsDiff90th','AbsDiff95th','MinPercDiff','MeanPercDiff','MedianPercDiff','MaxPercDiff','PercDiffNAs', 'MeanAbsPercDiff','MedianAbsPercDiff','AbsPercDiff90th','AbsPercDiff95th','AbsPercDiffMax','AbsPercDiffNAs','RMSE', 'CV','MeanHDFPop','NumCells','NumberBtw2Perc5Perc','NumberGreater5Perc','NumberGreater200','NumberGreater10Perc','AvgAbsDiffMedAge','AvgAbsDiffSexRatio','TAES','CountMDFltHDF','MedianPctDiffWhereMDFltHDF','Number100PlusMDFLessThan20HDF','NumberLessThan20MDF100PlusHDF','NumberHDFgt50kMDFlt50k','NumberHDFlt50kMDFgt50k','NumberHDFneMDF','Inconsistent'])
print("{} Output DF Frame Created OK".format(datetime.now()))
#Set up blocks, tracts, counties from Geography file
# Block-level crosswalk: one row per 2010 block with the GEOIDs of every
# geography that block belongs to.  All columns read as strings.
allgeos = pd.read_csv(f"{HDFDATADIR}/blocks_to_geos_2010.csv", dtype=str)
# Split Puerto Rico (state FIPS 72) off from the stateside crosswalk.
allgeospr = allgeos[allgeos.StateGEOID == "72"]
allgeos = allgeos[allgeos.StateGEOID != "72"]
# Distinct GEOID lists for each summary level (notnull filters drop blocks
# that lie outside that geography type).
allblocks = list(allgeos.BlockGEOID.unique())
allblockgroups = list(allgeos.BlockGroupGEOID.unique())
alltracts = list(allgeos.TractGEOID.unique())
allcounties = list(allgeos.CountyGEOID.unique())
allplaces = list(allgeos.loc[allgeos['IncPlaceGEOID'].notnull(), 'IncPlaceGEOID'].unique())
allstates = list(allgeos.StateGEOID.unique())
allaiannh = list(allgeos.loc[allgeos['AIANNHGEOID'].notnull(), 'AIANNHGEOID'].unique())
allfedairs = list(allgeos.loc[allgeos['FedAIRGEOID'].notnull(), 'FedAIRGEOID'].unique())
allotsas = list(allgeos.loc[allgeos['OTSAGEOID'].notnull(), 'OTSAGEOID'].unique())
allanvsas = list(allgeos.loc[allgeos['ANVSAGEOID'].notnull(), 'ANVSAGEOID'].unique())
allelemschdists = list(allgeos.loc[allgeos['SchDistEGEOID'].notnull(), 'SchDistEGEOID'].unique())
allsecschdists = list(allgeos.loc[allgeos['SchDistSGEOID'].notnull(), 'SchDistSGEOID'].unique())
allunischdists = list(allgeos.loc[allgeos['SchDistUGEOID'].notnull(), 'SchDistUGEOID'].unique())
allmcds = list(allgeos.loc[allgeos['MCDGEOID'].notnull(), 'MCDGEOID'].unique())
alltractspr = list(allgeospr.TractGEOID.unique())
allcountiespr = list(allgeospr.CountyGEOID.unique())
del allgeospr
#
# Pre-built indexes used to reindex groupby results so that geographies with
# zero population still appear (fill_value=0) in the comparison tables.
allstatesindex = pd.Index(allstates, name='GEOID')
allcountiesindex = pd.Index(allcounties, name='GEOID')
alltractsindex = pd.Index(alltracts, name='GEOID')
allblockgroupsindex = pd.Index(allblockgroups, name='GEOID')
allblocksindex = pd.Index(allblocks, name='GEOID')
allplacesindex = pd.Index(allplaces, name='IncPlaceGEOID')
allaiannhindex = pd.Index(allaiannh, name="AIANNHGEOID")
allfedairsindex = pd.Index(allfedairs, name="FedAIRGEOID")
allotsasindex = pd.Index(allotsas, name="OTSAGEOID")
allanvsasindex = pd.Index(allanvsas, name="ANVSAGEOID")
allelemschdistsindex = pd.Index(allelemschdists, name="SchDistEGEOID")
allsecschdistsindex = pd.Index(allsecschdists, name="SchDistSGEOID")
allunischdistsindex = pd.Index(allunischdists, name="SchDistUGEOID")
allmcdsindex = pd.Index(allmcds, name="MCDGEOID")
allcountiesprindex = pd.Index(allcountiespr, name='GEOID')
alltractsprindex = pd.Index(alltractspr, name='GEOID')
print("{} Geography Block Crosswalk Input OK".format(datetime.now()))
# Category universes used when tabulating.
agecats = [i for i in range(0,116)]  # single years of age 0-115
racealonecats = [1,2,3,4,5,6,7]
sexcats = ['1','2']
hispcats = ['1','2'] # Hisp Cats as string here. Not so with PLonly due to memory.
numracescats = ['1','2','3','4','5','6']
racecats = [f'{i:02d}' for i in range(1,64)]  # zero-padded CENRACE codes '01'-'63'
gqinstcats = ['INST', 'NONINST']
gqmajortypecats = ['Correctional', 'Juvenile', 'Nursing', 'OtherInst', 'College', 'Military', 'OtherNoninst']
# NOTE(review): presumably the twelve strong-MCD states (state FIPS codes) —
# confirm against the intended MCD universe.
mcdstates = ['09', '25', '23', '26', '27', '33', '34', '36', '42', '44', '50', '55']
def assign_racealone_or_incomb(df):
    """Return a copy of `df` with six 0/1 "race alone or in combination" flags.

    Each new column is 1 when the row's CENRACE code (1-63) includes that
    major race group, either alone or in any multi-race combination.
    """
    df = df.copy()
    # CENRACE codes whose combination includes each group; the single-race
    # code (1-6) is included, making these "alone OR in combination" sets.
    group_codes = {
        'whitealone-or-incomb': [1,7,8,9,10,11,22,23,24,25,26,27,28,29,30,31,42,43,44,45,46,47,48,49,50,51,57,58,59,60,61,63],
        'blackalone-or-incomb': [2,7,12,13,14,15,22,23,24,25,32,33,34,35,36,37,42,43,44,45,46,47,52,53,54,55,57,58,59,60,62,63],
        'aianalone-or-incomb': [3,8,12,16,17,18,22,26,27,28,32,33,34,38,39,40,42,43,44,48,49,50,52,53,54,56,57,58,59,61,62,63],
        'asianalone-or-incomb': [4,9,13,16,19,20,23,26,29,30,32,35,36,38,39,41,42,45,46,48,49,51,52,53,55,56,57,58,60,61,62,63],
        'nhopialone-or-incomb': [5,10,14,17,19,21,24,27,29,31,33,35,37,38,40,41,43,45,47,48,50,51,52,54,55,56,57,59,60,61,62,63],
        'soralone-or-incomb': [6,11,15,18,20,21,25,28,30,31,34,36,37,39,40,41,44,46,47,49,50,51,53,54,55,56,58,59,60,61,62,63],
    }
    for column, codes in group_codes.items():
        df[column] = np.where(df['CENRACE'].isin(codes), 1, 0)
    return df
def assign_raceincomb(df):
    """Return a copy of `df` with six 0/1 "race in combination" flags.

    Each new column is 1 only when the row's CENRACE code (1-63) is a
    multi-race combination including that group; single-race codes (1-6)
    are excluded, unlike assign_racealone_or_incomb.
    """
    df = df.copy()
    # Multi-race CENRACE codes (7-63) that include each major race group.
    combo_codes = {
        'white-incomb': [7,8,9,10,11,22,23,24,25,26,27,28,29,30,31,42,43,44,45,46,47,48,49,50,51,57,58,59,60,61,63],
        'black-incomb': [7,12,13,14,15,22,23,24,25,32,33,34,35,36,37,42,43,44,45,46,47,52,53,54,55,57,58,59,60,62,63],
        'aian-incomb': [8,12,16,17,18,22,26,27,28,32,33,34,38,39,40,42,43,44,48,49,50,52,53,54,56,57,58,59,61,62,63],
        'asian-incomb': [9,13,16,19,20,23,26,29,30,32,35,36,38,39,41,42,45,46,48,49,51,52,53,55,56,57,58,60,61,62,63],
        'nhopi-incomb': [10,14,17,19,21,24,27,29,31,33,35,37,38,40,41,43,45,47,48,50,51,52,54,55,56,57,59,60,61,62,63],
        'sor-incomb': [11,15,18,20,21,25,28,30,31,34,36,37,39,40,41,44,46,47,49,50,51,53,54,55,56,58,59,60,61,62,63],
    }
    for column, codes in combo_codes.items():
        df[column] = np.where(df['CENRACE'].isin(codes), 1, 0)
    return df
print(f"Reading {MDFPERFILEPATH}")
# Person-level MDF (privacy-protected microdata): one row per person.
# NOTE(review): the national file is parsed with sep = "," while the PR file
# below uses sep = "|" — confirm the two inputs really use different delimiters.
dfmdf = pd.read_table(f"{MDFPERFILEPATH}", sep = ",", comment="#", usecols = ['TABBLKST','TABBLKCOU', 'TABTRACTCE', 'TABBLK', 'TABBLKGRPCE', 'RTYPE', 'RELSHIP', 'GQTYPE','QSEX','QAGE', 'CENHISP','CENRACE'], dtype = { 'SCHEMA_TYPE_CODE': 'object', 'SCHEMA_BUILD_ID': 'object', 'TABBLKST': 'object', 'TABBLKCOU': 'object', 'TABTRACTCE': 'object', 'TABBLKGRPCE': 'object', 'TABBLK': 'object', 'EPNUM': 'int64', 'RTYPE': 'object', 'GQTYPE': 'int64', 'RELSHIP': 'int64', 'QSEX': 'object', 'QAGE': 'int64', 'CENHISP': 'object', 'CENRACE': 'int64', 'CITIZEN': 'object', 'LIVE_ALONE': 'int64' })
if runPRhere and separatePR: # If you want to run PR from this program and it's a separate file, then input it from the MDFPERPRFILEPATH
    dfmdfpr = pd.read_table(f"{MDFPERPRFILEPATH}", sep = "|", comment="#", usecols = ['TABBLKST','TABBLKCOU', 'TABTRACTCE', 'TABBLK', 'TABBLKGRPCE', 'RTYPE', 'RELSHIP', 'GQTYPE','QSEX','QAGE', 'CENHISP','CENRACE'], dtype = { 'SCHEMA_TYPE_CODE': 'object', 'SCHEMA_BUILD_ID': 'object', 'TABBLKST': 'object', 'TABBLKCOU': 'object', 'TABTRACTCE': 'object', 'TABBLKGRPCE': 'object', 'TABBLK': 'object', 'EPNUM': 'int64', 'RTYPE': 'object', 'GQTYPE': 'int64', 'RELSHIP': 'int64', 'QSEX': 'object', 'QAGE': 'int64', 'CENHISP': 'object', 'CENRACE': 'int64', 'CITIZEN': 'object', 'LIVE_ALONE': 'int64' })
else: # If you want to run PR from this program and the data are part of the MDF, subset it from the MDF. (If PR is not available, this will return an empty data frame that will be deleted later.)
    dfmdfpr = dfmdf[dfmdf['TABBLKST'] == '72']
# PR rows are always removed from the stateside frame, whichever branch ran.
dfmdf = dfmdf[dfmdf['TABBLKST'] != "72"]
print("{} MDF Data Loaded OK".format(datetime.now()))
# 15-digit block GEOID = state + county + tract + block.
dfmdf['BlockGEOID'] = dfmdf['TABBLKST'] + dfmdf['TABBLKCOU'] + dfmdf['TABTRACTCE'] + dfmdf['TABBLK']
print("{} MDF Block GEOID Created".format(datetime.now()))
# Attach the non-spine geography GEOIDs (place, AIANNH, MCD, school districts,
# etc.) from the block crosswalk to every person record.
dfmdf = pd.merge(dfmdf, allgeos[['BlockGEOID','IncPlaceGEOID','AIANNHGEOID','MCDGEOID','FedAIRGEOID','OTSAGEOID','ANVSAGEOID','SchDistEGEOID','SchDistSGEOID', 'SchDistUGEOID']], on="BlockGEOID", how="left", validate=mergeValidation)
print("{} MDF Non-Spine GEOIDs Merged In".format(datetime.now()))
# Person-level HDF (the confidential "ground truth" CEF extract).
# NOTE(review): the dtype map includes 'QSPAN', which is not in usecols —
# harmless (pandas ignores dtypes for unread columns) but possibly a leftover.
dfhdf = pd.read_csv(f"{HDFDATADIR}/cef.csv", usecols = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLK', 'TABBLKGRPCE', 'QSEX','QAGE','CENRACE','RTYPE','GQTYPE', 'CENHISP'], dtype = { 'TABBLKST': 'object', 'TABBLKCOU': 'object', 'TABTRACTCE': 'object', 'TABBLK': 'object', 'TABBLKGRPCE': 'object', 'QSEX': 'object', 'QAGE': 'int64', 'QSPAN': 'object', 'CENRACE': 'int64', 'RTYPE': 'object', 'GQTYPE': 'int64', 'CENHISP': 'object' })
# Split Puerto Rico off from the stateside HDF, mirroring the MDF handling.
dfhdfpr = dfhdf[dfhdf['TABBLKST'] == '72']
dfhdf = dfhdf[dfhdf['TABBLKST'] != "72"]
print("{} HDF Data Loaded OK".format(datetime.now()))
# 15-digit block GEOID = state + county + tract + block.
dfhdf['BlockGEOID'] = dfhdf['TABBLKST'] + dfhdf['TABBLKCOU'] + dfhdf['TABTRACTCE'] + dfhdf['TABBLK']
print("{} HDF Block GEOID Created".format(datetime.now()))
# Attach the non-spine geography GEOIDs from the block crosswalk.
dfhdf = pd.merge(dfhdf, allgeos[['BlockGEOID','IncPlaceGEOID','AIANNHGEOID','MCDGEOID','FedAIRGEOID','OTSAGEOID','ANVSAGEOID','SchDistEGEOID','SchDistSGEOID', 'SchDistUGEOID']], on="BlockGEOID", how="left", validate=mergeValidation)
print("{} HDF Non-Spine GEOIDs Merged In".format(datetime.now()))
# RACEALONE: collapse the 63 CENRACE codes to 7 groups (codes > 6 become
# 7 = "Two Or More Races"); applied identically to HDF and MDF.
dfhdf['RACEALONE'] = np.where(dfhdf['CENRACE'] > 6, 7, dfhdf['CENRACE'])
dfmdf['RACEALONE'] = np.where(dfmdf['CENRACE'] > 6, 7, dfmdf['CENRACE'])
# NUMRACES: number of races in the combination, from the CENRACE code blocks
# 1-6 (one race), 7-21 (two), 22-41 (three), 42-56 (four), 57-62 (five), 63 (six).
dfhdf['NUMRACES'] = pd.cut(dfhdf['CENRACE'], [1,7,22,42,57,63,64], include_lowest=True, right=False, labels=['1','2','3','4','5','6'])
dfmdf['NUMRACES'] = pd.cut(dfmdf['CENRACE'], [1,7,22,42,57,63,64], include_lowest=True, right=False, labels=['1','2','3','4','5','6'])
print("{} Created RACEALONE & NUMRACES".format(datetime.now()))
# GQINST: GQTYPE 100-498 institutional, 499-998 noninstitutional;
# non-GQ rows (GQTYPE 0) fall outside the bins and become NaN.
dfhdf['GQINST'] = pd.cut(dfhdf['GQTYPE'], [100,499, 999], include_lowest=True, right=False, labels=gqinstcats)
dfmdf['GQINST'] = pd.cut(dfmdf['GQTYPE'], [100,499, 999], include_lowest=True, right=False, labels=gqinstcats)
# GQMAJORTYPE: major GQ category from the hundreds digit of GQTYPE.
dfhdf['GQMAJORTYPE'] = pd.cut(dfhdf['GQTYPE'], [100,200,300,400,500,600,700,999], include_lowest=True, right=False, labels=gqmajortypecats)
dfmdf['GQMAJORTYPE'] = pd.cut(dfmdf['GQTYPE'], [100,200,300,400,500,600,700,999], include_lowest=True, right=False, labels=gqmajortypecats)
# Five-year age bins 0-109 plus a terminal 110-115 bin.
dfhdf['QAGE_5Y']=pd.cut(dfhdf['QAGE'],list(range(0,111,5)) + [116], include_lowest=True, right=False)
dfmdf['QAGE_5Y']=pd.cut(dfmdf['QAGE'],list(range(0,111,5)) + [116], include_lowest=True, right=False)
print("{} Created QAGE_5Y".format(datetime.now()))
# Single years of age 0-99, then 5-year bins 100-114, then 115.
dfhdf['QAGE_1Y_100']=pd.cut(dfhdf['QAGE'],list(range(0,100,1)) + list(range(100,115,5)) + [116], include_lowest=True, right=False)
dfmdf['QAGE_1Y_100']=pd.cut(dfmdf['QAGE'],list(range(0,100,1)) + list(range(100,115,5)) + [116], include_lowest=True, right=False)
print("{} Created QAGE_1Y_100".format(datetime.now()))
# National age distribution (all persons) by QAGE_1Y_100 bin.
hdfnation_1y_100 = dfhdf.groupby(['QAGE_1Y_100']).size().reset_index(name='HDF_Population')
mdfnation_1y_100 = dfmdf.groupby(['QAGE_1Y_100']).size().reset_index(name='MDF_Population')
nation_1y_100 = pd.merge(hdfnation_1y_100, mdfnation_1y_100, on="QAGE_1Y_100", how="inner")
nation_1y_100['QAGE_1Y_100'] = nation_1y_100['QAGE_1Y_100'].astype("object") ## get around pandas < 1.4.3 bug
nation_1y_100.to_csv(f"{OUTPUTDIR}/nation_1y_100.csv", index=False)
# Same distribution restricted to group-quarters persons (GQTYPE > 0).
hdfnation_gq1y_100 = dfhdf[dfhdf['GQTYPE'] > 0].groupby(['QAGE_1Y_100']).size().reset_index(name='HDF_Population')
mdfnation_gq1y_100 = dfmdf[dfmdf['GQTYPE'] > 0].groupby(['QAGE_1Y_100']).size().reset_index(name='MDF_Population')
nation_gq1y_100 = pd.merge(hdfnation_gq1y_100, mdfnation_gq1y_100, on="QAGE_1Y_100", how="inner")
nation_gq1y_100['QAGE_1Y_100'] = nation_gq1y_100['QAGE_1Y_100'].astype("object") ## get around pandas < 1.4.3 bug
nation_gq1y_100.to_csv(f"{OUTPUTDIR}/nation_gq1y_100.csv", index=False)
# Three broad age groups: 0-17, 18-64, 65-115.
dfhdf['QAGE_3G']= pd.cut(dfhdf['QAGE'],[0,18,65,116], include_lowest=True, right=False)
dfmdf['QAGE_3G']= pd.cut(dfmdf['QAGE'],[0,18,65,116], include_lowest=True, right=False)
print("{} Created QAGE_3G".format(datetime.now()))
if runPRhere:
    # Apply the same race/age recodes to the Puerto Rico HDF and MDF frames.
    dfhdfpr['RACEALONE'] = np.where(dfhdfpr['CENRACE'] > 6, 7, dfhdfpr['CENRACE'])
    dfhdfpr['NUMRACES'] = pd.cut(dfhdfpr['CENRACE'], [1,7,22,42,57,63,64], include_lowest=True, right=False, labels=['1','2','3','4','5','6'])
    # GQ recodes intentionally not produced for PR:
    # dfhdfpr['GQINST']
    # dfhdfpr['GQMAJORTYPE']
    dfhdfpr['QAGE_3G']= pd.cut(dfhdfpr['QAGE'],[0,18,65,116], include_lowest=True, right=False)
    dfhdfpr['QAGE_5Y']=pd.cut(dfhdfpr['QAGE'],list(range(0,111,5)) + [116], include_lowest=True, right=False)
    dfmdfpr['RACEALONE'] = np.where(dfmdfpr['CENRACE'] > 6, 7, dfmdfpr['CENRACE'])
    dfmdfpr['NUMRACES'] = pd.cut(dfmdfpr['CENRACE'], [1,7,22,42,57,63,64], include_lowest=True, right=False, labels=['1','2','3','4','5','6'])
    # dfmdfpr['GQINST']
    # dfmdfpr['GQMAJORTYPE']
    dfmdfpr['QAGE_3G']= pd.cut(dfmdfpr['QAGE'],[0,18,65,116], include_lowest=True, right=False)
    dfmdfpr['QAGE_5Y']=pd.cut(dfmdfpr['QAGE'],list(range(0,111,5)) + [116], include_lowest=True, right=False)
    print("{} Created Puerto Rico Recodes".format(datetime.now()))
else:
    # PR is not analyzed in this run: free the (possibly empty) PR frames.
    print("{} Skipping Puerto Rico Recodes".format(datetime.now()))
    del dfmdfpr
    del dfhdfpr
    print("{} Deleted Puerto Rico Data Files".format(datetime.now()))
# Materialized category lists for the age bins created above.
qage_5y_cats = list(dfhdf['QAGE_5Y'].cat.categories)
qage_3g_cats = list(dfhdf['QAGE_3G'].cat.categories)
# Geography x race-alone MultiIndexes used to reindex tabulations so every
# (geography, race) cell appears even when its population is zero.
state_racealone_index = pd.MultiIndex.from_product([allstates,racealonecats], names=['GEOID','RACEALONE'])
county_racealone_index = pd.MultiIndex.from_product([allcounties,racealonecats],names=['GEOID','RACEALONE'])
place_racealone_index = pd.MultiIndex.from_product([allplaces,racealonecats],names=['IncPlaceGEOID','RACEALONE'])
tract_racealone_index = pd.MultiIndex.from_product([alltracts,racealonecats],names=['GEOID','RACEALONE'])
blockgroup_racealone_index = pd.MultiIndex.from_product([allblockgroups,racealonecats],names=['GEOID','RACEALONE'])
aiannh_racealone_index = pd.MultiIndex.from_product([allaiannh,racealonecats],names=['AIANNHGEOID','RACEALONE'])
print("{} Start Total Population".format(datetime.now()))
# Counties Total Population: person counts per county (state+county GEOID)
# in each file, outer-merged and run through the per-cell stats.
hdfcounties_totalpop = dfhdf.groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU'])
mdfcounties_totalpop = dfmdf.groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU'])
counties_totalpop = pd.merge(hdfcounties_totalpop, mdfcounties_totalpop, on='GEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
counties_totalpop.to_csv(f"{OUTPUTDIR}/counties_totalpop.csv", index=False)
ss = counties_totalpop.pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "Total Population")
# FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the drop-in replacement with identical arguments.
outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Counties Total Population By Size: repeat the summary within HDF-size bins.
counties_totalpop['Total_PopSize'] = pd.cut(counties_totalpop['HDF_Population'], [0,1000,5000,10000,50000,100000,np.inf], include_lowest=True, right=False)
for i in counties_totalpop['Total_PopSize'].cat.categories:
    ss = counties_totalpop[counties_totalpop['Total_PopSize'] == i].pipe(calculate_ss, geography="County", sizecategory = str(i), characteristic = "Total Population")
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
#
# Keep the GEOIDs of the smallest-size counties (< 1000 HDF persons) for
# later small-area tabulations.
smallestpop = counties_totalpop['Total_PopSize'].cat.categories[0]
print(smallestpop)
counties_lt1000 = list(counties_totalpop.loc[counties_totalpop['Total_PopSize'] == smallestpop, 'GEOID'])
print(counties_lt1000)
counties_lt1000index = pd.Index(counties_lt1000, name="GEOID")
# Do not delete counties_totalpop
# Places Total Population: counts per incorporated place, reindexed to the
# full place universe so zero-population places still appear.
hdfplaces_totalpop = dfhdf.groupby(['IncPlaceGEOID']).size().reset_index(name='HDF_Population').set_index('IncPlaceGEOID').reindex(allplacesindex, fill_value=0).reset_index()
mdfplaces_totalpop = dfmdf.groupby(['IncPlaceGEOID']).size().reset_index(name='MDF_Population').set_index('IncPlaceGEOID').reindex(allplacesindex, fill_value=0).reset_index()
places_totalpop = pd.merge(hdfplaces_totalpop, mdfplaces_totalpop, on='IncPlaceGEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
places_totalpop.to_csv(f"{OUTPUTDIR}/places_totalpop.csv", index=False)
ss = places_totalpop.pipe(calculate_ss, geography="Place", sizecategory = "All", characteristic = "Total Population")
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Places Total Population By Size
places_totalpop['Total_PopSize'] = pd.cut(places_totalpop['HDF_Population'], [0,500,1000,5000,10000,50000,100000,np.inf], include_lowest=True, right=False)
for i in places_totalpop['Total_PopSize'].cat.categories:
    ss = places_totalpop[places_totalpop['Total_PopSize'] == i].pipe(calculate_ss, geography="Place", sizecategory = str(i), characteristic = "Total Population")
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
#
# Keep the GEOIDs of the smallest places (< 500 HDF persons) for later use.
smallestpop = places_totalpop['Total_PopSize'].cat.categories[0]
print(smallestpop)
places_lt500 = list(places_totalpop.loc[places_totalpop['Total_PopSize'] == smallestpop, 'IncPlaceGEOID'])
print(len(places_lt500))
places_lt500index = pd.Index(places_lt500, name="IncPlaceGEOID")
# Tracts Total Population: counts per census tract, reindexed to the full
# tract universe so empty tracts still appear with population 0.
hdftracts_totalpop = dfhdf.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsindex, fill_value=0).reset_index()
mdftracts_totalpop = dfmdf.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsindex, fill_value=0).reset_index()
tracts_totalpop = pd.merge(hdftracts_totalpop, mdftracts_totalpop, on='GEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
tracts_totalpop.to_csv(f"{OUTPUTDIR}/tracts_totalpop.csv", index=False)
ss = tracts_totalpop.pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "Total Population")
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Free the tract tables; they are not needed again.
del hdftracts_totalpop
del mdftracts_totalpop
del tracts_totalpop
# Blocks By Rurality: block-level counts, joined to the crosswalk's
# urban/rural flag ('U'/'R') and summarized separately for each.
hdfblocks_totalpop = dfhdf.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE', 'TABBLK']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLK).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLK']).set_index('GEOID').reindex(allblocksindex, fill_value=0).reset_index()
mdfblocks_totalpop = dfmdf.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE', 'TABBLK']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLK).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLK']).set_index('GEOID').reindex(allblocksindex, fill_value=0).reset_index()
blocks_totalpop = pd.merge(hdfblocks_totalpop, mdfblocks_totalpop, on='GEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
blocks_totalpop = pd.merge(blocks_totalpop, allgeos[['BlockGEOID','UR']], left_on="GEOID", right_on = "BlockGEOID", how="outer", validate=mergeValidation)
ss = blocks_totalpop[blocks_totalpop['UR'] == "U"].pipe(calculate_ss, geography="Block", sizecategory = "All", characteristic = "Total Population for Urban Blocks")
# FIX: DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
ss = blocks_totalpop[blocks_totalpop['UR'] == "R"].pipe(calculate_ss, geography="Block", sizecategory = "All", characteristic = "Total Population for Rural Blocks")
outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Free the block tables; block-level frames are large.
del hdfblocks_totalpop
del mdfblocks_totalpop
del blocks_totalpop
if runPRhere:
    # PR Counties/Municipios Total Population, reindexed to all PR municipios.
    hdfcountiespr_totalpop = dfhdfpr.groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index('GEOID').reindex(allcountiesprindex, fill_value=0).reset_index()
    mdfcountiespr_totalpop = dfmdfpr.groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index('GEOID').reindex(allcountiesprindex, fill_value=0).reset_index()
    countiespr_totalpop = pd.merge(hdfcountiespr_totalpop, mdfcountiespr_totalpop, on='GEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    countiespr_totalpop.to_csv(f"{OUTPUTDIR}/countiespr_totalpop.csv", index=False)
    ss = countiespr_totalpop.pipe(calculate_ss, geography="PR County/Municipio", sizecategory = "All", characteristic = "Total Population")
    # FIX: DataFrame.append was removed in pandas 2.0; pd.concat is equivalent.
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    # PR Tracts Total Population, reindexed to all PR tracts.
    hdftractspr_totalpop = dfhdfpr.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsprindex, fill_value=0).reset_index()
    mdftractspr_totalpop = dfmdfpr.groupby(['TABBLKST','TABBLKCOU','TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsprindex, fill_value=0).reset_index()
    tractspr_totalpop = pd.merge(hdftractspr_totalpop, mdftractspr_totalpop, on='GEOID', how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    tractspr_totalpop.to_csv(f"{OUTPUTDIR}/tractspr_totalpop.csv", index=False)
    ss = tractspr_totalpop.pipe(calculate_ss, geography="PR Tract", sizecategory = "All", characteristic = "Total Population")
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# --- School-district total population: elementary (ESD), secondary (SSD),
# unified (USD) ---
# Each district type follows the same recipe: per-district person counts in
# HDF and MDF, reindexed over the full district universe, merged, written to
# CSV, summarized for "All" districts and within HDF-population size bins.
hdfelemschdists_totalpop = (
    dfhdf.groupby(['SchDistEGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('SchDistEGEOID')
    .reindex(allelemschdistsindex, fill_value=0)
    .reset_index()
)
mdfelemschdists_totalpop = (
    dfmdf.groupby(['SchDistEGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('SchDistEGEOID')
    .reindex(allelemschdistsindex, fill_value=0)
    .reset_index()
)
elemschdists_totalpop = pd.merge(
    hdfelemschdists_totalpop, mdfelemschdists_totalpop,
    on='SchDistEGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
elemschdists_totalpop.to_csv(f"{OUTPUTDIR}/elemschdists_totalpop.csv", index=False)
ss = elemschdists_totalpop.pipe(
    calculate_ss, geography="ESD",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# ESD summary stats within HDF-population size bins.
elemschdists_totalpop['Total_PopSize'] = pd.cut(
    elemschdists_totalpop['HDF_Population'],
    [0, 1000, 5000, 10000, 50000, 100000, np.inf],
    include_lowest=True, right=False,
)
for i in elemschdists_totalpop['Total_PopSize'].cat.categories:
    ss = elemschdists_totalpop[elemschdists_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="ESD",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Secondary school districts.
hdfsecschdists_totalpop = (
    dfhdf.groupby(['SchDistSGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('SchDistSGEOID')
    .reindex(allsecschdistsindex, fill_value=0)
    .reset_index()
)
mdfsecschdists_totalpop = (
    dfmdf.groupby(['SchDistSGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('SchDistSGEOID')
    .reindex(allsecschdistsindex, fill_value=0)
    .reset_index()
)
secschdists_totalpop = pd.merge(
    hdfsecschdists_totalpop, mdfsecschdists_totalpop,
    on='SchDistSGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
secschdists_totalpop.to_csv(f"{OUTPUTDIR}/secschdists_totalpop.csv", index=False)
ss = secschdists_totalpop.pipe(
    calculate_ss, geography="SSD",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# SSD summary stats within HDF-population size bins.
secschdists_totalpop['Total_PopSize'] = pd.cut(
    secschdists_totalpop['HDF_Population'],
    [0, 1000, 5000, 10000, 50000, 100000, np.inf],
    include_lowest=True, right=False,
)
for i in secschdists_totalpop['Total_PopSize'].cat.categories:
    ss = secschdists_totalpop[secschdists_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="SSD",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Unified school districts.
hdfunischdists_totalpop = (
    dfhdf.groupby(['SchDistUGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('SchDistUGEOID')
    .reindex(allunischdistsindex, fill_value=0)
    .reset_index()
)
mdfunischdists_totalpop = (
    dfmdf.groupby(['SchDistUGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('SchDistUGEOID')
    .reindex(allunischdistsindex, fill_value=0)
    .reset_index()
)
unischdists_totalpop = pd.merge(
    hdfunischdists_totalpop, mdfunischdists_totalpop,
    on='SchDistUGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
unischdists_totalpop.to_csv(f"{OUTPUTDIR}/unischdists_totalpop.csv", index=False)
ss = unischdists_totalpop.pipe(
    calculate_ss, geography="USD",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# USD summary stats within HDF-population size bins.
unischdists_totalpop['Total_PopSize'] = pd.cut(
    unischdists_totalpop['HDF_Population'],
    [0, 1000, 5000, 10000, 50000, 100000, np.inf],
    include_lowest=True, right=False,
)
for i in unischdists_totalpop['Total_PopSize'].cat.categories:
    ss = unischdists_totalpop[unischdists_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="USD",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# --- Minor civil divisions and federal American Indian reservations:
# total population ---
hdfmcds_totalpop = (
    dfhdf.groupby(['MCDGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('MCDGEOID')
    .reindex(allmcdsindex, fill_value=0)
    .reset_index()
)
mdfmcds_totalpop = (
    dfmdf.groupby(['MCDGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('MCDGEOID')
    .reindex(allmcdsindex, fill_value=0)
    .reset_index()
)
mcds_totalpop = pd.merge(
    hdfmcds_totalpop, mdfmcds_totalpop,
    on='MCDGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
mcds_totalpop.to_csv(f"{OUTPUTDIR}/mcds_totalpop.csv", index=False)
ss = mcds_totalpop.pipe(
    calculate_ss, geography="MCD",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# MCD summary stats within HDF-population size bins.
mcds_totalpop['Total_PopSize'] = pd.cut(
    mcds_totalpop['HDF_Population'],
    [0, 1000, 5000, 10000, 50000, 100000, np.inf],
    include_lowest=True, right=False,
)
for i in mcds_totalpop['Total_PopSize'].cat.categories:
    ss = mcds_totalpop[mcds_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="MCD",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Federal AIRs (note the smaller size bins than the district geographies).
hdffedairs_totalpop = (
    dfhdf.groupby(['FedAIRGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('FedAIRGEOID')
    .reindex(allfedairsindex, fill_value=0)
    .reset_index()
)
mdffedairs_totalpop = (
    dfmdf.groupby(['FedAIRGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('FedAIRGEOID')
    .reindex(allfedairsindex, fill_value=0)
    .reset_index()
)
fedairs_totalpop = pd.merge(
    hdffedairs_totalpop, mdffedairs_totalpop,
    on='FedAIRGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
fedairs_totalpop.to_csv(f"{OUTPUTDIR}/fedairs_totalpop.csv", index=False)
ss = fedairs_totalpop.pipe(
    calculate_ss, geography="Fed AIR",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
fedairs_totalpop['Total_PopSize'] = pd.cut(
    fedairs_totalpop['HDF_Population'],
    [0, 100, 1000, 10000, np.inf],
    include_lowest=True, right=False,
)
for i in fedairs_totalpop['Total_PopSize'].cat.categories:
    ss = fedairs_totalpop[fedairs_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="Fed AIR",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# --- Tribal statistical geographies: OTSA, ANVSA, AIANNH total population ---
# OTSAs get only an "All" summary; ANVSAs and AIANNH areas also get
# HDF-population size-bin summaries (with AIANNH using finer cutpoints).
hdfotsas_totalpop = (
    dfhdf.groupby(['OTSAGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('OTSAGEOID')
    .reindex(allotsasindex, fill_value=0)
    .reset_index()
)
mdfotsas_totalpop = (
    dfmdf.groupby(['OTSAGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('OTSAGEOID')
    .reindex(allotsasindex, fill_value=0)
    .reset_index()
)
otsas_totalpop = pd.merge(
    hdfotsas_totalpop, mdfotsas_totalpop,
    on='OTSAGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
otsas_totalpop.to_csv(f"{OUTPUTDIR}/otsas_totalpop.csv", index=False)
ss = otsas_totalpop.pipe(
    calculate_ss, geography="OTSA",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Alaska Native village statistical areas.
hdfanvsas_totalpop = (
    dfhdf.groupby(['ANVSAGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('ANVSAGEOID')
    .reindex(allanvsasindex, fill_value=0)
    .reset_index()
)
mdfanvsas_totalpop = (
    dfmdf.groupby(['ANVSAGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('ANVSAGEOID')
    .reindex(allanvsasindex, fill_value=0)
    .reset_index()
)
anvsas_totalpop = pd.merge(
    hdfanvsas_totalpop, mdfanvsas_totalpop,
    on='ANVSAGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
anvsas_totalpop.to_csv(f"{OUTPUTDIR}/anvsas_totalpop.csv", index=False)
ss = anvsas_totalpop.pipe(
    calculate_ss, geography="ANVSA",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
anvsas_totalpop['Total_PopSize'] = pd.cut(
    anvsas_totalpop['HDF_Population'],
    [0, 100, 1000, 10000, np.inf],
    include_lowest=True, right=False,
)
for i in anvsas_totalpop['Total_PopSize'].cat.categories:
    ss = anvsas_totalpop[anvsas_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="ANVSA",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# AIANNH areas.
hdfaiannh_totalpop = (
    dfhdf.groupby(['AIANNHGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('AIANNHGEOID')
    .reindex(allaiannhindex, fill_value=0)
    .reset_index()
)
mdfaiannh_totalpop = (
    dfmdf.groupby(['AIANNHGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('AIANNHGEOID')
    .reindex(allaiannhindex, fill_value=0)
    .reset_index()
)
aiannh_totalpop = pd.merge(
    hdfaiannh_totalpop, mdfaiannh_totalpop,
    on='AIANNHGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
aiannh_totalpop.to_csv(f"{OUTPUTDIR}/aiannh_totalpop.csv", index=False)
ss = aiannh_totalpop.pipe(
    calculate_ss, geography="AIANNH Area",
    sizecategory="All", characteristic="Total Population",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
aiannh_totalpop['Total_PopSize'] = pd.cut(
    aiannh_totalpop['HDF_Population'],
    [0, 100, 500, 1000, np.inf],
    include_lowest=True, right=False,
)
for i in aiannh_totalpop['Total_PopSize'].cat.categories:
    ss = aiannh_totalpop[aiannh_totalpop['Total_PopSize'] == i].pipe(
        calculate_ss, geography="AIANNH Area",
        sizecategory=str(i), characteristic="Total Population",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
print(f"{datetime.now()} Total Population Done")
# --- Voting-age (18+) population: county, place, tract ---
# Same pattern as total population but filtered to QAGE >= 18; size-bin
# summaries reuse the Total_PopSize bins computed from the *total*
# population frames so age-18+ stats are grouped by overall area size.
hdfcounties_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index(['GEOID'])
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
mdfcounties_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index(['GEOID'])
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
counties_totalpop18p = pd.merge(
    hdfcounties_totalpop18p, mdfcounties_totalpop18p,
    on=['GEOID'], how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = counties_totalpop18p.pipe(
    calculate_ss, geography="County",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Attach the total-population size bins, then summarize per bin.
counties_totalpop18p = pd.merge(
    counties_totalpop18p, counties_totalpop[['GEOID', 'Total_PopSize']],
    on='GEOID', how='outer', validate=mergeValidation,
)
for i in counties_totalpop18p['Total_PopSize'].cat.categories:
    ss = counties_totalpop18p[counties_totalpop18p['Total_PopSize'] == i].pipe(
        calculate_ss, geography="County",
        sizecategory=str(i), characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Incorporated places, 18+.
hdfplaces_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
mdfplaces_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
places_totalpop18p = pd.merge(
    hdfplaces_totalpop18p, mdfplaces_totalpop18p,
    on='IncPlaceGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = places_totalpop18p.pipe(
    calculate_ss, geography="Place",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
places_totalpop18p = pd.merge(
    places_totalpop18p, places_totalpop[['IncPlaceGEOID', 'Total_PopSize']],
    on='IncPlaceGEOID', how='outer', validate=mergeValidation,
)
for i in places_totalpop18p['Total_PopSize'].cat.categories:
    ss = places_totalpop18p[places_totalpop18p['Total_PopSize'] == i].pipe(
        calculate_ss, geography="Place",
        sizecategory=str(i), characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Tracts, 18+ ("All" summary only — no size bins at tract level).
hdftracts_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
mdftracts_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
tracts_totalpop18p = pd.merge(
    hdftracts_totalpop18p, mdftracts_totalpop18p,
    on='GEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = tracts_totalpop18p.pipe(
    calculate_ss, geography="Tract",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
if runPRhere:
    # --- Puerto Rico voting-age (18+) population: county/municipio, tract ---
    # PR is processed from its own HDF/MDF frames and only when enabled.
    hdfcountiespr_totalpop18p = (
        dfhdfpr[dfhdfpr['QAGE'] >= 18]
        .groupby(['TABBLKST', 'TABBLKCOU'])
        .size()
        .reset_index(name='HDF_Population')
        .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
        .drop(columns=['TABBLKST', 'TABBLKCOU'])
        .set_index('GEOID')
        .reindex(allcountiesprindex, fill_value=0)
        .reset_index()
    )
    mdfcountiespr_totalpop18p = (
        dfmdfpr[dfmdfpr['QAGE'] >= 18]
        .groupby(['TABBLKST', 'TABBLKCOU'])
        .size()
        .reset_index(name='MDF_Population')
        .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
        .drop(columns=['TABBLKST', 'TABBLKCOU'])
        .set_index('GEOID')
        .reindex(allcountiesprindex, fill_value=0)
        .reset_index()
    )
    countiespr_totalpop18p = pd.merge(
        hdfcountiespr_totalpop18p, mdfcountiespr_totalpop18p,
        on='GEOID', how='outer', validate=mergeValidation,
    ).pipe(calculate_stats)
    ss = countiespr_totalpop18p.pipe(
        calculate_ss, geography="PR County/Municipio",
        sizecategory="All", characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
    # PR tracts, 18+.
    hdftractspr_totalpop18p = (
        dfhdfpr[dfhdfpr['QAGE'] >= 18]
        .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
        .size()
        .reset_index(name='HDF_Population')
        .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
        .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
        .set_index('GEOID')
        .reindex(alltractsprindex, fill_value=0)
        .reset_index()
    )
    mdftractspr_totalpop18p = (
        dfmdfpr[dfmdfpr['QAGE'] >= 18]
        .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
        .size()
        .reset_index(name='MDF_Population')
        .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
        .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
        .set_index('GEOID')
        .reindex(alltractsprindex, fill_value=0)
        .reset_index()
    )
    tractspr_totalpop18p = pd.merge(
        hdftractspr_totalpop18p, mdftractspr_totalpop18p,
        on='GEOID', how='outer', validate=mergeValidation,
    ).pipe(calculate_stats)
    ss = tractspr_totalpop18p.pipe(
        calculate_ss, geography="PR Tract",
        sizecategory="All", characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# --- Voting-age (18+) population: MCD, Federal AIR, OTSA, ANVSA ---
# Size-bin summaries (where present) reuse Total_PopSize from the
# corresponding total-population frames.
hdfmcds_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['MCDGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('MCDGEOID')
    .reindex(allmcdsindex, fill_value=0)
    .reset_index()
)
mdfmcds_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['MCDGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('MCDGEOID')
    .reindex(allmcdsindex, fill_value=0)
    .reset_index()
)
mcds_totalpop18p = pd.merge(
    hdfmcds_totalpop18p, mdfmcds_totalpop18p,
    on='MCDGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = mcds_totalpop18p.pipe(
    calculate_ss, geography="MCD",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
mcds_totalpop18p = pd.merge(
    mcds_totalpop18p, mcds_totalpop[['MCDGEOID', 'Total_PopSize']],
    on='MCDGEOID', how='outer', validate=mergeValidation,
)
for i in mcds_totalpop18p['Total_PopSize'].cat.categories:
    ss = mcds_totalpop18p[mcds_totalpop18p['Total_PopSize'] == i].pipe(
        calculate_ss, geography="MCD",
        sizecategory=str(i), characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Federal AIRs, 18+.
hdffedairs_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['FedAIRGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('FedAIRGEOID')
    .reindex(allfedairsindex, fill_value=0)
    .reset_index()
)
mdffedairs_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['FedAIRGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('FedAIRGEOID')
    .reindex(allfedairsindex, fill_value=0)
    .reset_index()
)
fedairs_totalpop18p = pd.merge(
    hdffedairs_totalpop18p, mdffedairs_totalpop18p,
    on='FedAIRGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = fedairs_totalpop18p.pipe(
    calculate_ss, geography="Fed AIR",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
fedairs_totalpop18p = pd.merge(
    fedairs_totalpop18p, fedairs_totalpop[['FedAIRGEOID', 'Total_PopSize']],
    on='FedAIRGEOID', how='outer', validate=mergeValidation,
)
for i in fedairs_totalpop18p['Total_PopSize'].cat.categories:
    ss = fedairs_totalpop18p[fedairs_totalpop18p['Total_PopSize'] == i].pipe(
        calculate_ss, geography="Fed AIR",
        sizecategory=str(i), characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# OTSAs, 18+ ("All" summary only).
hdfotsas_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['OTSAGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('OTSAGEOID')
    .reindex(allotsasindex, fill_value=0)
    .reset_index()
)
mdfotsas_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['OTSAGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('OTSAGEOID')
    .reindex(allotsasindex, fill_value=0)
    .reset_index()
)
otsas_totalpop18p = pd.merge(
    hdfotsas_totalpop18p, mdfotsas_totalpop18p,
    on='OTSAGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = otsas_totalpop18p.pipe(
    calculate_ss, geography="OTSA",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# ANVSAs, 18+.
hdfanvsas_totalpop18p = (
    dfhdf[dfhdf['QAGE'] >= 18]
    .groupby(['ANVSAGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('ANVSAGEOID')
    .reindex(allanvsasindex, fill_value=0)
    .reset_index()
)
mdfanvsas_totalpop18p = (
    dfmdf[dfmdf['QAGE'] >= 18]
    .groupby(['ANVSAGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('ANVSAGEOID')
    .reindex(allanvsasindex, fill_value=0)
    .reset_index()
)
anvsas_totalpop18p = pd.merge(
    hdfanvsas_totalpop18p, mdfanvsas_totalpop18p,
    on='ANVSAGEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = anvsas_totalpop18p.pipe(
    calculate_ss, geography="ANVSA",
    sizecategory="All", characteristic="Total Population Aged 18+",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
anvsas_totalpop18p = pd.merge(
    anvsas_totalpop18p, anvsas_totalpop[['ANVSAGEOID', 'Total_PopSize']],
    on='ANVSAGEOID', how='outer', validate=mergeValidation,
)
for i in anvsas_totalpop18p['Total_PopSize'].cat.categories:
    ss = anvsas_totalpop18p[anvsas_totalpop18p['Total_PopSize'] == i].pipe(
        calculate_ss, geography="ANVSA",
        sizecategory=str(i), characteristic="Total Population Aged 18+",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
print(f"{datetime.now()} Total Population 18+ Done")
# --- Hispanic origin: state and county ---
# CENHISP == '2' means Hispanic; '1' means Not Hispanic. States get "All"
# summaries only; counties also get Hispanic-count size bins.
hdfstates_hisp = (
    dfhdf[dfhdf['CENHISP'] == '2']
    .groupby(['TABBLKST'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST)
    .drop(columns=['TABBLKST'])
    .set_index('GEOID')
    .reindex(allstatesindex, fill_value=0)
    .reset_index()
)
mdfstates_hisp = (
    dfmdf[dfmdf['CENHISP'] == '2']
    .groupby(['TABBLKST'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST)
    .drop(columns=['TABBLKST'])
    .set_index('GEOID')
    .reindex(allstatesindex, fill_value=0)
    .reset_index()
)
states_hisp = pd.merge(
    hdfstates_hisp, mdfstates_hisp,
    on="GEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = states_hisp.pipe(
    calculate_ss, geography="State",
    sizecategory="All", characteristic="Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
hdfstates_nonhisp = (
    dfhdf[dfhdf['CENHISP'] == '1']
    .groupby(['TABBLKST'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST)
    .drop(columns=['TABBLKST'])
    .set_index('GEOID')
    .reindex(allstatesindex, fill_value=0)
    .reset_index()
)
mdfstates_nonhisp = (
    dfmdf[dfmdf['CENHISP'] == '1']
    .groupby(['TABBLKST'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST)
    .drop(columns=['TABBLKST'])
    .set_index('GEOID')
    .reindex(allstatesindex, fill_value=0)
    .reset_index()
)
states_nonhisp = pd.merge(
    hdfstates_nonhisp, mdfstates_nonhisp,
    on="GEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = states_nonhisp.pipe(
    calculate_ss, geography="State",
    sizecategory="All", characteristic="Not Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# County Hispanic / Not Hispanic.
hdfcounties_hisp = (
    dfhdf[dfhdf['CENHISP'] == '2']
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index('GEOID')
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
mdfcounties_hisp = (
    dfmdf[dfmdf['CENHISP'] == '2']
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index('GEOID')
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
counties_hisp = pd.merge(
    hdfcounties_hisp, mdfcounties_hisp,
    on="GEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = counties_hisp.pipe(
    calculate_ss, geography="County",
    sizecategory="All", characteristic="Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
hdfcounties_nonhisp = (
    dfhdf[dfhdf['CENHISP'] == '1']
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index('GEOID')
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
mdfcounties_nonhisp = (
    dfmdf[dfmdf['CENHISP'] == '1']
    .groupby(['TABBLKST', 'TABBLKCOU'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU)
    .drop(columns=['TABBLKST', 'TABBLKCOU'])
    .set_index('GEOID')
    .reindex(allcountiesindex, fill_value=0)
    .reset_index()
)
counties_nonhisp = pd.merge(
    hdfcounties_nonhisp, mdfcounties_nonhisp,
    on="GEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = counties_nonhisp.pipe(
    calculate_ss, geography="County",
    sizecategory="All", characteristic="Not Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# County summaries within Hispanic-count size bins.
counties_hisp = counties_hisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in counties_hisp['Hisp_PopSize'].cat.categories:
    ss = counties_hisp[counties_hisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="County",
        sizecategory=str(i), characteristic="Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
counties_nonhisp = counties_nonhisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in counties_nonhisp['Hisp_PopSize'].cat.categories:
    ss = counties_nonhisp[counties_nonhisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="County",
        sizecategory=str(i), characteristic="Not Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# --- Hispanic origin: incorporated places and tracts ---
hdfplaces_hisp = (
    dfhdf[dfhdf['CENHISP'] == '2']
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
mdfplaces_hisp = (
    dfmdf[dfmdf['CENHISP'] == '2']
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
places_hisp = pd.merge(
    hdfplaces_hisp, mdfplaces_hisp,
    on="IncPlaceGEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = places_hisp.pipe(
    calculate_ss, geography="Place",
    sizecategory="All", characteristic="Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
hdfplaces_nonhisp = (
    dfhdf[dfhdf['CENHISP'] == '1']
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='HDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
mdfplaces_nonhisp = (
    dfmdf[dfmdf['CENHISP'] == '1']
    .groupby(['IncPlaceGEOID'])
    .size()
    .reset_index(name='MDF_Population')
    .set_index('IncPlaceGEOID')
    .reindex(allplacesindex, fill_value=0)
    .reset_index()
)
places_nonhisp = pd.merge(
    hdfplaces_nonhisp, mdfplaces_nonhisp,
    on="IncPlaceGEOID", how="outer", validate=mergeValidation,
).pipe(calculate_stats)
ss = places_nonhisp.pipe(
    calculate_ss, geography="Place",
    sizecategory="All", characteristic="Not Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Place summaries within Hispanic-count size bins.
places_hisp = places_hisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in places_hisp['Hisp_PopSize'].cat.categories:
    ss = places_hisp[places_hisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="Place",
        sizecategory=str(i), characteristic="Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
places_nonhisp = places_nonhisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in places_nonhisp['Hisp_PopSize'].cat.categories:
    ss = places_nonhisp[places_nonhisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="Place",
        sizecategory=str(i), characteristic="Not Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Tract Hispanic / Not Hispanic.
hdftracts_hisp = (
    dfhdf[dfhdf['CENHISP'] == '2']
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
mdftracts_hisp = (
    dfmdf[dfmdf['CENHISP'] == '2']
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
tracts_hisp = pd.merge(
    hdftracts_hisp, mdftracts_hisp,
    on='GEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = tracts_hisp.pipe(
    calculate_ss, geography="Tract",
    sizecategory="All", characteristic="Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
hdftracts_nonhisp = (
    dfhdf[dfhdf['CENHISP'] == '1']
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='HDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
mdftracts_nonhisp = (
    dfmdf[dfmdf['CENHISP'] == '1']
    .groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .size()
    .reset_index(name='MDF_Population')
    .assign(GEOID=lambda g: g.TABBLKST + g.TABBLKCOU + g.TABTRACTCE)
    .drop(columns=['TABBLKST', 'TABBLKCOU', 'TABTRACTCE'])
    .set_index('GEOID')
    .reindex(alltractsindex, fill_value=0)
    .reset_index()
)
tracts_nonhisp = pd.merge(
    hdftracts_nonhisp, mdftracts_nonhisp,
    on='GEOID', how='outer', validate=mergeValidation,
).pipe(calculate_stats)
ss = tracts_nonhisp.pipe(
    calculate_ss, geography="Tract",
    sizecategory="All", characteristic="Not Hispanic",
)
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Tract summaries within Hispanic-count size bins.
tracts_hisp = tracts_hisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in tracts_hisp['Hisp_PopSize'].cat.categories:
    ss = tracts_hisp[tracts_hisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="Tract",
        sizecategory=str(i), characteristic="Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
tracts_nonhisp = tracts_nonhisp.assign(
    Hisp_PopSize=lambda f: pd.cut(
        f['HDF_Population'], [0, 10, 100, np.inf],
        include_lowest=True, right=False,
    )
)
for i in tracts_nonhisp['Hisp_PopSize'].cat.categories:
    ss = tracts_nonhisp[tracts_nonhisp['Hisp_PopSize'] == i].pipe(
        calculate_ss, geography="Tract",
        sizecategory=str(i), characteristic="Not Hispanic",
    )
    outputdf = outputdf.append(ss, ignore_index=True, sort=False)
print(f"{datetime.now()} Hispanic Origin Done")
# State Race Alone
hdfstates_racealone = dfhdf.groupby(['TABBLKST', 'RACEALONE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID', 'RACEALONE']).reindex(state_racealone_index, fill_value=0).reset_index()
mdfstates_racealone = dfmdf.groupby(['TABBLKST', 'RACEALONE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID', 'RACEALONE']).reindex(state_racealone_index, fill_value=0).reset_index()
states_racealone = pd.merge(hdfstates_racealone, mdfstates_racealone, on=['GEOID','RACEALONE'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
for r in racealonecats:
ss = states_racealone[states_racealone['RACEALONE'] == r].pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# County Race Alone: HDF vs MDF county-level counts per race-alone category,
# summarized for all counties and then by HDF population-size bucket
# ([0,10), [10,100), [100,inf)). DataFrame.append was removed in pandas 2.0,
# so rows are accumulated with pd.concat.
hdfcounties_racealone = dfhdf.groupby(['TABBLKST', 'TABBLKCOU', 'RACEALONE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID', 'RACEALONE']).reindex(county_racealone_index, fill_value=0).reset_index()
mdfcounties_racealone = dfmdf.groupby(['TABBLKST', 'TABBLKCOU', 'RACEALONE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID', 'RACEALONE']).reindex(county_racealone_index, fill_value=0).reset_index()
counties_racealone = pd.merge(hdfcounties_racealone, mdfcounties_racealone, on=['GEOID','RACEALONE'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
for r in racealonecats:
    ss = counties_racealone[counties_racealone['RACEALONE'] == r].pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# County Race Alone By Size
for r in racealonecats:
    counties_race = counties_racealone[counties_racealone['RACEALONE'] == r]
    counties_race = counties_race.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in counties_race['Race_PopSize'].cat.categories:
        ss = counties_race[counties_race['Race_PopSize'] == i].pipe(calculate_ss, geography="County", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Place Race Alone
hdfplaces_racealone = dfhdf.groupby(['IncPlaceGEOID', 'RACEALONE']).size().reset_index(name='HDF_Population').set_index(['IncPlaceGEOID', 'RACEALONE']).reindex(place_racealone_index, fill_value=0).reset_index()
mdfplaces_racealone = dfmdf.groupby(['IncPlaceGEOID', 'RACEALONE']).size().reset_index(name='MDF_Population').set_index(['IncPlaceGEOID', 'RACEALONE']).reindex(place_racealone_index, fill_value=0).reset_index()
places_racealone = pd.merge(hdfplaces_racealone, mdfplaces_racealone, on=['IncPlaceGEOID','RACEALONE'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
for r in racealonecats:
ss = places_racealone[places_racealone['RACEALONE'] == r].pipe(calculate_ss, geography="Place", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Place Race Alone By Size
for r in racealonecats:
places_race = places_racealone[places_racealone['RACEALONE'] == r]
places_race = places_race.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
for i in places_race['Race_PopSize'].cat.categories:
temp = places_race[places_race['Race_PopSize'] == i]
# temp.to_csv(f"{OUTPUTDIR}/places_race_race{r}_{i}.csv",index=False)
ss = places_race[places_race['Race_PopSize'] == i].pipe(calculate_ss, geography="Place", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Tract Race Alone
hdftracts_racealone = dfhdf.groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE','RACEALONE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID', 'RACEALONE']).reindex(tract_racealone_index, fill_value=0).reset_index()
mdftracts_racealone = dfmdf.groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE','RACEALONE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID', 'RACEALONE']).reindex(tract_racealone_index, fill_value=0).reset_index()
tracts_racealone = pd.merge(hdftracts_racealone, mdftracts_racealone, on=['GEOID','RACEALONE'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
for r in racealonecats:
ss = tracts_racealone[tracts_racealone['RACEALONE'] == r].pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# Tract Race Alone By Size
for r in racealonecats:
tracts_race = tracts_racealone[tracts_racealone['RACEALONE'] == r]
tracts_race = tracts_race.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
for i in tracts_race['Race_PopSize'].cat.categories:
ss = tracts_race[tracts_race['Race_PopSize'] == i].pipe(calculate_ss, geography="Tract", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
if runPRhere:
    # Puerto Rico Race Alone: per race-alone category, compare HDF vs MDF
    # counts for PR counties/municipios and PR tracts, for all units and by
    # HDF population-size bucket. pd.concat replaces DataFrame.append
    # (removed in pandas 2.0).
    # PR Counties/Municipios Race Alone
    for r in racealonecats:
        hdfcountiespr_racealone = dfhdfpr[dfhdfpr['RACEALONE'] == r].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesprindex, fill_value=0).reset_index()
        mdfcountiespr_racealone = dfmdfpr[dfmdfpr['RACEALONE'] == r].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesprindex, fill_value=0).reset_index()
        countiespr_racealone = pd.merge(hdfcountiespr_racealone, mdfcountiespr_racealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
        ss = countiespr_racealone.pipe(calculate_ss, geography="PR County/Municipio", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
        countiespr_racealone = countiespr_racealone.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
        for i in countiespr_racealone['Race_PopSize'].cat.categories:
            ss = countiespr_racealone[countiespr_racealone['Race_PopSize'] == i].pipe(calculate_ss, geography="PR County/Municipio", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
            outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    # PR Tracts Race Alone
    for r in racealonecats:
        hdftractspr_racealone = dfhdfpr[dfhdfpr['RACEALONE'] == r].groupby(['TABBLKST', 'TABBLKCOU','TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsprindex, fill_value=0).reset_index()
        mdftractspr_racealone = dfmdfpr[dfmdfpr['RACEALONE'] == r].groupby(['TABBLKST', 'TABBLKCOU','TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index('GEOID').reindex(alltractsprindex, fill_value=0).reset_index()
        tractspr_racealone = pd.merge(hdftractspr_racealone, mdftractspr_racealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
        ss = tractspr_racealone.pipe(calculate_ss, geography="PR Tract", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
        tractspr_racealone = tractspr_racealone.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
        for i in tractspr_racealone['Race_PopSize'].cat.categories:
            ss = tractspr_racealone[tractspr_racealone['Race_PopSize'] == i].pipe(calculate_ss, geography="PR Tract", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
            outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Federal AIR Race Alone: per race-alone category, compare HDF vs MDF counts
# for federal American Indian Reservations, for all AIRs and by HDF
# population-size bucket. pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdffedairs_racealone = dfhdf[dfhdf['RACEALONE'] == r].groupby(['FedAIRGEOID']).size().reset_index(name='HDF_Population').set_index('FedAIRGEOID').reindex(allfedairsindex, fill_value=0).reset_index()
    mdffedairs_racealone = dfmdf[dfmdf['RACEALONE'] == r].groupby(['FedAIRGEOID']).size().reset_index(name='MDF_Population').set_index('FedAIRGEOID').reindex(allfedairsindex, fill_value=0).reset_index()
    fedairs_racealone = pd.merge(hdffedairs_racealone, mdffedairs_racealone, on=['FedAIRGEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = fedairs_racealone.pipe(calculate_ss, geography="Fed AIR", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    fedairs_racealone = fedairs_racealone.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in fedairs_racealone['Race_PopSize'].cat.categories:
        ss = fedairs_racealone[fedairs_racealone['Race_PopSize'] == i].pipe(calculate_ss, geography="Fed AIR", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# OTSA Race Alone: per race-alone category, compare HDF vs MDF counts for
# Oklahoma Tribal Statistical Areas, for all OTSAs and by HDF population-size
# bucket. pd.concat replaces DataFrame.append (removed in pandas 2.0).
for r in racealonecats:
    hdfotsas_racealone = dfhdf[dfhdf['RACEALONE'] == r].groupby(['OTSAGEOID']).size().reset_index(name='HDF_Population').set_index('OTSAGEOID').reindex(allotsasindex, fill_value=0).reset_index()
    mdfotsas_racealone = dfmdf[dfmdf['RACEALONE'] == r].groupby(['OTSAGEOID']).size().reset_index(name='MDF_Population').set_index('OTSAGEOID').reindex(allotsasindex, fill_value=0).reset_index()
    otsas_racealone = pd.merge(hdfotsas_racealone, mdfotsas_racealone, on=['OTSAGEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = otsas_racealone.pipe(calculate_ss, geography="OTSA", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    otsas_racealone = otsas_racealone.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in otsas_racealone['Race_PopSize'].cat.categories:
        ss = otsas_racealone[otsas_racealone['Race_PopSize'] == i].pipe(calculate_ss, geography="OTSA", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# ANVSA Race Alone: per race-alone category, compare HDF vs MDF counts for
# Alaska Native Village Statistical Areas, for all ANVSAs and by HDF
# population-size bucket. pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdfanvsas_racealone = dfhdf[dfhdf['RACEALONE'] == r].groupby(['ANVSAGEOID']).size().reset_index(name='HDF_Population').set_index('ANVSAGEOID').reindex(allanvsasindex, fill_value=0).reset_index()
    mdfanvsas_racealone = dfmdf[dfmdf['RACEALONE'] == r].groupby(['ANVSAGEOID']).size().reset_index(name='MDF_Population').set_index('ANVSAGEOID').reindex(allanvsasindex, fill_value=0).reset_index()
    anvsas_racealone = pd.merge(hdfanvsas_racealone, mdfanvsas_racealone, on=['ANVSAGEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = anvsas_racealone.pipe(calculate_ss, geography="ANVSA", sizecategory = "All", characteristic = "{race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    anvsas_racealone = anvsas_racealone.assign(Race_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in anvsas_racealone['Race_PopSize'].cat.categories:
        ss = anvsas_racealone[anvsas_racealone['Race_PopSize'] == i].pipe(calculate_ss, geography="ANVSA", sizecategory = str(i), characteristic = "{race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
print("{} Race Alone Done".format(datetime.now()))
# State Hispanic By Race Alone: per race-alone category, compare HDF vs MDF
# state-level counts split by Hispanic origin (CENHISP '2' = Hispanic,
# '1' = Non-Hispanic). pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdfstates_hispracealone = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    mdfstates_hispracealone = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    states_hispracealone = pd.merge(hdfstates_hispracealone, mdfstates_hispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = states_hispracealone.pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdfstates_nonhispracealone = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    mdfstates_nonhispracealone = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    states_nonhispracealone = pd.merge(hdfstates_nonhispracealone, mdfstates_nonhispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = states_nonhispracealone.pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# County Hispanic By Race Alone: per race-alone category, compare HDF vs MDF
# county-level counts split by Hispanic origin, for all counties and by HDF
# population-size bucket. pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdfcounties_hispracealone = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
    mdfcounties_hispracealone = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
    counties_hispracealone = pd.merge(hdfcounties_hispracealone, mdfcounties_hispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = counties_hispracealone.pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdfcounties_nonhispracealone = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
    mdfcounties_nonhispracealone = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
    counties_nonhispracealone = pd.merge(hdfcounties_nonhispracealone, mdfcounties_nonhispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = counties_nonhispracealone.pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    counties_hispracealone = counties_hispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in counties_hispracealone['HispRace_PopSize'].cat.categories:
        ss = counties_hispracealone[counties_hispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="County", sizecategory = str(i), characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    counties_nonhispracealone = counties_nonhispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in counties_nonhispracealone['HispRace_PopSize'].cat.categories:
        ss = counties_nonhispracealone[counties_nonhispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="County", sizecategory = str(i), characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Place Hispanic By Race Alone: per race-alone category, compare HDF vs MDF
# incorporated-place counts split by Hispanic origin, for all places and by
# HDF population-size bucket. pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdfplaces_hispracealone = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['RACEALONE'] == r)].groupby(['IncPlaceGEOID']).size().reset_index(name='HDF_Population').set_index(['IncPlaceGEOID']).reindex(allplacesindex, fill_value=0).reset_index()
    mdfplaces_hispracealone = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['RACEALONE'] == r)].groupby(['IncPlaceGEOID']).size().reset_index(name='MDF_Population').set_index(['IncPlaceGEOID']).reindex(allplacesindex, fill_value=0).reset_index()
    places_hispracealone = pd.merge(hdfplaces_hispracealone, mdfplaces_hispracealone, on=['IncPlaceGEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = places_hispracealone.pipe(calculate_ss, geography="Place", sizecategory = "All", characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdfplaces_nonhispracealone = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['RACEALONE'] == r)].groupby(['IncPlaceGEOID']).size().reset_index(name='HDF_Population').set_index(['IncPlaceGEOID']).reindex(allplacesindex, fill_value=0).reset_index()
    mdfplaces_nonhispracealone = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['RACEALONE'] == r)].groupby(['IncPlaceGEOID']).size().reset_index(name='MDF_Population').set_index(['IncPlaceGEOID']).reindex(allplacesindex, fill_value=0).reset_index()
    places_nonhispracealone = pd.merge(hdfplaces_nonhispracealone, mdfplaces_nonhispracealone, on=['IncPlaceGEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = places_nonhispracealone.pipe(calculate_ss, geography="Place", sizecategory = "All", characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    places_hispracealone = places_hispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in places_hispracealone['HispRace_PopSize'].cat.categories:
        ss = places_hispracealone[places_hispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Place", sizecategory = str(i), characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    places_nonhispracealone = places_nonhispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in places_nonhispracealone['HispRace_PopSize'].cat.categories:
        ss = places_nonhispracealone[places_nonhispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Place", sizecategory = str(i), characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Tract Hispanic By Race Alone: per race-alone category, compare HDF vs MDF
# tract-level counts split by Hispanic origin, for all tracts and by HDF
# population-size bucket. pd.concat replaces DataFrame.append (removed in
# pandas 2.0).
for r in racealonecats:
    hdftracts_hispracealone = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    mdftracts_hispracealone = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    tracts_hispracealone = pd.merge(hdftracts_hispracealone, mdftracts_hispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = tracts_hispracealone.pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdftracts_nonhispracealone = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    mdftracts_nonhispracealone = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    tracts_nonhispracealone = pd.merge(hdftracts_nonhispracealone, mdftracts_nonhispracealone, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = tracts_nonhispracealone.pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    tracts_hispracealone = tracts_hispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in tracts_hispracealone['HispRace_PopSize'].cat.categories:
        ss = tracts_hispracealone[tracts_hispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Tract", sizecategory = str(i), characteristic = "Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    tracts_nonhispracealone = tracts_nonhispracealone.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in tracts_nonhispracealone['HispRace_PopSize'].cat.categories:
        ss = tracts_nonhispracealone[tracts_nonhispracealone['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Tract", sizecategory = str(i), characteristic = "Non-Hispanic {race}".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Tract Hispanic By Race Alone Aged 18+: same comparison as the tract
# Hispanic-by-race section, restricted to the voting-age (QAGE >= 18)
# population. pd.concat replaces DataFrame.append (removed in pandas 2.0).
for r in racealonecats:
    hdftracts_hispracealone18p = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['QAGE'] >= 18)&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    mdftracts_hispracealone18p = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['QAGE'] >= 18)&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    tracts_hispracealone18p = pd.merge(hdftracts_hispracealone18p, mdftracts_hispracealone18p, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = tracts_hispracealone18p.pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdftracts_nonhispracealone18p = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['QAGE'] >= 18)&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    mdftracts_nonhispracealone18p = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['QAGE'] >= 18)&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE']).set_index(['GEOID']).reindex(alltractsindex, fill_value=0).reset_index()
    tracts_nonhispracealone18p = pd.merge(hdftracts_nonhispracealone18p, mdftracts_nonhispracealone18p, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = tracts_nonhispracealone18p.pipe(calculate_ss, geography="Tract", sizecategory = "All", characteristic = "Non-Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    tracts_hispracealone18p = tracts_hispracealone18p.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in tracts_hispracealone18p['HispRace_PopSize'].cat.categories:
        ss = tracts_hispracealone18p[tracts_hispracealone18p['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Tract", sizecategory = str(i), characteristic = "Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    tracts_nonhispracealone18p = tracts_nonhispracealone18p.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in tracts_nonhispracealone18p['HispRace_PopSize'].cat.categories:
        ss = tracts_nonhispracealone18p[tracts_nonhispracealone18p['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Tract", sizecategory = str(i), characteristic = "Non-Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# Block Group Hispanic By Race Alone Aged 18+: same voting-age comparison at
# block-group level (GEOID = state + county + tract + block group codes).
# pd.concat replaces DataFrame.append (removed in pandas 2.0).
for r in racealonecats:
    hdfblockgroups_hispracealone18p = dfhdf[(dfhdf['CENHISP'] == '2')&(dfhdf['QAGE'] >= 18)&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLKGRPCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).set_index(['GEOID']).reindex(allblockgroupsindex, fill_value=0).reset_index()
    mdfblockgroups_hispracealone18p = dfmdf[(dfmdf['CENHISP'] == '2')&(dfmdf['QAGE'] >= 18)&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLKGRPCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).set_index(['GEOID']).reindex(allblockgroupsindex, fill_value=0).reset_index()
    blockgroups_hispracealone18p = pd.merge(hdfblockgroups_hispracealone18p, mdfblockgroups_hispracealone18p, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = blockgroups_hispracealone18p.pipe(calculate_ss, geography="Block Group", sizecategory = "All", characteristic = "Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    hdfblockgroups_nonhispracealone18p = dfhdf[(dfhdf['CENHISP'] == '1')&(dfhdf['QAGE'] >= 18)&(dfhdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLKGRPCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).set_index(['GEOID']).reindex(allblockgroupsindex, fill_value=0).reset_index()
    mdfblockgroups_nonhispracealone18p = dfmdf[(dfmdf['CENHISP'] == '1')&(dfmdf['QAGE'] >= 18)&(dfmdf['RACEALONE'] == r)].groupby(['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU + x.TABTRACTCE + x.TABBLKGRPCE).drop(columns = ['TABBLKST', 'TABBLKCOU', 'TABTRACTCE', 'TABBLKGRPCE']).set_index(['GEOID']).reindex(allblockgroupsindex, fill_value=0).reset_index()
    blockgroups_nonhispracealone18p = pd.merge(hdfblockgroups_nonhispracealone18p, mdfblockgroups_nonhispracealone18p, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = blockgroups_nonhispracealone18p.pipe(calculate_ss, geography="Block Group", sizecategory = "All", characteristic = "Non-Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    blockgroups_hispracealone18p = blockgroups_hispracealone18p.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in blockgroups_hispracealone18p['HispRace_PopSize'].cat.categories:
        ss = blockgroups_hispracealone18p[blockgroups_hispracealone18p['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Block Group", sizecategory = str(i), characteristic = "Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
    blockgroups_nonhispracealone18p = blockgroups_nonhispracealone18p.assign(HispRace_PopSize = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
    for i in blockgroups_nonhispracealone18p['HispRace_PopSize'].cat.categories:
        ss = blockgroups_nonhispracealone18p[blockgroups_nonhispracealone18p['HispRace_PopSize'] == i].pipe(calculate_ss, geography="Block Group", sizecategory = str(i), characteristic = "Non-Hispanic {race} Aged 18+".format(race = racealonedict.get(r)))
        outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
print("{} Hispanic By Race Alone Done".format(datetime.now()))
# Race Alone Or In Combination: derive alone-or-in-combination indicator
# columns on each person-level frame, then tabulate state-level HDF vs MDF
# counts per race group — overall, Hispanic, and Non-Hispanic. pd.concat
# replaces DataFrame.append (removed in pandas 2.0).
print("{} Starting Hispanic By Race Alone Or In Combination".format(datetime.now()))
dfhdf = dfhdf.pipe(assign_racealone_or_incomb)
dfmdf = dfmdf.pipe(assign_racealone_or_incomb)
if runPRhere:
    dfhdfpr = dfhdfpr.pipe(assign_racealone_or_incomb)
    dfmdfpr = dfmdfpr.pipe(assign_racealone_or_incomb)
racegroups = ['whitealone-or-incomb', 'blackalone-or-incomb', 'aianalone-or-incomb', 'asianalone-or-incomb', 'nhopialone-or-incomb', 'soralone-or-incomb']
# State Race Alone Or In Combination/ Hispanic Race Alone Or In Combination/ Non-Hispanic Race Alone Or In Combination
for rg in racegroups:
    hdfstates_raceincomb = dfhdf[(dfhdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    mdfstates_raceincomb = dfmdf[(dfmdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    states_raceincomb = pd.merge(hdfstates_raceincomb, mdfstates_raceincomb, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = states_raceincomb.pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "{race}".format(race = raceincombdict.get(rg)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
for rg in racegroups:
    hdfstates_hispraceincomb = dfhdf[(dfhdf['CENHISP'] == '2') & (dfhdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    mdfstates_hispraceincomb = dfmdf[(dfmdf['CENHISP'] == '2') & (dfmdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    states_hispraceincomb = pd.merge(hdfstates_hispraceincomb, mdfstates_hispraceincomb, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = states_hispraceincomb.pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "Hispanic {race}".format(race = raceincombdict.get(rg)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
for rg in racegroups:
    hdfstates_nonhispraceincomb = dfhdf[(dfhdf['CENHISP'] == '1') & (dfhdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    mdfstates_nonhispraceincomb = dfmdf[(dfmdf['CENHISP'] == '1') & (dfmdf[rg]==1)].groupby(['TABBLKST']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST).drop(columns = ['TABBLKST']).set_index(['GEOID']).reindex(allstatesindex, fill_value=0).reset_index()
    states_nonhispraceincomb = pd.merge(hdfstates_nonhispraceincomb, mdfstates_nonhispraceincomb, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
    ss = states_nonhispraceincomb.pipe(calculate_ss, geography="State", sizecategory = "All", characteristic = "Non-Hispanic {race}".format(race = raceincombdict.get(rg)))
    outputdf = pd.concat([outputdf, ss], ignore_index=True, sort=False)
# County Race Alone Or In Combination/ Hispanic Race Alone Or In Combination/ Non-Hispanic Race Alone Or In Combination
# County-level pass: GEOID is the state+county concatenation (TABBLKST + TABBLKCOU),
# reindexed over allcountiesindex so counties with zero matching records appear.
for rg in racegroups:
hdfcounties_raceincomb = dfhdf[(dfhdf[rg]==1)].groupby(['TABBLKST','TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
mdfcounties_raceincomb = dfmdf[(dfmdf[rg]==1)].groupby(['TABBLKST','TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
# Merge HDF/MDF county counts and derive comparison statistics.
counties_raceincomb = pd.merge(hdfcounties_raceincomb, mdfcounties_raceincomb, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
# "All" size category first...
ss = counties_raceincomb.pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "{race}".format(race = raceincombdict.get(rg)))
# NOTE(review): DataFrame.append was removed in pandas 2.0 — use pd.concat on upgrade.
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
# ...then break counties into HDF-population size bins [0,10), [10,100), [100,inf)
# (right=False makes bins left-closed) and emit one summary row per bin.
counties_raceincomb = counties_raceincomb.assign(RaceInComb_SizeA = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
for i in counties_raceincomb['RaceInComb_SizeA'].cat.categories:
ss = counties_raceincomb[counties_raceincomb['RaceInComb_SizeA'] == i].pipe(calculate_ss, geography="County", sizecategory = str(i), characteristic = "{race}".format(race = raceincombdict.get(rg)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
for rg in racegroups:
hdfcounties_hispraceincomb = dfhdf[(dfhdf['CENHISP'] == '2') & (dfhdf[rg]==1)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='HDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
mdfcounties_hispraceincomb = dfmdf[(dfmdf['CENHISP'] == '2') & (dfmdf[rg]==1)].groupby(['TABBLKST', 'TABBLKCOU']).size().reset_index(name='MDF_Population').assign(GEOID = lambda x: x.TABBLKST + x.TABBLKCOU).drop(columns = ['TABBLKST', 'TABBLKCOU']).set_index(['GEOID']).reindex(allcountiesindex, fill_value=0).reset_index()
counties_hispraceincomb = pd.merge(hdfcounties_hispraceincomb, mdfcounties_hispraceincomb, on=['GEOID'], how = 'outer', validate = mergeValidation).pipe(calculate_stats)
ss = counties_hispraceincomb.pipe(calculate_ss, geography="County", sizecategory = "All", characteristic = "Hispanic {race}".format(race = raceincombdict.get(rg)))
outputdf = outputdf.append(ss, ignore_index=True, sort=False)
counties_hispraceincomb = counties_hispraceincomb.assign(RaceInComb_SizeA = lambda x: pd.cut(x['HDF_Population'], [0,10,100,np.inf], include_lowest=True, right=False))
for i in counties_hispraceincomb['RaceInComb_SizeA'].cat.categories: