-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMerging_Data.py
159 lines (140 loc) · 7.89 KB
/
Merging_Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
import time
import math
import pandas as pd
import geopandas as gpd
from uszipcode import SearchEngine
# for time-keeping purposes
start_time = time.time()
# initialize SearchEngine
search = SearchEngine()
# imports list of Texas Precincts
tx_precincts = pd.read_csv("List_of_Precincts_in_Texas.csv")
# imports shapefile of Texas Precincts and changes crs
tx_precincts_gdf = gpd.read_file("TX_Precincts_Shapefile//VTDs_22G.shp", encoding="utf-8")
tx_precincts_gdf = tx_precincts_gdf.to_crs(epsg=4326)
# imports shapefile of Texas ZIP Codes and changes crs
zip_codes_gdf = gpd.read_file("US_ZIP_Codes_Shapefile//tl_2019_us_zcta510.shp", encoding="utf-8")
zip_codes_gdf = zip_codes_gdf.to_crs(epsg=4326)
# imports 2020 and 2022 Election Results
results_2020 = pd.read_csv("2020_General_Election_Returns.csv")
results_2022 = pd.read_csv("2022_General_Election_Returns.csv")
# imports demographic data for every precinct
tx_pop_data = pd.read_csv("VTDs_22G_Pop.csv")
# imports county data
tx_county_data = pd.read_csv("TX_Counties.csv")
tx_county_data = tx_county_data.set_index("County")
end_time = time.time()
time_so_far = (end_time - start_time) / 60
print("Files Read in", round(time_so_far, 2), "minutes")
# filters election results for only President in 2020 and Governor in 2022
pres_2020_results = results_2020[results_2020["Office"] == 'President']
gov_2022_results = results_2022[results_2022["Office"] == 'Governor']
# capitalizes precincts with letters
pres_2020_results['cntyvtd'] = pres_2020_results["cntyvtd"].map(lambda p: p.upper())
gov_2022_results['cntyvtd'] = gov_2022_results["cntyvtd"].map(lambda p: p.upper())
# finds the geographic center and area of the precinct
tx_precinct_centers_gdf = gpd.GeoDataFrame()
tx_precinct_centers_gdf['CNTYVTD'] = tx_precincts_gdf['CNTYVTD']
tx_precinct_centers_gdf['geometry'] = tx_precincts_gdf.to_crs('+proj=cea').centroid.to_crs(epsg=4326)
tx_precincts["Lat_C"] = tx_precinct_centers_gdf['geometry'].map(lambda p: p.y)
tx_precincts["Long_C"] = tx_precinct_centers_gdf['geometry'].map(lambda p: p.x)
tx_precincts_with_zip_codes = gpd.sjoin(tx_precinct_centers_gdf, zip_codes_gdf[['ZCTA5CE10', 'geometry']], how="inner",
predicate="within")
tx_precincts['ZIP_Code'] = tx_precinct_centers_gdf.merge(tx_precincts_with_zip_codes, how='left')['ZCTA5CE10']
tx_precincts['ZIP_Code'] = tx_precincts['ZIP_Code'].astype(float)
# Lists to add to dataframe
county_region = []
metro_area = []
border_area = []
total_2020_pres_votes = []
total_2022_gov_votes = []
trump_2020_votes = []
biden_2020_votes = []
abbott_2022_votes = []
o_rourke_2022_votes = []
total_vap_2022 = []
white_vap_2022 = []
latino_vap_2022 = []
black_vap_2022 = []
asian_vap_2022 = []
# useful to keep track of time
start_time = time.time()
# pulls data from other datasets and adds it to the lists above
for count, precinct in tx_precincts.iterrows():
vtd = precinct["CNTYVTD"]
precinct_2020_pres_results = pres_2020_results[pres_2020_results["cntyvtd"] == vtd]
precinct_2022_gov_results = gov_2022_results[gov_2022_results["cntyvtd"] == vtd]
# Adds County Data
county = precinct_2020_pres_results.iloc[0]["County"]
county_region.append(tx_county_data.loc[county]["Region"])
metro_area.append(tx_county_data.loc[county]["Metro"])
border_area.append(tx_county_data.loc[county]["Border"])
# Adds Precinct Vote Totals for major party candidates in both elections
total_2020_pres_votes.append(precinct_2020_pres_results["Votes"].sum())
total_2022_gov_votes.append(precinct_2022_gov_results["Votes"].sum())
trump_2020_votes.append(precinct_2020_pres_results[precinct_2020_pres_results["Name"] == "Trump"].iloc[0]["Votes"])
biden_2020_votes.append(precinct_2020_pres_results[precinct_2020_pres_results["Name"] == "Biden"].iloc[0]["Votes"])
abbott_2022_votes.append(precinct_2022_gov_results[precinct_2022_gov_results["Name"] == "Abbott"].iloc[0]["Votes"])
o_rourke_2022_votes.append(
precinct_2022_gov_results[precinct_2022_gov_results["Name"] == "O'Rourke"].iloc[0]["Votes"])
# adds demographic data for the Voting Age Population in the Precinct
pop_data = tx_pop_data[tx_pop_data["CNTYVTD"] == vtd].iloc[0]
total_vap_2022.append(pop_data["vap"])
white_vap_2022.append(pop_data["anglovap"])
latino_vap_2022.append(pop_data["hispvap"])
black_vap_2022.append(pop_data["blackvap"])
asian_vap_2022.append(pop_data["asianvap"])
# for time estimation purposes
if count % 100 == 0:
print(round((count + 1) / 9634 * 100, 1), "% complete")
end_time = time.time()
time_so_far = (end_time - start_time) / 60
time_when_finished = time_so_far / (count + 1) * 9634
print(round(time_when_finished - time_so_far, 2), "minutes left")
print("Completed in", round(time_so_far, 2), "minutes")
# adds new columns from lists in the tx_gdf GeoDataFrame
tx_precincts["Region"] = county_region
tx_precincts["Metro"] = metro_area
tx_precincts["Border"] = border_area
tx_precincts["Total_Votes_2020_Pres"] = total_2020_pres_votes
tx_precincts["Total_Votes_2022_Gov"] = total_2022_gov_votes
tx_precincts["Trump_2020_Votes"] = trump_2020_votes
tx_precincts["Biden_2020_Votes"] = biden_2020_votes
tx_precincts["Abbott_2022_Votes"] = abbott_2022_votes
tx_precincts["O_Rourke_2022_Votes"] = o_rourke_2022_votes
tx_precincts["VAP_2022"] = total_vap_2022
tx_precincts["White_VAP_2022"] = white_vap_2022
tx_precincts["Latino_VAP_2022"] = latino_vap_2022
tx_precincts["Black_VAP_2022"] = black_vap_2022
tx_precincts["Asian_VAP_2022"] = asian_vap_2022
tx_precincts["Biden_2020_Pct"] = tx_precincts["Biden_2020_Votes"] / tx_precincts["Total_Votes_2020_Pres"] * 100
tx_precincts["Trump_2020_Pct"] = tx_precincts["Trump_2020_Votes"] / tx_precincts["Total_Votes_2020_Pres"] * 100
tx_precincts["Abbott_2022_Pct"] = tx_precincts["Abbott_2022_Votes"] / tx_precincts["Total_Votes_2022_Gov"] * 100
tx_precincts["O_Rourke_2022_Pct"] = tx_precincts["O_Rourke_2022_Votes"] / tx_precincts["Total_Votes_2022_Gov"] * 100
tx_precincts["White_VAP_2022_Pct"] = tx_precincts["White_VAP_2022"] / tx_precincts["VAP_2022"] * 100
tx_precincts["Latino_VAP_2022_Pct"] = tx_precincts["Latino_VAP_2022"] / tx_precincts["VAP_2022"] * 100
tx_precincts["Black_VAP_2022_Pct"] = tx_precincts["Black_VAP_2022"] / tx_precincts["VAP_2022"] * 100
tx_precincts["Asian_VAP_2022_Pct"] = tx_precincts["Asian_VAP_2022"] / tx_precincts["VAP_2022"] * 100
tx_precincts["Dem_Margin_2020_Pres_Pct"] = tx_precincts["Biden_2020_Pct"] - tx_precincts["Trump_2020_Pct"]
tx_precincts["Dem_Margin_2022_Gov_Pct"] = tx_precincts["O_Rourke_2022_Pct"] - tx_precincts["Abbott_2022_Pct"]
tx_precincts["Dem_Margin_Gain_Pct"] = tx_precincts["Dem_Margin_2022_Gov_Pct"] - tx_precincts["Dem_Margin_2020_Pres_Pct"]
tx_precincts["Dem_Raw_Vote_Gain"] = tx_precincts["O_Rourke_2022_Votes"] - tx_precincts["Biden_2020_Votes"]
tx_precincts["GOP_Raw_Vote_Gain"] = tx_precincts["Abbott_2022_Votes"] - tx_precincts["Trump_2020_Votes"]
tx_precincts["Dem_Raw_Vote_Margin_Gain"] = tx_precincts["Dem_Raw_Vote_Gain"] - tx_precincts["GOP_Raw_Vote_Gain"]
tx_precincts["Change_in_Turnout"] = (tx_precincts["Total_Votes_2022_Gov"] / tx_precincts["Total_Votes_2020_Pres"] - 1)\
* 100
def zip_code_finder(row):
# if the precinct does not have a ZIP Code from the shapefile or is assigned one outside of Texas, this fixes that
if math.isnan(row['ZIP_Code']) or not 75000 < row['ZIP_Code'] < 80000:
lat = row['Lat_C']
long = row['Long_C']
zip_codes_within_50_mi = search.by_coordinates(lat, long, 50)
for zip_code in zip_codes_within_50_mi:
zip_code_dict = zip_code.to_dict()
if zip_code_dict['state'] == 'TX':
row['ZIP_Code'] = zip_code_dict['zipcode']
break
return row
tx_precincts = tx_precincts.apply(zip_code_finder, axis='columns')
# exports DataFrame to a csv file
tx_precincts.to_csv("Texas_Precinct_Election_Data.csv", index=False, header=True)