landfills QA/QC updates for gridding #61

Open · wants to merge 1 commit into main
2 changes: 2 additions & 0 deletions gch4i/emis_processing/task_industrial_landfills_emis.py
@@ -404,13 +404,15 @@ def task_get_industrial_landfills_food_beverage_inv_data(
.drop(columns=["tot_ch4_kt", "nr_ch4_kt"])
.rename(columns={"r_ch4_kt": "ghgi_ch4_kt"})
.dropna()
.query("ghgi_ch4_kt > 0")
.reset_index(drop=True)
)

nonreporting_fb_emi_df = (corrected_fb_emi_df
.drop(columns=["tot_ch4_kt", "r_ch4_kt"])
.rename(columns={"nr_ch4_kt": "ghgi_ch4_kt"})
.dropna()
.query("ghgi_ch4_kt > 0")
.reset_index(drop=True)
)

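A quick illustration of the filter these two additions introduce: `.dropna()` clears missing totals, and the new `.query("ghgi_ch4_kt > 0")` drops zero-emission rows before they reach gridding. A minimal sketch with toy values (not from the inventory):

```python
import pandas as pd

# Toy emissions table (hypothetical values).
emi_df = pd.DataFrame({
    "state_code": ["AK", "AL", "AR"],
    "ghgi_ch4_kt": [0.0, 1.2, float("nan")],
})

# .dropna() removes the missing AR total; the added .query() then drops the
# zero-emission AK row so it is never passed to the gridding step.
filtered = (
    emi_df
    .dropna()
    .query("ghgi_ch4_kt > 0")
    .reset_index(drop=True)
)
print(filtered)  # only the AL row survives
```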
1 change: 1 addition & 0 deletions gch4i/emis_processing/task_msw_landfills_emis.py
@@ -216,6 +216,7 @@ def task_msw_landfills_emi(
pd.concat(emi_df_list)
.groupby(["state_code", "year"])["ghgi_ch4_kt"]
.sum()
.reset_index()
.query("ghgi_ch4_kt > 0")
)
# Save the emissions data to the output path
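One caveat in the MSW hunk: a grouped sum returns a pandas Series, which has no `.query()` method, so the filter only works after `.reset_index()` turns the result back into a DataFrame (the order used above). A toy check:

```python
import pandas as pd

# Hypothetical state/year records; AK sums to zero and should be dropped.
emi_df = pd.DataFrame({
    "state_code": ["AK", "AK", "AL"],
    "year": [2020, 2020, 2020],
    "ghgi_ch4_kt": [0.0, 0.0, 3.5],
})

totals = (
    emi_df
    .groupby(["state_code", "year"])["ghgi_ch4_kt"]
    .sum()
    .reset_index()               # Series -> DataFrame, restoring the columns
    .query("ghgi_ch4_kt > 0")    # QA/QC: keep only positive state totals
)
print(totals)  # AK (sum 0.0) is gone; AL remains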
75 changes: 34 additions & 41 deletions gch4i/proxy_processing/task_industrial_landfills_proxy.py
Expand Up @@ -38,8 +38,7 @@
def task_get_reporting_industrial_landfills_pulp_paper_proxy_data(
subpart_tt_path = "https://data.epa.gov/efservice/tt_subpart_ghg_info/pub_dim_facility/ghg_name/=/Methane/CSV",
state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
reporting_pulp_paper_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path
/ "ind_landfills_pp_r_proxy.parquet",
reporting_pulp_paper_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ind_landfills_pp_r_proxy.parquet",
):
"""
Relative emissions and location information for reporting facilities are taken from
@@ -99,7 +98,7 @@ def task_get_reporting_industrial_landfills_pulp_paper_proxy_data(
),
)
.drop(columns=["facility_id", "latitude", "longitude", "city", "zip"])
.loc[:, ["facility_name", "state_code", "geometry", "year", "rel_emi"]]
.loc[:, ["year", "state_code", "geometry", "rel_emi"]]
)

reporting_pulp_paper_gdf.to_parquet(reporting_pulp_paper_proxy_output_path)
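For reference, a minimal sketch of how a reporting-proxy GeoDataFrame like the one above can be assembled, using hypothetical facility records and the same per-state normalization pattern that appears later in this file:

```python
import geopandas as gpd
import pandas as pd

# Hypothetical facilities with resolved coordinates (not real Subpart TT data).
df = pd.DataFrame({
    "year": [2020, 2020],
    "state_code": ["GA", "GA"],
    "latitude": [33.7, 32.1],
    "longitude": [-84.4, -81.1],
    "ch4_kt": [2.0, 6.0],
})

# Normalize within each state so rel_emi sums to 1 per state, guarding
# against division by zero when a state's total is 0.
df["rel_emi"] = df.groupby("state_code")["ch4_kt"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 0
)

gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df["longitude"], df["latitude"], crs=4326),
).loc[:, ["year", "state_code", "geometry", "rel_emi"]]
```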
@@ -112,8 +111,7 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
frs_naics_path = global_data_dir_path / "NATIONAL_NAICS_FILE.CSV",
frs_facility_path = global_data_dir_path / "NATIONAL_FACILITY_FILE.CSV",
mills_online_path: Path = V3_DATA_PATH / "sector/landfills/Mills_OnLine.xlsx",
nonreporting_pulp_paper_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path
/ "ind_landfills_pp_nr_proxy.parquet",
nonreporting_pulp_paper_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ind_landfills_pp_nr_proxy.parquet",
):
"""
The Mills OnLine database of facilities is compared against the Subpart HH
@@ -243,7 +241,7 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(

FRS_facility_locs = frs_naics.merge(frs_main, how='inner', on='registry_id')

#try to match mills database with FRS database, based on state and city -
# try to match mills database with FRS database, based on state and city -
# only need to find locations for where 'ghgrp_match' = 0
mills_locs['FRS_match'] = 0
for ifacility in np.arange(0, len(mills_locs)):
@@ -256,29 +254,24 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
mills_locs.loc[ifacility,'FRS_match'] = 1
elif len(imatch)>1:
FRS_temp = FRS_facility_locs.loc[imatch,:]
new_match = np.where(np.max(FRS_temp['accuracy_value']))[0]
if len(new_match) >0:
mills_locs.loc[ifacility,'lat'] = FRS_facility_locs.loc[imatch[new_match[0]],'latitude']
mills_locs.loc[ifacility,'lon'] = FRS_facility_locs.loc[imatch[new_match[0]],'longitude']
mills_locs.loc[ifacility,'FRS_match'] = 1
# use the location of the first occurrence of the facility with the highest accuracy value
new_match = np.argmax(FRS_temp['accuracy_value'])
mills_locs.loc[ifacility,'lat'] = FRS_facility_locs.loc[imatch[new_match],'latitude']
mills_locs.loc[ifacility,'lon'] = FRS_facility_locs.loc[imatch[new_match],'longitude']
mills_locs.loc[ifacility,'FRS_match'] = 1
else:
continue

print('Not Found:',len(mills_locs)-(np.sum(mills_locs.loc[:,'FRS_match'])+np.sum(mills_locs.loc[:,'ghgrp_match'])), 'of',len(mills_locs))
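The tie-breaking logic in the loop above picks the first candidate with the highest `accuracy_value`. A small sketch of that pattern with made-up candidates:

```python
import numpy as np
import pandas as pd

# Hypothetical FRS candidates that matched one mill on state + city.
FRS_temp = pd.DataFrame(
    {"accuracy_value": [50.0, 90.0, 90.0]}, index=[10, 42, 77]
)

# np.argmax returns the positional index of the *first* maximum, so ties
# resolve to the first occurrence; the original label is recovered from
# the candidate index.
pos = np.argmax(FRS_temp["accuracy_value"].to_numpy())
best_label = FRS_temp.index[pos]  # -> 42
```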

# keep mills that only have FRS locations (mills that are not already covered by
# subpart tt and mills that are not missing locations)
mills_locs = mills_locs.query(
"ghgrp_match == 0 & FRS_match == 1").drop(
columns=["state_name", "county", "city", "grades", "ghgrp_match", "FRS_match"]).rename(
columns={"lat": "latitude", "lon": "longitude"}).dropna()

# add a column to equally allocate unaccounted for GHGI emissions to all non-reporting mills
mills_locs["ch4_kt"] = 1.0
mills_locs = mills_locs.reset_index(drop=True)

nonreporting_pulp_paper_df['rel_emi'] = nonreporting_pulp_paper_df.groupby(["state_code"])['ch4_kt'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0)
nonreporting_pulp_paper_df = nonreporting_pulp_paper_df.drop(columns='ch4_kt')
mills_locs = (mills_locs
.query("ghgrp_match == 0 & FRS_match == 1")
.drop(columns=["state_name", "county", "city", "grades", "ghgrp_match", "FRS_match"])
.rename(columns={"lat": "latitude", "lon": "longitude"})
.dropna()
)
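On the equal-allocation comment above: setting a constant `ch4_kt` of 1.0 and then normalizing per state gives every non-reporting mill an identical share of its state's unaccounted emissions. A toy illustration:

```python
import pandas as pd

# Hypothetical non-reporting mills; the constant ch4_kt combined with the
# per-state transform yields equal weights within each state.
mills = pd.DataFrame({"state_code": ["WI", "WI", "WI", "ME"], "ch4_kt": 1.0})
mills["rel_emi"] = mills.groupby("state_code")["ch4_kt"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 0
)
# WI mills each get 1/3; the single ME mill gets 1.0
```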

nonreporting_pulp_paper_gdf = (
gpd.GeoDataFrame(
@@ -290,7 +283,7 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
),
)
.drop(columns=["facility_id", "latitude", "longitude"])
.loc[:, ["state_code", "geometry", "rel_emi"]]
.loc[:, ["state_code", "geometry"]]
)

nonreporting_pulp_paper_gdf.to_parquet(nonreporting_pulp_paper_proxy_output_path)
@@ -300,8 +293,7 @@ def task_get_nonreporting_industrial_landfills_pulp_paper_proxy_data(
def task_get_reporting_industrial_landfills_food_beverage_proxy_data(
subpart_tt_path = "https://data.epa.gov/efservice/tt_subpart_ghg_info/pub_dim_facility/ghg_name/=/Methane/CSV",
state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
reporting_food_beverage_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path
/ "ind_landfills_fb_r_proxy.parquet",
reporting_food_beverage_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ind_landfills_fb_r_proxy.parquet",
):
"""
Relative emissions and location information for reporting facilities are taken from
@@ -362,21 +354,20 @@ def task_get_reporting_industrial_landfills_food_beverage_proxy_data(
),
)
.drop(columns=["latitude", "longitude", "city", "zip"])
.loc[:, ["facility_id", "facility_name", "state_code", "geometry", "year", "rel_emi"]]
.loc[:, ["year", "state_code", "geometry", "rel_emi"]]
)

reporting_food_beverage_gdf.to_parquet(reporting_food_beverage_proxy_output_path)
return None


def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
subpart_tt_path = "https://data.epa.gov/efservice/tt_subpart_ghg_info/pub_dim_facility/ghg_name/=/Methane/CSV",
frs_naics_path = global_data_dir_path / "NATIONAL_NAICS_FILE.CSV",
frs_facility_path = global_data_dir_path / "NATIONAL_FACILITY_FILE.CSV",
food_manufacturers_processors_path = V3_DATA_PATH / "sector/landfills/Food Manufacturers and Processors.xlsx",
nonreporting_food_beverage_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path
/ "ind_landfills_fb_nr_proxy.parquet",
state_path: Path = global_data_dir_path / "tl_2020_us_state.zip",
subpart_tt_path = "https://data.epa.gov/efservice/tt_subpart_ghg_info/pub_dim_facility/ghg_name/=/Methane/CSV",
frs_naics_path = global_data_dir_path / "NATIONAL_NAICS_FILE.CSV",
frs_facility_path = global_data_dir_path / "NATIONAL_FACILITY_FILE.CSV",
food_manufacturers_processors_path = V3_DATA_PATH / "sector/landfills/Food Manufacturers and Processors.xlsx",
nonreporting_food_beverage_proxy_output_path: Annotated[Path, Product] = proxy_data_dir_path / "ind_landfills_fb_nr_proxy.parquet",
):
"""

@@ -547,24 +538,26 @@ def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
food_beverage_facilities_locs.loc[ifacility,'FRS_match'] = 1
elif len(imatch)>1:
FRS_temp = FRS_facility_locs.loc[imatch,:]
new_match = np.where(np.max(FRS_temp['accuracy_value']))[0]
if len(new_match) >0:
food_beverage_facilities_locs.loc[ifacility,'lat'] = FRS_facility_locs.loc[imatch[new_match[0]],'latitude']
food_beverage_facilities_locs.loc[ifacility,'lon'] = FRS_facility_locs.loc[imatch[new_match[0]],'longitude']
food_beverage_facilities_locs.loc[ifacility,'FRS_match'] = 1
# use the location of the first occurrence of the facility with the highest accuracy value
new_match = np.argmax(FRS_temp['accuracy_value'])
food_beverage_facilities_locs.loc[ifacility,'lat'] = FRS_facility_locs.loc[imatch[new_match],'latitude']
food_beverage_facilities_locs.loc[ifacility,'lon'] = FRS_facility_locs.loc[imatch[new_match],'longitude']
food_beverage_facilities_locs.loc[ifacility,'FRS_match'] = 1
else:
continue

print('Not Found:',len(food_beverage_facilities_locs)-(np.sum(food_beverage_facilities_locs.loc[:,'FRS_match'])+np.sum(food_beverage_facilities_locs.loc[:,'ghgrp_match'])), 'of',len(food_beverage_facilities_locs))

# Find missing locations by Geocoding addresses
# If this portion of the code fails due to a timeout error (1), just re-run it and it will work.
# This section takes ~152 minutes to run.
geolocator = Nominatim(user_agent="myGeocode")
geopy.geocoders.options.default_timeout = None
print(geolocator.timeout)

food_beverage_facilities_locs['geo_match'] = 0
for ifacility in np.arange(0,len(food_beverage_facilities_locs)):
if food_beverage_facilities_locs.loc[ifacility,'FRS_match'] ==0 and food_beverage_facilities_locs.loc[ifacility,'ghgrp_match'] == 0:
if food_beverage_facilities_locs.loc[ifacility,'FRS_match'] == 0 and food_beverage_facilities_locs.loc[ifacility,'ghgrp_match'] == 0:
location = geolocator.geocode(food_beverage_facilities_locs['full_address'][ifacility])
if location is None:
continue
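The geocoding fallback above hinges on Nominatim returning None for unresolvable addresses. A minimal sketch of that guard (the helper name is illustrative, not from the PR):

```python
from geopy.geocoders import Nominatim

geolocator = Nominatim(user_agent="myGeocode")

def geocode_or_none(address: str):
    """Return (lat, lon) for an address, or None when Nominatim finds nothing."""
    # Nominatim returns None when an address cannot be resolved, so every
    # lookup must be None-checked before its coordinates are used.
    location = geolocator.geocode(address)
    if location is None:
        return None
    return location.latitude, location.longitude
```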
@@ -602,7 +595,7 @@ def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
#count -= 1
food_beverage_facilities_locs.loc[ifacility,'lat'] = location2.latitude
food_beverage_facilities_locs.loc[ifacility,'lon'] = location2.longitude
food_beverage_facilities_locs.loc[ifacility,'geo_match']=1
food_beverage_facilities_locs.loc[ifacility,'geo_match'] = 1
else:
#count -= 1
food_beverage_facilities_locs.loc[ifacility,'lat'] = location.latitude
@@ -631,7 +624,7 @@ def task_get_nonreporting_industrial_landfills_food_beverage_proxy_data(
"excessfood_tonyear_lowest", "excessfood_tonyear_highest",
"full_address", "partial_address", "lat", "lon",
"ghgrp_match", "FRS_match", "geo_match"])
.loc[:, ["facility_id", "state_code", "geometry", "rel_emi"]]
.loc[:, ["state_code", "geometry", "rel_emi"]]
)

nonreporting_food_beverage_gdf.to_parquet(nonreporting_food_beverage_proxy_output_path)
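Finally, since the output above keeps `rel_emi` while dropping the excess-food tonnage columns, the weights are presumably derived from those columns beforehand. A hedged sketch (averaging the lowest/highest bounds is an assumption, not confirmed by this diff):

```python
import pandas as pd

# Hypothetical facilities; averaging the tonnage bounds is illustrative only.
fb = pd.DataFrame({
    "state_code": ["CA", "CA"],
    "excessfood_tonyear_lowest": [100.0, 300.0],
    "excessfood_tonyear_highest": [200.0, 500.0],
})
fb["tons"] = fb[
    ["excessfood_tonyear_lowest", "excessfood_tonyear_highest"]
].mean(axis=1)
fb["rel_emi"] = fb.groupby("state_code")["tons"].transform(
    lambda x: x / x.sum() if x.sum() > 0 else 0
)
```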