diff --git a/README.md b/README.md index 99bf3a6..397584a 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,16 @@ Parts of the input data used for the Open Grid Emissions dataset is released by Updated datasets will also be published whenever a new version of the open-grid-emissions repository is released. +### Running the pipeline with early release data +The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.current_early_release_year` must be updated to the current early release year (such that `current_early_release_year` is 1 year greater than `latest_validated_year`). Early release data is typically available from EIA in June/July of the following year, and is integrated into PUDL shortly thereafter. + +In addition, you will need to download and use the pudl nightly build data until the data becomes available through a stable release. To do so, you need to set your `PUDL_BUILD` environment variable to "nightly". You can do this through the command line using `set PUDL_BUILD=nightly` (for Windows), or by adding the following to the `__init__.py` file in `src/oge`: +```python +import os + +os.environ["PUDL_BUILD"] = "nightly" +``` + ## Contribute There are many ways that you can contribute! 
- Tell us how you are using the dataset or python tools diff --git a/notebooks/explore_data/explore_intermediate_outputs.ipynb b/notebooks/explore_data/explore_intermediate_outputs.ipynb index e5960b4..d0b8852 100644 --- a/notebooks/explore_data/explore_intermediate_outputs.ipynb +++ b/notebooks/explore_data/explore_intermediate_outputs.ipynb @@ -110,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -119,9 +119,9 @@ "resolution = \"annual\"\n", "\n", "all_data = []\n", - "for ba in os.listdir(results_folder(f\"2021/power_sector_data/{resolution}/us_units\")):\n", + "for ba in os.listdir(results_folder(f\"2022/power_sector_data/{resolution}/us_units\")):\n", " df = pd.read_csv(\n", - " results_folder(f\"2021/power_sector_data/{resolution}/us_units/{ba}\")\n", + " results_folder(f\"2022/power_sector_data/{resolution}/us_units/{ba}\")\n", " )\n", " df[\"ba_code\"] = ba.split(\".\")[0]\n", " all_data.append(df)\n", @@ -131,6 +131,15 @@ "all_data = all_data.groupby(\"fuel_category\", dropna=False).sum()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "all_data[\"net_generation_mwh\"] / all_data[\"net_generation_mwh\"].sum()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -198,7 +207,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.11.7" }, "orig_nbformat": 4, "vscode": { diff --git a/src/oge/constants.py b/src/oge/constants.py index 685d091..63f3402 100644 --- a/src/oge/constants.py +++ b/src/oge/constants.py @@ -12,6 +12,14 @@ earliest_hourly_data_year = 2019 # latest_validated_year is the most recent year for which OGE data has been published latest_validated_year = 2022 +# current_early_release_year is the year for which non-final (early-release) data +# is available from the EIA. 
This enables running the OGE pipeline for this year +# EIA-860ER data is generally available in June and EIA-923ER data is generally +# available in July of the following year. This should not be updated to the next year +# until ER data is available, so for part of the year, latest_validated_year will equal +# current_early_release_year +# TODO: Change this to 2024 around July 2025 (check PUDL to see when integrated) +current_early_release_year = 2023 # specify the energy_source_codes that are considered clean/carbon-free CLEAN_FUELS = ["SUN", "MWH", "WND", "WAT", "WH", "PUR", "NUC"] diff --git a/src/oge/consumed.py b/src/oge/consumed.py index 6462493..dedc7b4 100644 --- a/src/oge/consumed.py +++ b/src/oge/consumed.py @@ -31,6 +31,7 @@ 2020: ["CPLW", "EEI"], 2021: ["CPLW", "GCPD"], 2022: ["CPLW", "GCPD", "HST"], + 2023: [], # TODO: update when final 2023 data published } # Defined in output_data, written to each BA file diff --git a/src/oge/data_pipeline.py b/src/oge/data_pipeline.py index 88d1747..4be11e1 100644 --- a/src/oge/data_pipeline.py +++ b/src/oge/data_pipeline.py @@ -30,6 +30,7 @@ from oge.constants import ( TIME_RESOLUTIONS, latest_validated_year, + current_early_release_year, earliest_hourly_data_year, ) @@ -94,7 +95,6 @@ def main(args): raise OSError( "Invalid OGE_DATA_STORE environment variable. Should be 'local' or '1'" ) - # 0. Set up directory structure path_prefix = "" if not args.small else "small/" path_prefix += "flat/" if args.flat else "" @@ -142,6 +142,7 @@ def main(args): logger.info("1. 
Downloading data") # PUDL download_data.download_pudl_data(source="aws") + logger.info(f"Using {os.getenv('PUDL_BUILD', default='stable')} PUDL build") # eGRID download_data.download_egrid_files() # EIA-930 @@ -161,7 +162,9 @@ # integrated into pudl download_data.download_raw_eia860(year) # download eia860 from the latest validated year for use in subplant identification - download_data.download_raw_eia860(latest_validated_year) + download_data.download_raw_eia860( + max(latest_validated_year, current_early_release_year) + ) download_data.download_raw_eia923(year) # 2. Identify subplants diff --git a/src/oge/download_data.py b/src/oge/download_data.py index 33d6fa4..c6cc69d 100644 --- a/src/oge/download_data.py +++ b/src/oge/download_data.py @@ -7,8 +7,9 @@ import tarfile import zipfile -from oge.filepaths import downloads_folder, data_folder +from oge.filepaths import downloads_folder, data_folder, get_pudl_build_version from oge.logging_util import get_logger +from oge.constants import current_early_release_year, latest_validated_year logger = get_logger(__name__) @@ -89,7 +90,7 @@ return True -def download_pudl_data(source: str = "aws"): +def download_pudl_data(source: str = "aws", build: str = get_pudl_build_version()): """Downloads the pudl database. OGE currently supports two sources: zenodo and aws (i.e. nightly builds). For more information about data sources see: https://catalystcoop-pudl.readthedocs.io/en/latest/data_access.html#data-access @@ -107,44 +108,62 @@ Args: source (str, optional): where to download pudl from, either 'aws' or 'zenodo'. Defaults to 'aws'. + build (str): whether to download the "stable" or "nightly" build Raises: + ValueError: if `build` is neither 'stable' nor 'nightly'. ValueError: if `source` is neither 'aws' or 'zenodo'. 
""" - os.makedirs(downloads_folder("pudl"), exist_ok=True) + if build not in ["stable", "nightly"]: + raise ValueError(f"pudl build must be 'stable' or 'nightly', not {build}") + os.makedirs(downloads_folder(f"pudl/{build}"), exist_ok=True) if source == "aws": - # define the urls - pudl_db_url = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/stable/pudl.sqlite.gz" - epacems_parquet_url = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/stable/core_epacems__hourly_emissions.parquet" - # download the pudl sqlite database - if not os.path.exists(downloads_folder("pudl/pudl.sqlite")): - output_filepath = downloads_folder("pudl/pudl.sqlite") + if not os.path.exists(downloads_folder(f"pudl/{build}/pudl.sqlite")): + output_filepath = downloads_folder(f"pudl/{build}/pudl.sqlite") + pudl_db_url = f"https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/{build}/pudl.sqlite.zip" download_helper( pudl_db_url, - download_path=output_filepath + ".gz", + download_path=output_filepath + ".zip", output_path=output_filepath, - requires_gzip=True, + requires_unzip=True, should_clean=True, ) + # move the sqlite file from the folder it was extracted into + os.makedirs(downloads_folder(f"pudl/{build}/tmp"), exist_ok=True) + shutil.move( + src=(output_filepath + "/pudl.sqlite"), + dst=downloads_folder(f"pudl/{build}/tmp/pudl.sqlite"), + ) + os.rmdir(output_filepath) + shutil.move( + downloads_folder(f"pudl/{build}/tmp/pudl.sqlite"), output_filepath + ) + os.rmdir(downloads_folder(f"pudl/{build}/tmp")) # add a version file - with open(downloads_folder("pudl/pudl_sqlite_version.txt"), "w+") as v: + with open( + downloads_folder(f"pudl/{build}/pudl_sqlite_version.txt"), "w+" + ) as v: v.write(f"{datetime.date.today()}") else: - with open(downloads_folder("pudl/pudl_sqlite_version.txt"), "r") as f: + with open( + downloads_folder(f"pudl/{build}/pudl_sqlite_version.txt"), "r" + ) as f: existing_version = f.readlines()[0].replace("\n", "") logger.info( f"Using stable build version 
of PUDL sqlite database downloaded {existing_version}" ) + # download the epacems parquet file + epacems_parquet_url = f"https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/{build}/core_epacems__hourly_emissions.parquet" if not os.path.exists( - downloads_folder("pudl/core_epacems__hourly_emissions.parquet") + downloads_folder(f"pudl/{build}/core_epacems__hourly_emissions.parquet") ): # download the epacems parquet output_filepath = downloads_folder( - "pudl/core_epacems__hourly_emissions.parquet" + f"pudl/{build}/core_epacems__hourly_emissions.parquet" ) download_helper( epacems_parquet_url, @@ -152,11 +171,15 @@ def download_pudl_data(source: str = "aws"): ) # add a version file - with open(downloads_folder("pudl/epacems_parquet_version.txt"), "w+") as v: + with open( + downloads_folder(f"pudl/{build}/epacems_parquet_version.txt"), "w+" + ) as v: v.write(f"{datetime.date.today()}") else: - with open(downloads_folder("pudl/epacems_parquet_version.txt"), "r") as f: + with open( + downloads_folder(f"pudl/{build}/epacems_parquet_version.txt"), "r" + ) as f: existing_version = f.readlines()[0].replace("\n", "") logger.info( f"Using stable build version of PUDL epacems parquet file downloaded {existing_version}" @@ -319,7 +342,12 @@ def download_raw_eia923(year: int): download_raw_eia_906_920(year) else: os.makedirs(downloads_folder("eia923"), exist_ok=True) - url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip" + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): + url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}er.zip" + else: + url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip" archive_url = ( f"https://www.eia.gov/electricity/data/eia923/archive/xls/f923_{year}.zip" ) @@ -378,7 +406,12 @@ def download_raw_eia860(year: int): if year < 2005: raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.") os.makedirs(downloads_folder("eia860"), 
exist_ok=True) - url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): + url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}ER.zip" + else: + url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip" archive_url = ( f"https://www.eia.gov/electricity/data/eia860/archive/xls/eia860{year}.zip" ) diff --git a/src/oge/filepaths.py b/src/oge/filepaths.py index fa5cf4a..c40cac4 100644 --- a/src/oge/filepaths.py +++ b/src/oge/filepaths.py @@ -20,6 +20,17 @@ def get_data_store(): return f"s3://open-grid-emissions/open_grid_emissions_data/v{oge_data_version}" +def get_pudl_build_version(): + """get the pudl build version to access""" + build = os.getenv("PUDL_BUILD") + if build is None: + return "stable" + elif build.lower() == "stable": + return "stable" + elif build.lower() == "nightly": + return "nightly" + + def top_folder(rel=""): """Returns a path relative to the top-level repo folder. This will work regardless of where the function is imported or called from. 
@@ -42,6 +53,12 @@ def downloads_folder(rel=""): return os.path.join(data_folder("downloads"), rel).replace("\\", "/") +def pudl_folder(rel=""): + return os.path.join( + downloads_folder(f"pudl/{get_pudl_build_version()}"), rel + ).replace("\\", "/") + + def outputs_folder(rel=""): return os.path.join(data_folder("outputs"), rel).replace("\\", "/") diff --git a/src/oge/helpers.py b/src/oge/helpers.py index 9b2c1bd..0437fdc 100644 --- a/src/oge/helpers.py +++ b/src/oge/helpers.py @@ -6,7 +6,11 @@ from urllib3.exceptions import ReadTimeoutError from oge.column_checks import get_dtypes, apply_dtypes -from oge.constants import earliest_data_year, latest_validated_year +from oge.constants import ( + earliest_data_year, + latest_validated_year, + current_early_release_year, +) from oge.filepaths import reference_table_folder, outputs_folder import oge.load_data as load_data @@ -385,7 +389,7 @@ def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame: generator_dates = load_data.load_pudl_table( "out_eia__yearly_generators", year=earliest_data_year, - end_year=latest_validated_year, + end_year=max(latest_validated_year, current_early_release_year), columns=[ "plant_id_eia", "generator_id", @@ -456,7 +460,7 @@ def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame: generator_capacity = load_data.load_pudl_table( "core_eia860__scd_generators", year=earliest_data_year, - end_year=latest_validated_year, + end_year=max(latest_validated_year, current_early_release_year), columns=[ "plant_id_eia", "generator_id", @@ -687,7 +691,7 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame: columns=["plant_id_eia", "timezone"] + eia860_info, ) plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info( - latest_validated_year + max(latest_validated_year, current_early_release_year) ) complete_plants_entity = plants_entity.merge( plants_entity_from_eia860, diff --git a/src/oge/load_data.py b/src/oge/load_data.py index 
5a266f3..46a85bc 100644 --- a/src/oge/load_data.py +++ b/src/oge/load_data.py @@ -5,7 +5,12 @@ from pathlib import Path from oge.column_checks import get_dtypes, apply_dtypes -from oge.filepaths import downloads_folder, reference_table_folder, outputs_folder +from oge.filepaths import ( + downloads_folder, + reference_table_folder, + outputs_folder, + pudl_folder, +) import oge.validation as validation from oge.logging_util import get_logger from oge.constants import ( @@ -14,12 +19,13 @@ earliest_data_year, earliest_validated_year, latest_validated_year, + current_early_release_year, ) logger = get_logger(__name__) # initialize the pudl_engine -PUDL_ENGINE = sa.create_engine("sqlite:///" + downloads_folder("pudl/pudl.sqlite")) +PUDL_ENGINE = sa.create_engine("sqlite:///" + pudl_folder("pudl.sqlite")) def load_cems_data(year: int) -> pd.DataFrame: @@ -51,7 +57,7 @@ def load_cems_data(year: int) -> pd.DataFrame: # load the CEMS data cems = pd.read_parquet( - downloads_folder("pudl/core_epacems__hourly_emissions.parquet"), + pudl_folder("core_epacems__hourly_emissions.parquet"), filters=[["year", "==", year]], columns=cems_columns, ) @@ -129,9 +135,11 @@ def load_cems_ids() -> pd.DataFrame: # duplicates before concatenating the next year to the dataframe cems_ids = [] # The `constants.earliest_data_year` is 2005 - for year in range(earliest_data_year, latest_validated_year + 1): + for year in range( + earliest_data_year, max(latest_validated_year, current_early_release_year) + 1 + ): cems_id_year = pd.read_parquet( - downloads_folder("pudl/core_epacems__hourly_emissions.parquet"), + pudl_folder("core_epacems__hourly_emissions.parquet"), filters=[["year", "==", year]], columns=["plant_id_epa", "plant_id_eia", "emissions_unit_id_epa"], ).drop_duplicates() @@ -189,7 +197,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: # this avoids using potentially preliminary early-release data complete_gens = complete_gens[ 
(complete_gens["report_date"].dt.year >= earliest_data_year) - & (complete_gens["report_date"].dt.year <= latest_validated_year) + & ( + complete_gens["report_date"].dt.year + <= max(latest_validated_year, current_early_release_year) + ) ] # for any retired gens, forward fill the most recently available unit_id_pudl to @@ -225,7 +236,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: under_construction_status_codes = ["U", "V", "TS"] complete_gens = complete_gens[ ~( - (complete_gens["report_date"].dt.year < latest_validated_year) + ( + complete_gens["report_date"].dt.year + < max(latest_validated_year, current_early_release_year) + ) & ( complete_gens["operational_status_code"].isin( under_construction_status_codes @@ -251,7 +265,10 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: ~( (complete_gens["generator_operating_date"].isna()) & (complete_gens["generator_retirement_date"].isna()) - & (complete_gens["report_date"].dt.year < latest_validated_year) + & ( + complete_gens["report_date"].dt.year + < max(latest_validated_year, current_early_release_year) + ) & (complete_gens["operational_status_code"] != "TS") ) ] @@ -259,7 +276,7 @@ def load_complete_eia_generators_for_subplants() -> pd.DataFrame: #################### # merge into complete_gens and fill missing operating dates with the EIA-860 data generator_data_from_eia860 = load_raw_eia860_generator_dates_and_unit_ids( - latest_validated_year + max(latest_validated_year, current_early_release_year) ) complete_gens = complete_gens.merge( generator_data_from_eia860, @@ -301,9 +318,17 @@ def load_raw_eia860_plant_geographical_info(year: int) -> pd.DataFrame: """ # load geographic information from the raw EIA-860 file to supplement missing # information from pudl + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): + filepath = f"eia860/eia860{year}ER/2___Plant_Y{year}_Early_Release.xlsx" + header_row = 2 + else: + 
filepath = f"eia860/eia860{year}/2___Plant_Y{year}.xlsx" + header_row = 1 plant_geographical_eia860 = pd.read_excel( - downloads_folder(f"eia860/eia860{year}/2___Plant_Y{year}.xlsx"), - header=1, + downloads_folder(filepath), + header=header_row, usecols=[ "Plant Code", "Plant Name", @@ -351,9 +376,17 @@ def load_raw_eia860_generator_dates_and_unit_ids(year: int) -> pd.DataFrame: """ # load operating dates from the raw EIA-860 file to supplement missing operating # dates from pudl + if (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): + filepath = f"eia860/eia860{year}ER/3_1_Generator_Y{year}_Early_Release.xlsx" + header_row = 2 + else: + filepath = f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx" + header_row = 1 generator_op_dates_eia860 = pd.read_excel( - downloads_folder(f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"), - header=1, + downloads_folder(filepath), + header=header_row, sheet_name="Operable", usecols=[ "Plant Code", @@ -383,9 +416,9 @@ def load_raw_eia860_generator_dates_and_unit_ids(year: int) -> pd.DataFrame: # load unit codes for proposed generators proposed_unit_ids_eia860 = ( pd.read_excel( - downloads_folder(f"eia860/eia860{year}/3_1_Generator_Y{year}.xlsx"), + downloads_folder(filepath), sheet_name="Proposed", - header=1, + header=header_row, usecols=["Plant Code", "Generator ID", "Unit Code"], ) .dropna(subset="Unit Code") @@ -447,7 +480,7 @@ def load_cems_gross_generation(start_year: int, end_year: int) -> pd.DataFrame: # load cems data cems = pd.read_parquet( - downloads_folder("pudl/core_epacems__hourly_emissions.parquet"), + pudl_folder("core_epacems__hourly_emissions.parquet"), filters=[["year", ">=", start_year], ["year", "<=", end_year]], columns=cems_columns, ) @@ -1152,47 +1185,56 @@ def load_emissions_controls_eia923(year: int) -> pd.DataFrame: ) if year >= 2012: - # Handle filename changes across years. 
- schedule_8_filename = { - 2012: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2013: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_PartsA-D_EnvData_2013_Final_Revision.xlsx" - ), - 2014: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2015: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2016: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2017: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Envir_Infor_{year}_Final.xlsx" - ), - 2018: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final.xlsx" - ), - 2019: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2020: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2021: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" - ), - 2022: downloads_folder( - f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final.xlsx" - ), - }[year] + if year <= latest_validated_year: + # Handle filename changes across years. 
+ schedule_8_filename = { + 2012: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2013: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_PartsA-D_EnvData_2013_Final_Revision.xlsx" + ), + 2014: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2015: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2016: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2017: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Envir_Infor_{year}_Final.xlsx" + ), + 2018: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final.xlsx" + ), + 2019: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2020: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2021: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final_Revision.xlsx" + ), + 2022: downloads_folder( + f"eia923/f923_{year}/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Final.xlsx" + ), + }[year] + header_row = 4 + elif (year == current_early_release_year) and ( + current_early_release_year != latest_validated_year + ): + schedule_8_filename = downloads_folder( + f"eia923/f923_{year}er/EIA923_Schedule_8_Annual_Environmental_Information_{year}_Early_Release.xlsx" + ) + header_row = 5 emissions_controls_eia923 = pd.read_excel( io=schedule_8_filename, sheet_name="8C Air Emissions Control Info", - header=4, + header=header_row, names=emissions_controls_eia923_names, dtype=get_dtypes(), na_values=".", diff --git a/src/oge/output_data.py 
b/src/oge/output_data.py index e7ba3d3..bbe43e9 100644 --- a/src/oge/output_data.py +++ b/src/oge/output_data.py @@ -14,6 +14,7 @@ earliest_validated_year, earliest_hourly_data_year, latest_validated_year, + current_early_release_year, ) logger = get_logger(__name__) @@ -135,7 +136,7 @@ def zip_results_for_s3(): root_dir=data_folder(f"s3_upload/{year_range}_plant_attributes"), ) shutil.rmtree(data_folder(f"s3_upload/{year_range}_plant_attributes")) - for year in range(2019, latest_validated_year + 1): + for year in range(2019, max(latest_validated_year, current_early_release_year) + 1): for data_type in ["power_sector_data", "carbon_accounting", "plant_data"]: for aggregation in ["hourly", "monthly", "annual"]: for unit in ["metric_units", "us_units"]: diff --git a/src/oge/subplant_identification.py b/src/oge/subplant_identification.py index b5c20dc..2beeef5 100644 --- a/src/oge/subplant_identification.py +++ b/src/oge/subplant_identification.py @@ -5,7 +5,7 @@ import oge.load_data as load_data import oge.validation as validation -from oge.constants import latest_validated_year +from oge.constants import latest_validated_year, current_early_release_year from oge.logging_util import get_logger logger = get_logger(__name__) @@ -33,7 +33,9 @@ def generate_subplant_ids() -> pd.DataFrame: cems_ids = load_data.load_cems_ids() # load the crosswalk and filter it by the data that actually exists in cems - crosswalk = load_data.load_epa_eia_crosswalk(latest_validated_year) + crosswalk = load_data.load_epa_eia_crosswalk( + max(latest_validated_year, current_early_release_year) + ) # filter the crosswalk to drop any units that don't exist in CEMS filtered_crosswalk = epacamd_eia.filter_crosswalk(crosswalk, cems_ids) @@ -161,7 +163,8 @@ def generate_subplant_ids() -> pd.DataFrame: # validate that there are no orphaned combined cycle plant parts in a subplant validation.check_for_orphaned_cc_part_in_subplant( - subplant_crosswalk_complete, latest_validated_year + 
subplant_crosswalk_complete, + max(latest_validated_year, current_early_release_year), ) return subplant_crosswalk_complete diff --git a/src/oge/validation.py b/src/oge/validation.py index c4ad0e6..922ed3e 100644 --- a/src/oge/validation.py +++ b/src/oge/validation.py @@ -12,6 +12,7 @@ CLEAN_FUELS, earliest_data_year, latest_validated_year, + current_early_release_year, ) logger = get_logger(__name__) @@ -40,9 +41,17 @@ def validate_year(year): Input data for {end+1} should be available from the EIA in Fall {end+2} and we will work to validate that the pipeline works with {end+1} data as soon as possible after the data is released. + + If you are looking to run the pipeline with Early Release data, check that + this data is available and integrated into PUDL, then update + `constants.current_early_release_year` ######################################################################### """ - if year < earliest_data_year or year > latest_validated_year: + if (year == current_early_release_year) and (year != latest_validated_year): + logger.warning( + f"Running pipeline with unvalidated Early Release data for {year}" + ) + if year < earliest_data_year or year > current_early_release_year: raise UserWarning(year_warning)