Skip to content

Commit

Permalink
Merge pull request #390 from singularity-energy/greg/2023
Browse files Browse the repository at this point in the history
Enable running OGE pipeline with Early Release data and PUDL nightly builds
  • Loading branch information
grgmiller authored Sep 20, 2024
2 parents 8d4dc3e + 7648084 commit e8bfef7
Show file tree
Hide file tree
Showing 12 changed files with 227 additions and 87 deletions.
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,16 @@ Parts of the input data used for the Open Grid Emissions dataset is released by

Updated datasets will also be published whenever a new version of the open-grid-emissions repository is released.

### Running the pipeline with early release data
The OGE pipeline can be used to generate data using Early Release EIA data as soon as it is integrated into the PUDL nightly builds. In order to do that, `constants.current_early_release_year` must be updated to the current early release year (such that `current_early_release_year` is 1 year greater than `latest_validated_year`). Early release data is typically available from EIA in June/July of the following year, and is integrated into PUDL shortly thereafter.

In addition, you will need to download and use the PUDL nightly build data until the data becomes available through a stable release. To do so, set your `PUDL_BUILD` environment variable to "nightly". You can do this through the command line using `set PUDL_BUILD=nightly` (Windows) or `export PUDL_BUILD=nightly` (macOS/Linux), or by adding the following to the `__init__.py` file in `src/oge`:
```python
import os

os.environ["PUDL_BUILD"] = "nightly"
```

## Contribute
There are many ways that you can contribute!
- Tell us how you are using the dataset or python tools
Expand Down
17 changes: 13 additions & 4 deletions notebooks/explore_data/explore_intermediate_outputs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -119,9 +119,9 @@
"resolution = \"annual\"\n",
"\n",
"all_data = []\n",
"for ba in os.listdir(results_folder(f\"2021/power_sector_data/{resolution}/us_units\")):\n",
"for ba in os.listdir(results_folder(f\"2022/power_sector_data/{resolution}/us_units\")):\n",
" df = pd.read_csv(\n",
" results_folder(f\"2021/power_sector_data/{resolution}/us_units/{ba}\")\n",
" results_folder(f\"2022/power_sector_data/{resolution}/us_units/{ba}\")\n",
" )\n",
" df[\"ba_code\"] = ba.split(\".\")[0]\n",
" all_data.append(df)\n",
Expand All @@ -131,6 +131,15 @@
"all_data = all_data.groupby(\"fuel_category\", dropna=False).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_data[\"net_generation_mwh\"] / all_data[\"net_generation_mwh\"].sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -198,7 +207,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
"version": "3.11.7"
},
"orig_nbformat": 4,
"vscode": {
Expand Down
8 changes: 8 additions & 0 deletions src/oge/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
earliest_hourly_data_year = 2019
# latest_validated_year is the most recent year for which OGE data has been published
latest_validated_year = 2022
# current_early_release_year is the year for which non-final (early-release) data
# is available from the EIA. This enables running the OGE pipeline for this year.
# EIA-860ER data is generally available in June and EIA-923ER data is generally
# available in July of the following year. This should not be updated to the next year
# until ER data is available, so for part of the year, latest_validated_year will equal
# current_early_release_year
# TODO: Change this to 2024 around July 2025 (check PUDL to see when integrated)
current_early_release_year = 2023

# specify the energy_source_codes that are considered clean/carbon-free
CLEAN_FUELS = ["SUN", "MWH", "WND", "WAT", "WH", "PUR", "NUC"]
Expand Down
1 change: 1 addition & 0 deletions src/oge/consumed.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
2020: ["CPLW", "EEI"],
2021: ["CPLW", "GCPD"],
2022: ["CPLW", "GCPD", "HST"],
2023: [], # TODO: update when final 2023 data published
}

# Defined in output_data, written to each BA file
Expand Down
7 changes: 5 additions & 2 deletions src/oge/data_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
from oge.constants import (
TIME_RESOLUTIONS,
latest_validated_year,
current_early_release_year,
earliest_hourly_data_year,
)

Expand Down Expand Up @@ -94,7 +95,6 @@ def main(args):
raise OSError(
"Invalid OGE_DATA_STORE environment variable. Should be 'local' or '1'"
)

# 0. Set up directory structure
path_prefix = "" if not args.small else "small/"
path_prefix += "flat/" if args.flat else ""
Expand Down Expand Up @@ -142,6 +142,7 @@ def main(args):
logger.info("1. Downloading data")
# PUDL
download_data.download_pudl_data(source="aws")
logger.info(f"Using {os.getenv('PUDL_BUILD', default="stable")} PUDL build")
# eGRID
download_data.download_egrid_files()
# EIA-930
Expand All @@ -161,7 +162,9 @@ def main(args):
# integrated into pudl
download_data.download_raw_eia860(year)
# download eia860 from the latest validated year for use in subplant identification
download_data.download_raw_eia860(latest_validated_year)
download_data.download_raw_eia860(
max(latest_validated_year, current_early_release_year)
)
download_data.download_raw_eia923(year)

# 2. Identify subplants
Expand Down
71 changes: 52 additions & 19 deletions src/oge/download_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@
import tarfile
import zipfile

from oge.filepaths import downloads_folder, data_folder
from oge.filepaths import downloads_folder, data_folder, get_pudl_build_version
from oge.logging_util import get_logger
from oge.constants import current_early_release_year, latest_validated_year

logger = get_logger(__name__)

Expand Down Expand Up @@ -89,7 +90,7 @@ def download_helper(
return True


def download_pudl_data(source: str = "aws"):
def download_pudl_data(source: str = "aws", build: str = get_pudl_build_version()):
"""Downloads the pudl database. OGE currently supports two sources: zenodo and aws
(i.e. nightly builds). For more information about data sources see:
https://catalystcoop-pudl.readthedocs.io/en/latest/data_access.html#data-access
Expand All @@ -107,56 +108,78 @@ def download_pudl_data(source: str = "aws"):
Args:
source (str, optional): where to download pudl from, either 'aws' or 'zenodo'.
Defaults to 'aws'.
build (str): whether to download the "stable" or "nightly" build
Raises:
ValueError: if `build` is neither 'stable' or 'nightly'.
ValueError: if `source` is neither 'aws' or 'zenodo'.
"""
os.makedirs(downloads_folder("pudl"), exist_ok=True)
if build not in ["stable", "nightly"]:
raise ValueError(f"pudl build must be 'stable' or 'nightly', not {build}")
os.makedirs(downloads_folder(f"pudl/{build}"), exist_ok=True)

if source == "aws":
# define the urls
pudl_db_url = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/stable/pudl.sqlite.gz"
epacems_parquet_url = "https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/stable/core_epacems__hourly_emissions.parquet"

# download the pudl sqlite database
if not os.path.exists(downloads_folder("pudl/pudl.sqlite")):
output_filepath = downloads_folder("pudl/pudl.sqlite")
if not os.path.exists(downloads_folder(f"pudl/{build}/pudl.sqlite")):
output_filepath = downloads_folder(f"pudl/{build}/pudl.sqlite")
pudl_db_url = f"https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/{build}/pudl.sqlite.zip"
download_helper(
pudl_db_url,
download_path=output_filepath + ".gz",
download_path=output_filepath + ".zip",
output_path=output_filepath,
requires_gzip=True,
requires_unzip=True,
should_clean=True,
)
# move the sqlite file from the folder it was extracted into
os.makedirs(downloads_folder(f"pudl/{build}/tmp"), exist_ok=True)
shutil.move(
src=(output_filepath + "/pudl.sqlite"),
dst=downloads_folder(f"pudl/{build}/tmp/pudl.sqlite"),
)
os.rmdir(output_filepath)
shutil.move(
downloads_folder(f"pudl/{build}/tmp/pudl.sqlite"), output_filepath
)
os.rmdir(downloads_folder(f"pudl/{build}/tmp"))

# add a version file
with open(downloads_folder("pudl/pudl_sqlite_version.txt"), "w+") as v:
with open(
downloads_folder(f"pudl/{build}/pudl_sqlite_version.txt"), "w+"
) as v:
v.write(f"{datetime.date.today()}")
else:
with open(downloads_folder("pudl/pudl_sqlite_version.txt"), "r") as f:
with open(
downloads_folder(f"pudl/{build}/pudl_sqlite_version.txt"), "r"
) as f:
existing_version = f.readlines()[0].replace("\n", "")
logger.info(
f"Using stable build version of PUDL sqlite database downloaded {existing_version}"
)

# download the epacems parquet file
epacems_parquet_url = f"https://s3.us-west-2.amazonaws.com/pudl.catalyst.coop/{build}/core_epacems__hourly_emissions.parquet"
if not os.path.exists(
downloads_folder("pudl/core_epacems__hourly_emissions.parquet")
downloads_folder(f"pudl/{build}/core_epacems__hourly_emissions.parquet")
):
# download the epacems parquet
output_filepath = downloads_folder(
"pudl/core_epacems__hourly_emissions.parquet"
f"pudl/{build}/core_epacems__hourly_emissions.parquet"
)
download_helper(
epacems_parquet_url,
download_path=output_filepath,
)

# add a version file
with open(downloads_folder("pudl/epacems_parquet_version.txt"), "w+") as v:
with open(
downloads_folder(f"pudl/{build}/epacems_parquet_version.txt"), "w+"
) as v:
v.write(f"{datetime.date.today()}")

else:
with open(downloads_folder("pudl/epacems_parquet_version.txt"), "r") as f:
with open(
downloads_folder(f"pudl/{build}/epacems_parquet_version.txt"), "r"
) as f:
existing_version = f.readlines()[0].replace("\n", "")
logger.info(
f"Using stable build version of PUDL epacems parquet file downloaded {existing_version}"
Expand Down Expand Up @@ -319,7 +342,12 @@ def download_raw_eia923(year: int):
download_raw_eia_906_920(year)
else:
os.makedirs(downloads_folder("eia923"), exist_ok=True)
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}er.zip"
else:
url = f"https://www.eia.gov/electricity/data/eia923/xls/f923_{year}.zip"
archive_url = (
f"https://www.eia.gov/electricity/data/eia923/archive/xls/f923_{year}.zip"
)
Expand Down Expand Up @@ -378,7 +406,12 @@ def download_raw_eia860(year: int):
if year < 2005:
raise NotImplementedError(f"We haven't tested EIA-860 for '{year}'.")
os.makedirs(downloads_folder("eia860"), exist_ok=True)
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
if (year == current_early_release_year) and (
current_early_release_year != latest_validated_year
):
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}ER.zip"
else:
url = f"https://www.eia.gov/electricity/data/eia860/xls/eia860{year}.zip"
archive_url = (
f"https://www.eia.gov/electricity/data/eia860/archive/xls/eia860{year}.zip"
)
Expand Down
17 changes: 17 additions & 0 deletions src/oge/filepaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,17 @@ def get_data_store():
return f"s3://open-grid-emissions/open_grid_emissions_data/v{oge_data_version}"


def get_pudl_build_version():
    """Return the PUDL build selected by the `PUDL_BUILD` environment variable.

    The value is compared case-insensitively and surrounding whitespace is
    ignored.

    Returns:
        str: "stable" when `PUDL_BUILD` is unset or empty, otherwise "stable" or
            "nightly" as requested by the variable.

    Raises:
        ValueError: if `PUDL_BUILD` is set to anything other than "stable" or
            "nightly".
    """
    build = os.getenv("PUDL_BUILD")
    if build is None:
        return "stable"

    normalized = build.strip().lower()
    if not normalized:
        # an empty variable is treated the same as an unset one
        return "stable"
    if normalized in ("stable", "nightly"):
        return normalized

    # Fail loudly instead of implicitly returning None, which would otherwise
    # be formatted into file paths and URLs as the literal string "None".
    raise ValueError(
        f"PUDL_BUILD environment variable must be 'stable' or 'nightly', not '{build}'"
    )


def top_folder(rel=""):
"""Returns a path relative to the top-level repo folder. This will work regardless
of where the function is imported or called from.
Expand All @@ -42,6 +53,12 @@ def downloads_folder(rel=""):
return os.path.join(data_folder("downloads"), rel).replace("\\", "/")


def pudl_folder(rel=""):
    """Returns a path relative to the downloads subfolder for the active PUDL build.

    The subfolder is `pudl/<build>`, where `<build>` is the value returned by
    `get_pudl_build_version()`. Backslashes in the result are replaced with
    forward slashes.
    """
    build_dir = downloads_folder(f"pudl/{get_pudl_build_version()}")
    joined = os.path.join(build_dir, rel)
    return joined.replace("\\", "/")


def outputs_folder(rel=""):
return os.path.join(data_folder("outputs"), rel).replace("\\", "/")

Expand Down
12 changes: 8 additions & 4 deletions src/oge/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,11 @@
from urllib3.exceptions import ReadTimeoutError

from oge.column_checks import get_dtypes, apply_dtypes
from oge.constants import earliest_data_year, latest_validated_year
from oge.constants import (
earliest_data_year,
latest_validated_year,
current_early_release_year,
)
from oge.filepaths import reference_table_folder, outputs_folder

import oge.load_data as load_data
Expand Down Expand Up @@ -385,7 +389,7 @@ def add_plant_operating_and_retirement_dates(df: pd.DataFrame) -> pd.DataFrame:
generator_dates = load_data.load_pudl_table(
"out_eia__yearly_generators",
year=earliest_data_year,
end_year=latest_validated_year,
end_year=max(latest_validated_year, current_early_release_year),
columns=[
"plant_id_eia",
"generator_id",
Expand Down Expand Up @@ -456,7 +460,7 @@ def add_plant_nameplate_capacity(year: int, df: pd.DataFrame) -> pd.DataFrame:
generator_capacity = load_data.load_pudl_table(
"core_eia860__scd_generators",
year=earliest_data_year,
end_year=latest_validated_year,
end_year=max(latest_validated_year, current_early_release_year),
columns=[
"plant_id_eia",
"generator_id",
Expand Down Expand Up @@ -687,7 +691,7 @@ def add_plant_entity(df: pd.DataFrame) -> pd.DataFrame:
columns=["plant_id_eia", "timezone"] + eia860_info,
)
plants_entity_from_eia860 = load_data.load_raw_eia860_plant_geographical_info(
latest_validated_year
max(latest_validated_year, current_early_release_year)
)
complete_plants_entity = plants_entity.merge(
plants_entity_from_eia860,
Expand Down
Loading

0 comments on commit e8bfef7

Please sign in to comment.