From 7c7ed7b8667231883092379bf2384d1cd387a803 Mon Sep 17 00:00:00 2001 From: Lorenzo Stella Date: Tue, 5 Nov 2024 13:25:25 +0100 Subject: [PATCH] Fix freq string issues in datasets (#3232) *Issue #, if available:* fixes #3229: changes to frequency strings in pandas broke some of our logic. *Description of changes:* Add the missing frequency strings to _tsf_datasets.py, and get rid of frequency-related warnings in the other datasets. I tested the change by running the following script: ```python from gluonts.dataset.repository import get_dataset, dataset_names skip = [ "m3_monthly", "m3_yearly", "m3_quarterly", "m3_other", "m5", ] for dataset_name in dataset_names: if dataset_name in skip: continue print(dataset_name) dataset = get_dataset(dataset_name, regenerate=True) ``` By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. **Please tag this PR with at least one of these labels to make our release process faster:** BREAKING, new feature, bug fix, other change, dev setup --- src/gluonts/dataset/repository/_ercot.py | 2 +- src/gluonts/dataset/repository/_gp_copula_2019.py | 6 +++--- src/gluonts/dataset/repository/_lstnet.py | 6 +++--- src/gluonts/dataset/repository/_tsf_datasets.py | 4 ++++ src/gluonts/dataset/repository/_tsf_reader.py | 8 ++++---- src/gluonts/dataset/repository/_uber_tlc.py | 2 +- test/dataset/test_tsf_reader.py | 8 ++++---- 7 files changed, 20 insertions(+), 16 deletions(-) diff --git a/src/gluonts/dataset/repository/_ercot.py b/src/gluonts/dataset/repository/_ercot.py index d013a09c9f..7e144d988e 100644 --- a/src/gluonts/dataset/repository/_ercot.py +++ b/src/gluonts/dataset/repository/_ercot.py @@ -26,7 +26,7 @@ def generate_ercot_dataset(dataset_path: Path, dataset_writer: DatasetWriter): df.ffill(inplace=True) regions = [col for col in df.columns if col not in ["ds", "y"]] - freq = "1H" + freq = "1h" prediction_length = 24 start = pd.Period(df["ds"][0], 
freq=freq) diff --git a/src/gluonts/dataset/repository/_gp_copula_2019.py b/src/gluonts/dataset/repository/_gp_copula_2019.py index 88c6becea2..41d5da9ba6 100644 --- a/src/gluonts/dataset/repository/_gp_copula_2019.py +++ b/src/gluonts/dataset/repository/_gp_copula_2019.py @@ -63,7 +63,7 @@ class GPCopulaDataset(NamedTuple): # original dataset can be found at https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014# num_series=370, prediction_length=24, - freq="H", + freq="h", rolling_evaluations=7, max_target_dim=None, ), @@ -73,7 +73,7 @@ class GPCopulaDataset(NamedTuple): # note there are 963 in the original dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF num_series=963, prediction_length=24, - freq="H", + freq="h", rolling_evaluations=7, max_target_dim=None, ), @@ -82,7 +82,7 @@ class GPCopulaDataset(NamedTuple): url=root + "solar_nips.tar.gz", num_series=137, prediction_length=24, - freq="H", + freq="h", rolling_evaluations=7, max_target_dim=None, ), diff --git a/src/gluonts/dataset/repository/_lstnet.py b/src/gluonts/dataset/repository/_lstnet.py index e933666c77..0253cffd95 100644 --- a/src/gluonts/dataset/repository/_lstnet.py +++ b/src/gluonts/dataset/repository/_lstnet.py @@ -91,7 +91,7 @@ class LstnetDataset(NamedTuple): prediction_length=24, rolling_evaluations=7, start_date="2012-01-01", - freq="1H", + freq="1h", agg_freq=None, ), "traffic": LstnetDataset( @@ -105,7 +105,7 @@ class LstnetDataset(NamedTuple): prediction_length=24, rolling_evaluations=7, start_date="2015-01-01", - freq="H", + freq="h", agg_freq=None, ), "solar-energy": LstnetDataset( @@ -117,7 +117,7 @@ class LstnetDataset(NamedTuple): rolling_evaluations=7, start_date="2006-01-01", freq="10min", - agg_freq="1H", + agg_freq="1h", ), } diff --git a/src/gluonts/dataset/repository/_tsf_datasets.py b/src/gluonts/dataset/repository/_tsf_datasets.py index ba073cdf4c..b6ff0340b2 100644 --- a/src/gluonts/dataset/repository/_tsf_datasets.py +++ 
b/src/gluonts/dataset/repository/_tsf_datasets.py @@ -278,11 +278,15 @@ def generate_forecasting_dataset( def default_prediction_length_from_frequency(freq: str) -> int: prediction_length_map = { "T": 60, + "min": 60, "H": 48, + "h": 48, "D": 30, "W-SUN": 8, "M": 12, + "ME": 12, "Y": 4, + "YE": 4, } try: freq = to_offset(freq).name diff --git a/src/gluonts/dataset/repository/_tsf_reader.py b/src/gluonts/dataset/repository/_tsf_reader.py index 45386864a7..ca87bb27f3 100644 --- a/src/gluonts/dataset/repository/_tsf_reader.py +++ b/src/gluonts/dataset/repository/_tsf_reader.py @@ -49,10 +49,10 @@ def frequency_converter(freq: str): BASE_FREQ_TO_PANDAS_OFFSET: Dict[str, str] = { "seconds": "S", - "minutely": "T", - "minutes": "T", - "hourly": "H", - "hours": "H", + "minutely": "min", + "minutes": "min", + "hourly": "h", + "hours": "h", "daily": "D", "days": "D", "weekly": "W", diff --git a/src/gluonts/dataset/repository/_uber_tlc.py b/src/gluonts/dataset/repository/_uber_tlc.py index 73aabd0701..26a086db43 100644 --- a/src/gluonts/dataset/repository/_uber_tlc.py +++ b/src/gluonts/dataset/repository/_uber_tlc.py @@ -28,7 +28,7 @@ def generate_uber_dataset( prediction_length: int, dataset_writer: DatasetWriter, ): - subsets = {"daily": "1D", "hourly": "1H"} + subsets = {"daily": "1D", "hourly": "1h"} assert ( uber_freq.lower() in subsets ), f"invalid uber_freq='{uber_freq}'. Allowed values: {subsets.keys()}" diff --git a/test/dataset/test_tsf_reader.py b/test/dataset/test_tsf_reader.py index 2261d7ebfc..686b3f4d2c 100644 --- a/test/dataset/test_tsf_reader.py +++ b/test/dataset/test_tsf_reader.py @@ -20,10 +20,10 @@ "input_freq_str, output_freq_str", [ ("30_seconds", "30S"), - ("minutely", "T"), - ("10_minutes", "10T"), - ("hourly", "H"), - ("half_hourly", "0.5H"), + ("minutely", "min"), + ("10_minutes", "10min"), + ("hourly", "h"), + ("half_hourly", "0.5h"), ("daily", "D"), ("7_days", "7D"), ("weekly", "W"),