From 7c7ed7b8667231883092379bf2384d1cd387a803 Mon Sep 17 00:00:00 2001
From: Lorenzo Stella <stellalo@amazon.com>
Date: Tue, 5 Nov 2024 13:25:25 +0100
Subject: [PATCH] Fix freq string issues in datasets (#3232)

*Issue #, if available:* fixes #3229. Changes to frequency strings in
pandas broke some of our logic.

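*Description of changes:* Add the missing frequency strings (`min`, `h`,
`ME`, `YE`) to the prediction-length map in `_tsf_datasets.py`, and
replace the old `H`/`T` aliases with `h`/`min` in the other dataset
definitions to get rid of the remaining frequency-related warnings. I
tested the change by running the following script: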

```python
from gluonts.dataset.repository import get_dataset, dataset_names

skip = [
    "m3_monthly",
    "m3_yearly",
    "m3_quarterly",
    "m3_other",
    "m5",
]

for dataset_name in dataset_names:
    if dataset_name in skip:
        continue
    print(dataset_name)
    dataset = get_dataset(dataset_name, regenerate=True)
```


By submitting this pull request, I confirm that you can use, modify,
copy, and redistribute this contribution, under the terms of your
choice.


**Please tag this PR with at least one of these labels to make our
release process faster:** BREAKING, new feature, bug fix, other change,
dev setup
---
 src/gluonts/dataset/repository/_ercot.py          | 2 +-
 src/gluonts/dataset/repository/_gp_copula_2019.py | 6 +++---
 src/gluonts/dataset/repository/_lstnet.py         | 6 +++---
 src/gluonts/dataset/repository/_tsf_datasets.py   | 4 ++++
 src/gluonts/dataset/repository/_tsf_reader.py     | 8 ++++----
 src/gluonts/dataset/repository/_uber_tlc.py       | 2 +-
 test/dataset/test_tsf_reader.py                   | 8 ++++----
 7 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/gluonts/dataset/repository/_ercot.py b/src/gluonts/dataset/repository/_ercot.py
index d013a09c9f..7e144d988e 100644
--- a/src/gluonts/dataset/repository/_ercot.py
+++ b/src/gluonts/dataset/repository/_ercot.py
@@ -26,7 +26,7 @@ def generate_ercot_dataset(dataset_path: Path, dataset_writer: DatasetWriter):
     df.ffill(inplace=True)
     regions = [col for col in df.columns if col not in ["ds", "y"]]
 
-    freq = "1H"
+    freq = "1h"
     prediction_length = 24
 
     start = pd.Period(df["ds"][0], freq=freq)
diff --git a/src/gluonts/dataset/repository/_gp_copula_2019.py b/src/gluonts/dataset/repository/_gp_copula_2019.py
index 88c6becea2..41d5da9ba6 100644
--- a/src/gluonts/dataset/repository/_gp_copula_2019.py
+++ b/src/gluonts/dataset/repository/_gp_copula_2019.py
@@ -63,7 +63,7 @@ class GPCopulaDataset(NamedTuple):
         # original dataset can be found at https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014#
         num_series=370,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
@@ -73,7 +73,7 @@ class GPCopulaDataset(NamedTuple):
         # note there are 963 in the original dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF
         num_series=963,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
@@ -82,7 +82,7 @@ class GPCopulaDataset(NamedTuple):
         url=root + "solar_nips.tar.gz",
         num_series=137,
         prediction_length=24,
-        freq="H",
+        freq="h",
         rolling_evaluations=7,
         max_target_dim=None,
     ),
diff --git a/src/gluonts/dataset/repository/_lstnet.py b/src/gluonts/dataset/repository/_lstnet.py
index e933666c77..0253cffd95 100644
--- a/src/gluonts/dataset/repository/_lstnet.py
+++ b/src/gluonts/dataset/repository/_lstnet.py
@@ -91,7 +91,7 @@ class LstnetDataset(NamedTuple):
         prediction_length=24,
         rolling_evaluations=7,
         start_date="2012-01-01",
-        freq="1H",
+        freq="1h",
         agg_freq=None,
     ),
     "traffic": LstnetDataset(
@@ -105,7 +105,7 @@ class LstnetDataset(NamedTuple):
         prediction_length=24,
         rolling_evaluations=7,
         start_date="2015-01-01",
-        freq="H",
+        freq="h",
         agg_freq=None,
     ),
     "solar-energy": LstnetDataset(
@@ -117,7 +117,7 @@ class LstnetDataset(NamedTuple):
         rolling_evaluations=7,
         start_date="2006-01-01",
         freq="10min",
-        agg_freq="1H",
+        agg_freq="1h",
     ),
 }
 
diff --git a/src/gluonts/dataset/repository/_tsf_datasets.py b/src/gluonts/dataset/repository/_tsf_datasets.py
index ba073cdf4c..b6ff0340b2 100644
--- a/src/gluonts/dataset/repository/_tsf_datasets.py
+++ b/src/gluonts/dataset/repository/_tsf_datasets.py
@@ -278,11 +278,15 @@ def generate_forecasting_dataset(
 def default_prediction_length_from_frequency(freq: str) -> int:
     prediction_length_map = {
         "T": 60,
+        "min": 60,
         "H": 48,
+        "h": 48,
         "D": 30,
         "W-SUN": 8,
         "M": 12,
+        "ME": 12,
         "Y": 4,
+        "YE": 4,
     }
     try:
         freq = to_offset(freq).name
diff --git a/src/gluonts/dataset/repository/_tsf_reader.py b/src/gluonts/dataset/repository/_tsf_reader.py
index 45386864a7..ca87bb27f3 100644
--- a/src/gluonts/dataset/repository/_tsf_reader.py
+++ b/src/gluonts/dataset/repository/_tsf_reader.py
@@ -49,10 +49,10 @@ def frequency_converter(freq: str):
 
 BASE_FREQ_TO_PANDAS_OFFSET: Dict[str, str] = {
     "seconds": "S",
-    "minutely": "T",
-    "minutes": "T",
-    "hourly": "H",
-    "hours": "H",
+    "minutely": "min",
+    "minutes": "min",
+    "hourly": "h",
+    "hours": "h",
     "daily": "D",
     "days": "D",
     "weekly": "W",
diff --git a/src/gluonts/dataset/repository/_uber_tlc.py b/src/gluonts/dataset/repository/_uber_tlc.py
index 73aabd0701..26a086db43 100644
--- a/src/gluonts/dataset/repository/_uber_tlc.py
+++ b/src/gluonts/dataset/repository/_uber_tlc.py
@@ -28,7 +28,7 @@ def generate_uber_dataset(
     prediction_length: int,
     dataset_writer: DatasetWriter,
 ):
-    subsets = {"daily": "1D", "hourly": "1H"}
+    subsets = {"daily": "1D", "hourly": "1h"}
     assert (
         uber_freq.lower() in subsets
     ), f"invalid uber_freq='{uber_freq}'. Allowed values: {subsets.keys()}"
diff --git a/test/dataset/test_tsf_reader.py b/test/dataset/test_tsf_reader.py
index 2261d7ebfc..686b3f4d2c 100644
--- a/test/dataset/test_tsf_reader.py
+++ b/test/dataset/test_tsf_reader.py
@@ -20,10 +20,10 @@
     "input_freq_str, output_freq_str",
     [
         ("30_seconds", "30S"),
-        ("minutely", "T"),
-        ("10_minutes", "10T"),
-        ("hourly", "H"),
-        ("half_hourly", "0.5H"),
+        ("minutely", "min"),
+        ("10_minutes", "10min"),
+        ("hourly", "h"),
+        ("half_hourly", "0.5h"),
         ("daily", "D"),
         ("7_days", "7D"),
         ("weekly", "W"),