From 6ec1e193bca4c71229198d3c73b2873803aa41cd Mon Sep 17 00:00:00 2001
From: David Almeida <58078834+dc-almeida@users.noreply.github.com>
Date: Thu, 30 Jan 2025 16:32:38 +0100
Subject: [PATCH] Refactor multiple warning levels for same data validation
 filter (#461)

* Refactor multiple warning levels for same data validation filter

* Coerce legacy format criteria into new DataValidatorCriteriaMultiple class

* Stop validation after higher severity fail; update tests

* Remove debugging print

Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>

* Fix warning level skipping; add third variant to test

* Apply suggested changes

* Remove ErrorCollector

---------

Co-authored-by: David Almeida <almeida@iiasa.ac.at>
Co-authored-by: Daniel Huppmann <dh@dergelbesalon.at>
---
 nomenclature/processor/data_validator.py      | 95 +++++++++++++++----
 .../validate_warning_joined.yaml              | 12 +++
 .../validate_warning_joined_asc.yaml          | 12 +++
 ...ning.yaml => validate_warning_legacy.yaml} |  6 +-
 tests/test_cli.py                             |  2 +-
 tests/test_config.py                          |  1 -
 tests/test_validate_data.py                   | 59 +++++++++---
 7 files changed, 150 insertions(+), 37 deletions(-)
 create mode 100644 tests/data/validation/validate_data/validate_warning_joined.yaml
 create mode 100644 tests/data/validation/validate_data/validate_warning_joined_asc.yaml
 rename tests/data/validation/validate_data/{validate_warning.yaml => validate_warning_legacy.yaml} (100%)

diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index debf0e1a..dda02f9c 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 
 import yaml
+from pandas import concat
 from pyam import IamDataFrame
 from pyam.logging import adjust_log_level
 from pydantic import computed_field, field_validator, model_validator
@@ -18,10 +19,10 @@
 
 
 class WarningEnum(str, Enum):
+    error = "error"
     high = "high"
     medium = "medium"
     low = "low"
-    error = "error"
 
 
 class DataValidationCriteria(IamcDataFilter):
@@ -47,6 +48,7 @@ def lower_bound(self) -> float:
 
     @property
     def validation_args(self):
+        """Attributes used for validation (as bounds)."""
         return self.model_dump(
             exclude_none=True,
             exclude_unset=True,
@@ -55,6 +57,7 @@ def validation_args(self):
 
     @property
     def criteria(self):
+        """Attributes used for validation (as specified in the file)."""
         return self.model_dump(
             exclude_none=True,
             exclude_unset=True,
@@ -79,23 +82,46 @@ def validation_args(self):
     @property
     def criteria(self):
         return self.model_dump(
-            exclude_none=True,
-            exclude_unset=True,
-            exclude=["warning_level"],
+            exclude_none=True, exclude_unset=True, exclude=["warning_level"]
+        )
+
+
+class DataValidationCriteriaMultiple(IamcDataFilter):
+    validation: (
+        list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None
+    ) = None
+
+    @model_validator(mode="after")
+    def check_warnings_order(self):
+        """Check if warnings are set in descending order of severity."""
+        if self.validation != sorted(self.validation, key=lambda c: c.warning_level):
+            raise ValueError(
+                f"Validation criteria for {self.criteria} not"
+                " in descending order of severity."
+            )
+        else:
+            return self
+
+    @property
+    def criteria(self):
+        """Attributes used for validation (as specified in the file)."""
+        return self.model_dump(
+            exclude_none=True, exclude_unset=True, exclude=["validation"]
         )
 
 
 class DataValidator(Processor):
     """Processor for validating IAMC datapoints"""
 
-    criteria_items: list[DataValidationCriteriaBounds | DataValidationCriteriaValue]
+    criteria_items: list[DataValidationCriteriaMultiple]
     file: Path
 
     @field_validator("criteria_items", mode="before")
     def check_criteria(cls, v):
-        for criterion in v:
-            has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
-            has_values = any(c in criterion for c in ["value", "atol", "rtol"])
+        for item in v:
+            for criterion in item["validation"]:
+                has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
+                has_values = any(c in criterion for c in ["value", "atol", "rtol"])
             if has_bounds and has_values:
                 raise ValueError(
                     f"Cannot use bounds and value-criteria simultaneously: {criterion}"
@@ -106,7 +132,21 @@ def check_criteria(cls, v):
     def from_file(cls, file: Path | str) -> "DataValidator":
         with open(file, "r", encoding="utf-8") as f:
             content = yaml.safe_load(f)
-        return cls(file=file, criteria_items=content)
+        criteria_items = []
+        for item in content:
+            filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields}
+            criteria_args = {
+                k: item[k]
+                for k in item
+                if k not in IamcDataFilter.model_fields and k != "validation"
+            }
+            if "validation" in item:
+                for criterion in item["validation"]:
+                    criterion.update(filter_args)
+            else:
+                item["validation"] = [{**filter_args, **criteria_args}]
+            criteria_items.append({k: item[k] for k in item if k not in criteria_args})
+        return cls(file=file, criteria_items=criteria_items)
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
         fail_list = []
@@ -114,18 +154,33 @@ def apply(self, df: IamDataFrame) -> IamDataFrame:
 
         with adjust_log_level():
             for item in self.criteria_items:
-                failed_validation = df.validate(**item.validation_args)
-                if failed_validation is not None:
-                    criteria_msg = "  Criteria: " + ", ".join(
-                        [f"{key}: {value}" for key, value in item.criteria.items()]
-                    )
-                    failed_validation["warning_level"] = item.warning_level.value
-                    if item.warning_level == WarningEnum.error:
-                        error = True
-                    fail_list.append(criteria_msg)
-                    fail_list.append(
-                        textwrap.indent(str(failed_validation), prefix="    ") + "\n"
+                per_item_df = df
+                for criterion in item.validation:
+                    failed_validation = per_item_df.validate(
+                        **criterion.validation_args
                     )
+                    if failed_validation is not None:
+                        per_item_df = IamDataFrame(
+                            concat([df.data, failed_validation]).drop_duplicates(
+                                keep=False
+                            )
+                        )
+                        criteria_msg = "  Criteria: " + ", ".join(
+                            [
+                                f"{key}: {value}"
+                                for key, value in criterion.criteria.items()
+                            ]
+                        )
+                        failed_validation["warning_level"] = (
+                            criterion.warning_level.value
+                        )
+                        if criterion.warning_level == WarningEnum.error:
+                            error = True
+                        fail_list.append(criteria_msg)
+                        fail_list.append(
+                            textwrap.indent(str(failed_validation), prefix="    ")
+                            + "\n"
+                        )
             fail_msg = "(file %s):\n" % get_relative_path(self.file)
             if error:
                 fail_msg = (
diff --git a/tests/data/validation/validate_data/validate_warning_joined.yaml b/tests/data/validation/validate_data/validate_warning_joined.yaml
new file mode 100644
index 00000000..5ff92b3d
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_warning_joined.yaml
@@ -0,0 +1,12 @@
+ - variable: Primary Energy
+   year: 2010
+   validation:
+    - upper_bound: 5
+      lower_bound: 1
+    - warning_level: low
+      upper_bound: 2.5
+      lower_bound: 1
+ - variable: Primary Energy|Coal
+   year: 2010
+   upper_bound: 5
+   lower_bound: 1
diff --git a/tests/data/validation/validate_data/validate_warning_joined_asc.yaml b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml
new file mode 100644
index 00000000..21e484ab
--- /dev/null
+++ b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml
@@ -0,0 +1,12 @@
+ - variable: Primary Energy
+   year: 2010
+   validation:
+    - warning_level: low
+      upper_bound: 2.5
+      lower_bound: 1
+    - upper_bound: 5
+      lower_bound: 1
+ - variable: Primary Energy|Coal
+   year: 2010
+   upper_bound: 5
+   lower_bound: 1
diff --git a/tests/data/validation/validate_data/validate_warning.yaml b/tests/data/validation/validate_data/validate_warning_legacy.yaml
similarity index 100%
rename from tests/data/validation/validate_data/validate_warning.yaml
rename to tests/data/validation/validate_data/validate_warning_legacy.yaml
index 3482305a..4a4692c8 100644
--- a/tests/data/validation/validate_data/validate_warning.yaml
+++ b/tests/data/validation/validate_data/validate_warning_legacy.yaml
@@ -1,12 +1,12 @@
  - variable: Primary Energy
    year: 2010
-   upper_bound: 2.5
+   upper_bound: 5
    lower_bound: 1
-   warning_level: low
  - variable: Primary Energy
    year: 2010
-   upper_bound: 5
+   upper_bound: 2.5
    lower_bound: 1
+   warning_level: low
  - variable: Primary Energy|Coal
    year: 2010
    upper_bound: 5
diff --git a/tests/test_cli.py b/tests/test_cli.py
index 47dafc24..7bbc1384 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -291,7 +291,7 @@ def test_cli_validate_data_fails():
     )
 
     assert cli_result.exit_code == 1
-    assert "Collected 5 errors" in str(cli_result.exception)
+    assert "Collected 6 errors" in str(cli_result.exception)
     assert "Asia" in str(cli_result.exception)
     assert "Final Energy|Industry" in str(cli_result.exception)
 
diff --git a/tests/test_config.py b/tests/test_config.py
index 73f9efa8..a5596501 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -93,7 +93,6 @@ def test_config_with_filter(config_file):
 
 
 def test_config_external_repo_mapping_filter():
-
     config = NomenclatureConfig.from_file(
         TEST_DATA_DIR / "config" / "filter_mappings.yaml"
     )
diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py
index 8d0dbf0a..01a41682 100644
--- a/tests/test_validate_data.py
+++ b/tests/test_validate_data.py
@@ -16,8 +16,14 @@ def test_DataValidator_from_file():
                 {
                     "variable": "Final Energy",
                     "year": [2010],
-                    "upper_bound": 2.5,
-                    "lower_bound": 1.0,  # test that integer in yaml is cast to float
+                    "validation": [
+                        {
+                            "variable": "Final Energy",
+                            "year": [2010],
+                            "upper_bound": 2.5,
+                            "lower_bound": 1.0,  # test that integer in yaml is cast to float
+                        }
+                    ],
                 }
             ],
             "file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml",
@@ -129,26 +135,55 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl
     assert failed_validation_message in caplog.text
 
 
-def test_DataValidator_validate_with_warning(simple_df, caplog):
+@pytest.mark.parametrize(
+    "file, value",
+    [("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)],
+)
+def test_DataValidator_validate_with_warning(file, value, simple_df, caplog):
+    """Checks that failed validation rows are printed in log."""
+    simple_df._data.iloc[1] = value
     data_validator = DataValidator.from_file(
-        DATA_VALIDATION_TEST_DIR / "validate_warning.yaml"
+        DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml"
     )
-    with pytest.raises(ValueError, match="Data validation failed"):
-        data_validator.apply(simple_df)
 
     failed_validation_message = (
         "Data validation with error(s)/warning(s) "
-        f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}):
+        f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}):
+  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
+         model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error"""
+    )
+    if file == "legacy":
+        # prints both error and low warning levels for legacy format
+        # because these are treated as independent validation-criteria
+        failed_validation_message += """
+
   Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
          model scenario region        variable   unit  year  value warning_level
     0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0           low
-    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0           low
+    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0           low"""
 
+    if value == 3.0:
+        # prints each warning level when each is triggered by different rows
+        failed_validation_message = """
   Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
          model scenario region        variable   unit  year  value warning_level
-    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    6.0         error
-    1  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error"""
-    )
+    0  model_a   scen_b  World  Primary Energy  EJ/yr  2010    7.0         error
+
+  Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
+         model scenario region        variable   unit  year  value warning_level
+    0  model_a   scen_a  World  Primary Energy  EJ/yr  2010    3.0           low"""
 
-    # only prints two of three criteria in df to be validated
+    with pytest.raises(ValueError, match="Data validation failed"):
+        data_validator.apply(simple_df)
     assert failed_validation_message in caplog.text
+
+
+def test_DataValidator_warning_order_fail():
+    """Raises validation error if warnings for same criteria not in descending order."""
+    match = "Validation criteria for .* not in descending order of severity."
+    with pytest.raises(ValueError, match=match):
+        DataValidator.from_file(
+            DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml"
+        )