From 6ec1e193bca4c71229198d3c73b2873803aa41cd Mon Sep 17 00:00:00 2001 From: David Almeida <58078834+dc-almeida@users.noreply.github.com> Date: Thu, 30 Jan 2025 16:32:38 +0100 Subject: [PATCH] Refactor multiple warning levels for same data validation filter (#461) * Refactor multiple warning levels for same data validation filter * Coerce legacy format criteria into new DataValidatorCriteriaMultiple class * Stop validation after higher severity fail; update tests * Remove debugging print Co-authored-by: Daniel Huppmann * Fix warning level skipping; add third variant to test * Apply suggested changes * Remove ErrorCollector --------- Co-authored-by: David Almeida Co-authored-by: Daniel Huppmann --- nomenclature/processor/data_validator.py | 95 +++++++++++++++---- .../validate_warning_joined.yaml | 12 +++ .../validate_warning_joined_asc.yaml | 12 +++ ...ning.yaml => validate_warning_legacy.yaml} | 6 +- tests/test_cli.py | 2 +- tests/test_config.py | 1 - tests/test_validate_data.py | 59 +++++++++--- 7 files changed, 150 insertions(+), 37 deletions(-) create mode 100644 tests/data/validation/validate_data/validate_warning_joined.yaml create mode 100644 tests/data/validation/validate_data/validate_warning_joined_asc.yaml rename tests/data/validation/validate_data/{validate_warning.yaml => validate_warning_legacy.yaml} (100%) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index debf0e1a..dda02f9c 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -4,6 +4,7 @@ from pathlib import Path import yaml +from pandas import concat from pyam import IamDataFrame from pyam.logging import adjust_log_level from pydantic import computed_field, field_validator, model_validator @@ -18,10 +19,10 @@ class WarningEnum(str, Enum): + error = "error" high = "high" medium = "medium" low = "low" - error = "error" class DataValidationCriteria(IamcDataFilter): @@ -47,6 +48,7 @@ def lower_bound(self) -> float: @property def validation_args(self): + """Attributes used for validation (as bounds).""" return self.model_dump( exclude_none=True, exclude_unset=True, @@ -55,6 +57,7 @@ def validation_args(self): @property def criteria(self): + """Attributes used for validation (as specified in the file).""" return self.model_dump( exclude_none=True, exclude_unset=True, @@ -79,23 +82,46 @@ def validation_args(self): @property def criteria(self): return self.model_dump( - exclude_none=True, - exclude_unset=True, - exclude=["warning_level"], + exclude_none=True, exclude_unset=True, exclude=["warning_level"] + ) + + +class DataValidationCriteriaMultiple(IamcDataFilter): + validation: ( + list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None + ) = None + + @model_validator(mode="after") + def check_warnings_order(self): + """Check if warnings are set in descending order of severity.""" + if self.validation != sorted(self.validation, key=lambda c: c.warning_level): + raise ValueError( + f"Validation criteria for {self.criteria} not" + " in descending order of severity." + ) + else: + return self + + @property + def criteria(self): + """Attributes used for validation (as specified in the file).""" + return self.model_dump( + exclude_none=True, exclude_unset=True, exclude=["validation"] ) class DataValidator(Processor): """Processor for validating IAMC datapoints""" - criteria_items: list[DataValidationCriteriaBounds | DataValidationCriteriaValue] + criteria_items: list[DataValidationCriteriaMultiple] file: Path @field_validator("criteria_items", mode="before") def check_criteria(cls, v): - for criterion in v: - has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"]) - has_values = any(c in criterion for c in ["value", "atol", "rtol"]) + for item in v: + for criterion in item["validation"]: + has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"]) + has_values = any(c in criterion for c in ["value", "atol", "rtol"]) if has_bounds and has_values: raise ValueError( f"Cannot use bounds and value-criteria simultaneously: {criterion}" @@ -106,7 +132,21 @@ def check_criteria(cls, v): def from_file(cls, file: Path | str) -> "DataValidator": with open(file, "r", encoding="utf-8") as f: content = yaml.safe_load(f) - return cls(file=file, criteria_items=content) + criteria_items = [] + for item in content: + filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields} + criteria_args = { + k: item[k] + for k in item + if k not in IamcDataFilter.model_fields and k != "validation" + } + if "validation" in item: + for criterion in item["validation"]: + criterion.update(filter_args) + else: + item["validation"] = [{**filter_args, **criteria_args}] + criteria_items.append({k: item[k] for k in item if k not in criteria_args}) + return cls(file=file, criteria_items=criteria_items) def apply(self, df: IamDataFrame) -> IamDataFrame: fail_list = [] @@ -114,18 +154,33 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: with adjust_log_level(): for item in self.criteria_items: - failed_validation = df.validate(**item.validation_args) - if failed_validation is not None: - criteria_msg = " Criteria: " + ", ".join( - [f"{key}: {value}" for key, value in item.criteria.items()] - ) - failed_validation["warning_level"] = item.warning_level.value - if item.warning_level == WarningEnum.error: - error = True - fail_list.append(criteria_msg) - fail_list.append( - textwrap.indent(str(failed_validation), prefix=" ") + "\n" + per_item_df = df + for criterion in item.validation: + failed_validation = per_item_df.validate( + **criterion.validation_args ) + if failed_validation is not None: + per_item_df = IamDataFrame( + concat([df.data, failed_validation]).drop_duplicates( + keep=False + ) + ) + criteria_msg = " Criteria: " + ", ".join( + [ + f"{key}: {value}" + for key, value in criterion.criteria.items() + ] + ) + failed_validation["warning_level"] = ( + criterion.warning_level.value + ) + if criterion.warning_level == WarningEnum.error: + error = True + fail_list.append(criteria_msg) + fail_list.append( + textwrap.indent(str(failed_validation), prefix=" ") + + "\n" + ) fail_msg = "(file %s):\n" % get_relative_path(self.file) if error: fail_msg = ( diff --git a/tests/data/validation/validate_data/validate_warning_joined.yaml b/tests/data/validation/validate_data/validate_warning_joined.yaml new file mode 100644 index 00000000..5ff92b3d --- /dev/null +++ b/tests/data/validation/validate_data/validate_warning_joined.yaml @@ -0,0 +1,12 @@ + - variable: Primary Energy + year: 2010 + validation: + - upper_bound: 5 + lower_bound: 1 + - warning_level: low + upper_bound: 2.5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + upper_bound: 5 + lower_bound: 1 diff --git a/tests/data/validation/validate_data/validate_warning_joined_asc.yaml b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml new file mode 100644 index 00000000..21e484ab --- /dev/null +++ b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml @@ -0,0 +1,12 @@ + - variable: Primary Energy + year: 2010 + validation: + - warning_level: low + upper_bound: 2.5 + lower_bound: 1 + - upper_bound: 5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + upper_bound: 5 + lower_bound: 1 diff --git a/tests/data/validation/validate_data/validate_warning.yaml b/tests/data/validation/validate_data/validate_warning_legacy.yaml similarity index 100% rename from tests/data/validation/validate_data/validate_warning.yaml rename to tests/data/validation/validate_data/validate_warning_legacy.yaml index 3482305a..4a4692c8 100644 --- a/tests/data/validation/validate_data/validate_warning.yaml +++ b/tests/data/validation/validate_data/validate_warning_legacy.yaml @@ -1,12 +1,12 @@ - variable: Primary Energy year: 2010 - upper_bound: 2.5 + upper_bound: 5 lower_bound: 1 - warning_level: low - variable: Primary Energy year: 2010 - upper_bound: 5 + upper_bound: 2.5 lower_bound: 1 + warning_level: low - variable: Primary Energy|Coal year: 2010 upper_bound: 5 diff --git a/tests/test_cli.py b/tests/test_cli.py index 47dafc24..7bbc1384 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -291,7 +291,7 @@ def test_cli_validate_data_fails(): ) assert cli_result.exit_code == 1 - assert "Collected 5 errors" in str(cli_result.exception) + assert "Collected 6 errors" in str(cli_result.exception) assert "Asia" in str(cli_result.exception) assert "Final Energy|Industry" in str(cli_result.exception) diff --git a/tests/test_config.py b/tests/test_config.py index 73f9efa8..a5596501 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -93,7 +93,6 @@ def test_config_with_filter(config_file): def test_config_external_repo_mapping_filter(): - config = NomenclatureConfig.from_file( TEST_DATA_DIR / "config" / "filter_mappings.yaml" ) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 8d0dbf0a..01a41682 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -16,8 +16,14 @@ def test_DataValidator_from_file(): { "variable": "Final Energy", "year": [2010], - "upper_bound": 2.5, - "lower_bound": 1.0, # test that integer in yaml is cast to float + "validation": [ + { + "variable": "Final Energy", + "year": [2010], + "upper_bound": 2.5, + "lower_bound": 1.0, # test that integer in yaml is cast to float + } + ], } ], "file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml", @@ -129,26 +135,55 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl assert failed_validation_message in caplog.text -def test_DataValidator_validate_with_warning(simple_df, caplog): +@pytest.mark.parametrize( + "file, value", + [("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)], +) +def test_DataValidator_validate_with_warning(file, value, simple_df, caplog): + """Checks that failed validation rows are printed in log.""" + simple_df._data.iloc[1] = value data_validator = DataValidator.from_file( - DATA_VALIDATION_TEST_DIR / "validate_warning.yaml" + DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml" ) - with pytest.raises(ValueError, match="Data validation failed"): - data_validator.apply(simple_df) failed_validation_message = ( "Data validation with error(s)/warning(s) " - f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}): + f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}): + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" + ) + if file == "legacy": + # prints both error and low warning levels for legacy format + # because these are treated as independent validation-criteria + failed_validation_message += """ + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 model scenario region variable unit year value warning_level 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low""" + if value == 3.0: + # prints each warning level when each is triggered by different rows + failed_validation_message = """ Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 model scenario region variable unit year value warning_level - 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" - ) + 0 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error + + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 3.0 low""" - # only prints two of three criteria in df to be validated + with pytest.raises(ValueError, match="Data validation failed"): + data_validator.apply(simple_df) assert failed_validation_message in caplog.text + + +def test_DataValidator_warning_order_fail(): + """Raises validation error if warnings for same criteria not in descending order.""" + match = "Validation criteria for .* not in descending order of severity." + with pytest.raises(ValueError, match=match): + DataValidator.from_file( + DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml" + )