From b01834bde62fd990add25030c8f6a88724b36c51 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Mon, 20 Jan 2025 17:33:51 +0100 Subject: [PATCH 1/7] Refactor multiple warning levels for same data validation filter --- nomenclature/processor/data_validator.py | 15 +++++++++-- .../validate_warning_joined.yaml | 12 +++++++++ ...ng.yaml => validate_warning_separate.yaml} | 6 ++--- tests/test_validate_data.py | 25 ++++++++++++------- 4 files changed, 44 insertions(+), 14 deletions(-) create mode 100644 tests/data/validation/validate_data/validate_warning_joined.yaml rename tests/data/validation/validate_data/{validate_warning.yaml => validate_warning_separate.yaml} (100%) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index debf0e1a..90f2535d 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -18,10 +18,10 @@ class WarningEnum(str, Enum): + error = "error" high = "high" medium = "medium" low = "low" - error = "error" class DataValidationCriteria(IamcDataFilter): @@ -106,7 +106,18 @@ def check_criteria(cls, v): def from_file(cls, file: Path | str) -> "DataValidator": with open(file, "r", encoding="utf-8") as f: content = yaml.safe_load(f) - return cls(file=file, criteria_items=content) + criteria_items = [] + for item in content: + if "validation" in item: + filter_args = {k: v for k, v in item.items() if k != "validation"} + for validation_args in sorted( + item["validation"], + key=lambda d: d.get("warning_level", WarningEnum.error), + ): + criteria_items.append(dict({**filter_args, **validation_args})) + else: + criteria_items.append(item) + return cls(file=file, criteria_items=criteria_items) def apply(self, df: IamDataFrame) -> IamDataFrame: fail_list = [] diff --git a/tests/data/validation/validate_data/validate_warning_joined.yaml b/tests/data/validation/validate_data/validate_warning_joined.yaml new file mode 100644 index 00000000..5ff92b3d --- /dev/null +++ b/tests/data/validation/validate_data/validate_warning_joined.yaml @@ -0,0 +1,12 @@ + - variable: Primary Energy + year: 2010 + validation: + - upper_bound: 5 + lower_bound: 1 + - warning_level: low + upper_bound: 2.5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + upper_bound: 5 + lower_bound: 1 diff --git a/tests/data/validation/validate_data/validate_warning.yaml b/tests/data/validation/validate_data/validate_warning_separate.yaml similarity index 100% rename from tests/data/validation/validate_data/validate_warning.yaml rename to tests/data/validation/validate_data/validate_warning_separate.yaml index 3482305a..4a4692c8 100644 --- a/tests/data/validation/validate_data/validate_warning.yaml +++ b/tests/data/validation/validate_data/validate_warning_separate.yaml @@ -1,12 +1,12 @@ - variable: Primary Energy year: 2010 - upper_bound: 2.5 + upper_bound: 5 lower_bound: 1 - warning_level: low - variable: Primary Energy year: 2010 - upper_bound: 5 + upper_bound: 2.5 lower_bound: 1 + warning_level: low - variable: Primary Energy|Coal year: 2010 upper_bound: 5 diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 8d0dbf0a..2d6d71eb 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -129,25 +129,32 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl assert failed_validation_message in caplog.text -def test_DataValidator_validate_with_warning(simple_df, caplog): +@pytest.mark.parametrize( + "file", + [ + "separate", + "joined", + ], +) +def 
test_DataValidator_validate_with_warning(file, simple_df, caplog): data_validator = DataValidator.from_file( - DATA_VALIDATION_TEST_DIR / "validate_warning.yaml" + DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml" ) with pytest.raises(ValueError, match="Data validation failed"): data_validator.apply(simple_df) failed_validation_message = ( "Data validation with error(s)/warning(s) " - f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}): - Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 - model scenario region variable unit year value warning_level - 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low - + f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}): Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 model scenario region variable unit year value warning_level 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error + + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low""" ) # only prints two of three criteria in df to be validated From d59885709d3cdd889d7594846c77217dbde4a6ad Mon Sep 17 00:00:00 2001 From: David Almeida Date: Fri, 24 Jan 2025 11:34:06 +0100 Subject: [PATCH 2/7] Coerce legacy format criteria into new DataValidatorCriteriaMultiple class --- nomenclature/processor/data_validator.py | 92 +++++++++++++++++------- tests/test_validate_data.py | 10 ++- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 90f2535d..fd57b5f7 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -47,6 +47,7 @@ def lower_bound(self) -> float: @property def validation_args(self): + """Attributes used for validation (as bounds).""" return self.model_dump( exclude_none=True, exclude_unset=True, @@ -55,6 +56,7 @@ def validation_args(self): @property def criteria(self): + """Attributes used for validation (as specified in the file).""" return self.model_dump( exclude_none=True, exclude_unset=True, @@ -79,23 +81,51 @@ def validation_args(self): @property def criteria(self): return self.model_dump( - exclude_none=True, - exclude_unset=True, - exclude=["warning_level"], + exclude_none=True, exclude_unset=True, exclude=["warning_level"] + ) + + +class DataValidationCriteriaMultiple(IamcDataFilter): + validation: ( + list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None + ) = None + + @model_validator(mode="after") + def check_warnings_order(self): + """Check if warnings are set in descending order of severity.""" + errors = ErrorCollector() + if self.validation != sorted(self.validation, key=lambda c: c.warning_level): + errors.append( + ValueError( + f"Validation criteria for {self.criteria} not" + " in descending order of severity." 
+ ) + ) + else: + return self + if errors: + raise ValueError(errors) + + @property + def criteria(self): + """Attributes used for validation (as specified in the file).""" + return self.model_dump( + exclude_none=True, exclude_unset=True, exclude=["validation"] ) class DataValidator(Processor): """Processor for validating IAMC datapoints""" - criteria_items: list[DataValidationCriteriaBounds | DataValidationCriteriaValue] + criteria_items: list[DataValidationCriteriaMultiple] file: Path @field_validator("criteria_items", mode="before") def check_criteria(cls, v): - for criterion in v: - has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"]) - has_values = any(c in criterion for c in ["value", "atol", "rtol"]) + for item in v: + for criterion in item["validation"]: + has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"]) + has_values = any(c in criterion for c in ["value", "atol", "rtol"]) if has_bounds and has_values: raise ValueError( f"Cannot use bounds and value-criteria simultaneously: {criterion}" @@ -108,15 +138,18 @@ def from_file(cls, file: Path | str) -> "DataValidator": content = yaml.safe_load(f) criteria_items = [] for item in content: + filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields} + criteria_args = { + k: item[k] + for k in item + if k not in IamcDataFilter.model_fields and k != "validation" + } if "validation" in item: - filter_args = {k: v for k, v in item.items() if k != "validation"} - for validation_args in sorted( - item["validation"], - key=lambda d: d.get("warning_level", WarningEnum.error), - ): - criteria_items.append(dict({**filter_args, **validation_args})) + for criterion in item["validation"]: + criterion.update(filter_args) else: - criteria_items.append(item) + item["validation"] = [{**filter_args, **criteria_args}] + criteria_items.append({k: item[k] for k in item if k not in criteria_args}) return cls(file=file, criteria_items=criteria_items) def apply(self, df: IamDataFrame) -> IamDataFrame: @@ -125,18 +158,25 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: with adjust_log_level(): for item in self.criteria_items: - failed_validation = df.validate(**item.validation_args) - if failed_validation is not None: - criteria_msg = " Criteria: " + ", ".join( - [f"{key}: {value}" for key, value in item.criteria.items()] - ) - failed_validation["warning_level"] = item.warning_level.value - if item.warning_level == WarningEnum.error: - error = True - fail_list.append(criteria_msg) - fail_list.append( - textwrap.indent(str(failed_validation), prefix=" ") + "\n" - ) + for criterion in item.validation: + failed_validation = df.validate(**criterion.validation_args) + if failed_validation is not None: + criteria_msg = " Criteria: " + ", ".join( + [ + f"{key}: {value}" + for key, value in criterion.criteria.items() + ] + ) + failed_validation["warning_level"] = ( + criterion.warning_level.value + ) + if criterion.warning_level == WarningEnum.error: + error = True + fail_list.append(criteria_msg) + fail_list.append( + textwrap.indent(str(failed_validation), prefix=" ") + + "\n" + ) fail_msg = "(file %s):\n" % get_relative_path(self.file) if error: fail_msg = ( diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 2d6d71eb..be8a5f2c 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -16,8 +16,14 @@ def test_DataValidator_from_file(): { "variable": "Final Energy", "year": [2010], - "upper_bound": 2.5, - "lower_bound": 1.0, # test that integer in yaml is 
cast to float + "validation": [ + { + "variable": "Final Energy", + "year": [2010], + "upper_bound": 2.5, + "lower_bound": 1.0, # test that integer in yaml is cast to float + } + ], } ], "file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml", From 78a61ad400860f77e57e55d87ba6b32c90426d1a Mon Sep 17 00:00:00 2001 From: David Almeida Date: Fri, 24 Jan 2025 12:55:39 +0100 Subject: [PATCH 3/7] Stop validation after higher severity fail; update tests --- nomenclature/processor/data_validator.py | 1 + .../validate_warning_joined_asc.yaml | 12 ++++++++ ...rate.yaml => validate_warning_legacy.yaml} | 0 tests/test_cli.py | 3 +- tests/test_validate_data.py | 28 +++++++++++++------ 5 files changed, 34 insertions(+), 10 deletions(-) create mode 100644 tests/data/validation/validate_data/validate_warning_joined_asc.yaml rename tests/data/validation/validate_data/{validate_warning_separate.yaml => validate_warning_legacy.yaml} (100%) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index fd57b5f7..b1a2bc82 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -177,6 +177,7 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: textwrap.indent(str(failed_validation), prefix=" ") + "\n" ) + break fail_msg = "(file %s):\n" % get_relative_path(self.file) if error: fail_msg = ( diff --git a/tests/data/validation/validate_data/validate_warning_joined_asc.yaml b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml new file mode 100644 index 00000000..21e484ab --- /dev/null +++ b/tests/data/validation/validate_data/validate_warning_joined_asc.yaml @@ -0,0 +1,12 @@ + - variable: Primary Energy + year: 2010 + validation: + - warning_level: low + upper_bound: 2.5 + lower_bound: 1 + - upper_bound: 5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + upper_bound: 5 + lower_bound: 1 diff --git a/tests/data/validation/validate_data/validate_warning_separate.yaml b/tests/data/validation/validate_data/validate_warning_legacy.yaml similarity index 100% rename from tests/data/validation/validate_data/validate_warning_separate.yaml rename to tests/data/validation/validate_data/validate_warning_legacy.yaml diff --git a/tests/test_cli.py b/tests/test_cli.py index 47dafc24..f3a754b7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -291,7 +291,8 @@ def test_cli_validate_data_fails(): ) assert cli_result.exit_code == 1 - assert "Collected 5 errors" in str(cli_result.exception) + print(str(cli_result.exception)) + assert "Collected 6 errors" in str(cli_result.exception) assert "Asia" in str(cli_result.exception) assert "Final Energy|Industry" in str(cli_result.exception) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index be8a5f2c..89e643e8 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -137,17 +137,13 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl @pytest.mark.parametrize( "file", - [ - "separate", - "joined", - ], + ["joined", "legacy"], ) def test_DataValidator_validate_with_warning(file, simple_df, caplog): + """Checks that failed validation rows are printed in log.""" data_validator = DataValidator.from_file( DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml" ) - with pytest.raises(ValueError, match="Data validation failed"): - data_validator.apply(simple_df) failed_validation_message = ( "Data validation with error(s)/warning(s) " @@ -155,13 +151,27 @@ def 
test_DataValidator_validate_with_warning(file, simple_df, caplog): Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 model scenario region variable unit year value warning_level 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error - 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error + 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" + ) + + if file == "legacy": + # prints all failed warning levels for legacy format + failed_validation_message += """ Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 model scenario region variable unit year value warning_level 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low""" - ) - # only prints two of three criteria in df to be validated + with pytest.raises(ValueError, match="Data validation failed"): + data_validator.apply(simple_df) assert failed_validation_message in caplog.text + + +def test_DataValidator_warning_order_fail(): + """Raises validation error if warnings for same criteria not in descending order.""" + match = "Validation criteria for .* not in descending order of severity." + with pytest.raises(ValueError, match=match): + DataValidator.from_file( + DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml" + ) From 67374931318174b3a424a4eeb7762d793b31582c Mon Sep 17 00:00:00 2001 From: David Almeida <58078834+dc-almeida@users.noreply.github.com> Date: Mon, 27 Jan 2025 11:26:09 +0100 Subject: [PATCH 4/7] Remove debugging print Co-authored-by: Daniel Huppmann --- tests/test_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index f3a754b7..7bbc1384 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -291,7 +291,6 @@ def test_cli_validate_data_fails(): ) assert cli_result.exit_code == 1 - print(str(cli_result.exception)) assert "Collected 6 errors" in str(cli_result.exception) assert "Asia" in str(cli_result.exception) assert "Final Energy|Industry" in str(cli_result.exception) From 4a411e963075ba3819bca2935c5bd7ab43529e54 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 29 Jan 2025 15:21:27 +0100 Subject: [PATCH 5/7] Fix warning level skipping; add third variant to test --- nomenclature/processor/data_validator.py | 12 ++++++++++-- tests/test_config.py | 1 - tests/test_validate_data.py | 20 ++++++++++++++++---- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index b1a2bc82..42c58749 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -4,6 +4,7 @@ from pathlib import Path import yaml +from pandas import concat from pyam import IamDataFrame from pyam.logging import adjust_log_level from pydantic import computed_field, field_validator, model_validator @@ -158,9 +159,17 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: with adjust_log_level(): for item in self.criteria_items: + per_item_df = df for criterion in item.validation: - failed_validation = df.validate(**criterion.validation_args) + failed_validation = per_item_df.validate( + **criterion.validation_args + ) if failed_validation is not None: + per_item_df = IamDataFrame( + concat([df.data, failed_validation]).drop_duplicates( + keep=False + ) + ) criteria_msg = " Criteria: " + ", ".join( [ f"{key}: {value}" @@ -177,7 +186,6 @@ def apply(self, df: IamDataFrame) -> IamDataFrame: 
textwrap.indent(str(failed_validation), prefix=" ") + "\n" ) - break fail_msg = "(file %s):\n" % get_relative_path(self.file) if error: fail_msg = ( diff --git a/tests/test_config.py b/tests/test_config.py index 73f9efa8..a5596501 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -93,7 +93,6 @@ def test_config_with_filter(config_file): def test_config_external_repo_mapping_filter(): - config = NomenclatureConfig.from_file( TEST_DATA_DIR / "config" / "filter_mappings.yaml" ) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 89e643e8..07b1e26c 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -5,6 +5,7 @@ from nomenclature import DataStructureDefinition from nomenclature.processor.data_validator import DataValidator +from pyam import IamDataFrame DATA_VALIDATION_TEST_DIR = TEST_DATA_DIR / "validation" / "validate_data" @@ -136,11 +137,12 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl @pytest.mark.parametrize( - "file", - ["joined", "legacy"], + "file, value", + [("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)], ) -def test_DataValidator_validate_with_warning(file, simple_df, caplog): +def test_DataValidator_validate_with_warning(file, value, simple_df, caplog): """Checks that failed validation rows are printed in log.""" + simple_df = IamDataFrame(simple_df._data.replace(6.0, value)) data_validator = DataValidator.from_file( DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml" ) @@ -153,7 +155,6 @@ def test_DataValidator_validate_with_warning(file, simple_df, caplog): 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" ) - if file == "legacy": # prints all failed warning levels for legacy format failed_validation_message += """ @@ -163,6 +164,17 @@ def test_DataValidator_validate_with_warning(file, simple_df, caplog): 0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low""" + if value == 3.0: + # prints each warning level when each is triggered by different rows + failed_validation_message = """ + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error + + Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 + model scenario region variable unit year value warning_level + 0 model_a scen_a World Primary Energy EJ/yr 2010 3.0 low""" + with pytest.raises(ValueError, match="Data validation failed"): data_validator.apply(simple_df) assert failed_validation_message in caplog.text From c0b65d961f3feb891cd102b293c2f36c41532331 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 29 Jan 2025 15:59:12 +0100 Subject: [PATCH 6/7] Apply suggested changes --- tests/test_validate_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 07b1e26c..01a41682 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -5,7 +5,6 @@ from nomenclature import DataStructureDefinition from nomenclature.processor.data_validator import DataValidator -from pyam import IamDataFrame DATA_VALIDATION_TEST_DIR = TEST_DATA_DIR / "validation" / "validate_data" @@ -142,7 +141,7 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl ) def 
test_DataValidator_validate_with_warning(file, value, simple_df, caplog): """Checks that failed validation rows are printed in log.""" - simple_df = IamDataFrame(simple_df._data.replace(6.0, value)) + simple_df._data.iloc[1] = value data_validator = DataValidator.from_file( DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml" ) @@ -156,7 +155,8 @@ def test_DataValidator_validate_with_warning(file, value, simple_df, caplog): 1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error""" ) if file == "legacy": - # prints all failed warning levels for legacy format + # prints both error and low warning levels for legacy format + # because these are treated as independent validation-criteria failed_validation_message += """ Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0 From 20c6b4bf897ea70d65ae8ae1fd7efe99489aeaf5 Mon Sep 17 00:00:00 2001 From: David Almeida Date: Wed, 29 Jan 2025 22:18:27 +0100 Subject: [PATCH 7/7] Remove ErrorCollector --- nomenclature/processor/data_validator.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 42c58749..dda02f9c 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -94,18 +94,13 @@ class DataValidationCriteriaMultiple(IamcDataFilter): @model_validator(mode="after") def check_warnings_order(self): """Check if warnings are set in descending order of severity.""" - errors = ErrorCollector() if self.validation != sorted(self.validation, key=lambda c: c.warning_level): - errors.append( - ValueError( - f"Validation criteria for {self.criteria} not" - " in descending order of severity." - ) + raise ValueError( + f"Validation criteria for {self.criteria} not" + " in descending order of severity." ) else: return self - if errors: - raise ValueError(errors) @property def criteria(self):
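
For reference, below is a minimal usage sketch of the multi-level validation format introduced by this patch series. It is illustrative only: the file name my_criteria.yaml and the data points are invented here and are not part of the changeset; the YAML layout mirrors the validate_warning_joined.yaml fixture added in PATCH 1/7, with warning levels listed in descending order of severity as required by check_warnings_order.

# Illustrative sketch, not part of the patches: criteria file and data are invented.
from pathlib import Path

import pandas as pd
from pyam import IamDataFrame

from nomenclature.processor.data_validator import DataValidator

# New "joined" format: one IAMC filter with a `validation` list ordered from
# most to least severe warning level (`error` is the default level).
Path("my_criteria.yaml").write_text(
    """\
- variable: Primary Energy
  year: 2010
  validation:
    - upper_bound: 5
      lower_bound: 1
    - warning_level: low
      upper_bound: 2.5
      lower_bound: 1
""",
    encoding="utf-8",
)

# Invented scenario data in long IAMC format.
df = IamDataFrame(
    pd.DataFrame(
        [
            ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 2010, 6.0],
            ["model_a", "scen_b", "World", "Primary Energy", "EJ/yr", 2010, 3.0],
        ],
        columns=["model", "scenario", "region", "variable", "unit", "year", "value"],
    )
)

validator = DataValidator.from_file("my_criteria.yaml")
# scen_a (6.0) breaches the error-level bound, so `apply` logs the failing row
# at the error level and raises ValueError("Data validation failed"); scen_b
# (3.0) only breaches the low-level bound and is logged as a warning without
# by itself failing validation.
validator.apply(df)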