Skip to content

Commit

Permalink
Refactor multiple warning levels for same data validation filter (#461)
Browse files Browse the repository at this point in the history
* Refactor multiple warning levels for same data validation filter

* Coerce legacy format criteria into new DataValidatorCriteriaMultiple class

* Stop validation after higher severity fail; update tests

* Remove debugging print

Co-authored-by: Daniel Huppmann <[email protected]>

* Fix warning level skipping; add third variant to test

* Apply suggested changes

* Remove ErrorCollector

---------

Co-authored-by: David Almeida <[email protected]>
Co-authored-by: Daniel Huppmann <[email protected]>
  • Loading branch information
3 people authored Jan 30, 2025
1 parent d5c355c commit 6ec1e19
Show file tree
Hide file tree
Showing 7 changed files with 150 additions and 37 deletions.
95 changes: 75 additions & 20 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

import yaml
from pandas import concat
from pyam import IamDataFrame
from pyam.logging import adjust_log_level
from pydantic import computed_field, field_validator, model_validator
Expand All @@ -18,10 +19,10 @@


class WarningEnum(str, Enum):
    """Severity levels for data-validation criteria, most severe first.

    Inherits from ``str`` so members compare and serialize as plain strings
    (e.g. when written into the ``warning_level`` column of failed rows).
    """

    # NOTE: the stripped diff duplicated `error`; a repeated member *name*
    # raises TypeError at class-creation time, so it is declared exactly once.
    error = "error"
    high = "high"
    medium = "medium"
    low = "low"

class DataValidationCriteria(IamcDataFilter):
Expand All @@ -47,6 +48,7 @@ def lower_bound(self) -> float:

@property
def validation_args(self):
"""Attributes used for validation (as bounds)."""
return self.model_dump(
exclude_none=True,
exclude_unset=True,
Expand All @@ -55,6 +57,7 @@ def validation_args(self):

@property
def criteria(self):
"""Attributes used for validation (as specified in the file)."""
return self.model_dump(
exclude_none=True,
exclude_unset=True,
Expand All @@ -79,23 +82,46 @@ def validation_args(self):
@property
def criteria(self):
return self.model_dump(
exclude_none=True,
exclude_unset=True,
exclude=["warning_level"],
exclude_none=True, exclude_unset=True, exclude=["warning_level"]
)


class DataValidationCriteriaMultiple(IamcDataFilter):
    """Data-validation filter carrying several criteria at different warning levels.

    Each entry in ``validation`` is a bounds- or value-based criterion for the
    same IAMC filter; entries must be ordered from most to least severe.
    """

    # Per-level criteria; None is the unset default (legacy items are coerced
    # into a one-element list by `DataValidator.from_file`).
    validation: (
        list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None
    ) = None

    @model_validator(mode="after")
    def check_warnings_order(self):
        """Check that criteria are listed in descending order of severity.

        Raises
        ------
        ValueError
            If the ``validation`` entries are not sorted by warning level.
        """
        # Guard against the `None` default — `sorted(None)` would raise a
        # TypeError instead of a meaningful validation error.
        if self.validation is not None and self.validation != sorted(
            self.validation, key=lambda c: c.warning_level
        ):
            raise ValueError(
                f"Validation criteria for {self.criteria} not"
                " in descending order of severity."
            )
        return self

    @property
    def criteria(self):
        """Attributes used for validation (as specified in the file)."""
        return self.model_dump(
            exclude_none=True, exclude_unset=True, exclude=["validation"]
        )


class DataValidator(Processor):
"""Processor for validating IAMC datapoints"""

criteria_items: list[DataValidationCriteriaBounds | DataValidationCriteriaValue]
criteria_items: list[DataValidationCriteriaMultiple]
file: Path

@field_validator("criteria_items", mode="before")
def check_criteria(cls, v):
for criterion in v:
has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
has_values = any(c in criterion for c in ["value", "atol", "rtol"])
for item in v:
for criterion in item["validation"]:
has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
has_values = any(c in criterion for c in ["value", "atol", "rtol"])
if has_bounds and has_values:
raise ValueError(
f"Cannot use bounds and value-criteria simultaneously: {criterion}"
Expand All @@ -106,26 +132,55 @@ def check_criteria(cls, v):
def from_file(cls, file: Path | str) -> "DataValidator":
    """Instantiate a DataValidator from a yaml file.

    Legacy-format items (criteria given directly on the item) are coerced
    into the multiple-warning-level format with a single ``validation`` entry.

    Parameters
    ----------
    file : Path or str
        Path to the yaml file with the validation criteria.
    """
    with open(file, "r", encoding="utf-8") as f:
        content = yaml.safe_load(f)
    # BUG FIX: a stale `return cls(file=file, criteria_items=content)` left
    # over from the pre-refactor version returned here early, making the
    # coercion below dead code — removed.
    criteria_items = []
    for item in content:
        # Split the raw item into IAMC filter attributes and criteria attributes.
        filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields}
        criteria_args = {
            k: item[k]
            for k in item
            if k not in IamcDataFilter.model_fields and k != "validation"
        }
        if "validation" in item:
            # New format: propagate the shared filter args to every criterion.
            for criterion in item["validation"]:
                criterion.update(filter_args)
        else:
            # Legacy format: wrap the flat criteria into a one-element list.
            item["validation"] = [{**filter_args, **criteria_args}]
        # Keep only the filter args and the (possibly synthesized) validation list.
        criteria_items.append({k: item[k] for k in item if k not in criteria_args})
    return cls(file=file, criteria_items=criteria_items)

def apply(self, df: IamDataFrame) -> IamDataFrame:
fail_list = []
error = False

with adjust_log_level():
for item in self.criteria_items:
failed_validation = df.validate(**item.validation_args)
if failed_validation is not None:
criteria_msg = " Criteria: " + ", ".join(
[f"{key}: {value}" for key, value in item.criteria.items()]
)
failed_validation["warning_level"] = item.warning_level.value
if item.warning_level == WarningEnum.error:
error = True
fail_list.append(criteria_msg)
fail_list.append(
textwrap.indent(str(failed_validation), prefix=" ") + "\n"
per_item_df = df
for criterion in item.validation:
failed_validation = per_item_df.validate(
**criterion.validation_args
)
if failed_validation is not None:
per_item_df = IamDataFrame(
concat([df.data, failed_validation]).drop_duplicates(
keep=False
)
)
criteria_msg = " Criteria: " + ", ".join(
[
f"{key}: {value}"
for key, value in criterion.criteria.items()
]
)
failed_validation["warning_level"] = (
criterion.warning_level.value
)
if criterion.warning_level == WarningEnum.error:
error = True
fail_list.append(criteria_msg)
fail_list.append(
textwrap.indent(str(failed_validation), prefix=" ")
+ "\n"
)
fail_msg = "(file %s):\n" % get_relative_path(self.file)
if error:
fail_msg = (
Expand Down
12 changes: 12 additions & 0 deletions tests/data/validation/validate_data/validate_warning_joined.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- variable: Primary Energy
year: 2010
validation:
- upper_bound: 5
lower_bound: 1
- warning_level: low
upper_bound: 2.5
lower_bound: 1
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
lower_bound: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- variable: Primary Energy
year: 2010
validation:
- warning_level: low
upper_bound: 2.5
lower_bound: 1
- upper_bound: 5
lower_bound: 1
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
lower_bound: 1
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
- variable: Primary Energy
year: 2010
upper_bound: 2.5
upper_bound: 5
lower_bound: 1
warning_level: low
- variable: Primary Energy
year: 2010
upper_bound: 5
upper_bound: 2.5
lower_bound: 1
warning_level: low
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def test_cli_validate_data_fails():
)

assert cli_result.exit_code == 1
assert "Collected 5 errors" in str(cli_result.exception)
assert "Collected 6 errors" in str(cli_result.exception)
assert "Asia" in str(cli_result.exception)
assert "Final Energy|Industry" in str(cli_result.exception)

Expand Down
1 change: 0 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def test_config_with_filter(config_file):


def test_config_external_repo_mapping_filter():

config = NomenclatureConfig.from_file(
TEST_DATA_DIR / "config" / "filter_mappings.yaml"
)
Expand Down
59 changes: 47 additions & 12 deletions tests/test_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,14 @@ def test_DataValidator_from_file():
{
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
"lower_bound": 1.0, # test that integer in yaml is cast to float
"validation": [
{
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
"lower_bound": 1.0, # test that integer in yaml is cast to float
}
],
}
],
"file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml",
Expand Down Expand Up @@ -129,26 +135,55 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl
assert failed_validation_message in caplog.text


def test_DataValidator_validate_with_warning(simple_df, caplog):
@pytest.mark.parametrize(
"file, value",
[("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)],
)
def test_DataValidator_validate_with_warning(file, value, simple_df, caplog):
"""Checks that failed validation rows are printed in log."""
simple_df._data.iloc[1] = value
data_validator = DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / "validate_warning.yaml"
DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml"
)
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)

failed_validation_message = (
"Data validation with error(s)/warning(s) "
f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}):
f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error"""
)
if file == "legacy":
# prints both error and low warning levels for legacy format
# because these are treated as independent validation-criteria
failed_validation_message += """
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low"""

if value == 3.0:
# prints each warning level when each is triggered by different rows
failed_validation_message = """
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error"""
)
0 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 3.0 low"""

# only prints two of three criteria in df to be validated
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)
assert failed_validation_message in caplog.text


def test_DataValidator_warning_order_fail():
    """Raises validation error if warnings for same criteria not in descending order."""
    # Reading a file whose warning levels are listed in ascending severity
    # must be rejected by DataValidationCriteriaMultiple.check_warnings_order.
    with pytest.raises(
        ValueError,
        match="Validation criteria for .* not in descending order of severity.",
    ):
        DataValidator.from_file(
            DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml"
        )

0 comments on commit 6ec1e19

Please sign in to comment.