Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor multiple warning levels for same data validation filter #461

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 75 additions & 20 deletions nomenclature/processor/data_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from pathlib import Path

import yaml
from pandas import concat
from pyam import IamDataFrame
from pyam.logging import adjust_log_level
from pydantic import computed_field, field_validator, model_validator
Expand All @@ -18,10 +19,10 @@


class WarningEnum(str, Enum):
error = "error"
high = "high"
medium = "medium"
low = "low"
error = "error"


class DataValidationCriteria(IamcDataFilter):
Expand All @@ -47,6 +48,7 @@ def lower_bound(self) -> float:

@property
def validation_args(self):
"""Attributes used for validation (as bounds)."""
return self.model_dump(
exclude_none=True,
exclude_unset=True,
Expand All @@ -55,6 +57,7 @@ def validation_args(self):

@property
def criteria(self):
"""Attributes used for validation (as specified in the file)."""
return self.model_dump(
exclude_none=True,
exclude_unset=True,
Expand All @@ -79,23 +82,46 @@ def validation_args(self):
@property
def criteria(self):
return self.model_dump(
exclude_none=True,
exclude_unset=True,
exclude=["warning_level"],
exclude_none=True, exclude_unset=True, exclude=["warning_level"]
)


class DataValidationCriteriaMultiple(IamcDataFilter):
validation: (
list[DataValidationCriteriaValue | DataValidationCriteriaBounds] | None
) = None

@model_validator(mode="after")
def check_warnings_order(self):
"""Check if warnings are set in descending order of severity."""
if self.validation != sorted(self.validation, key=lambda c: c.warning_level):
raise ValueError(
f"Validation criteria for {self.criteria} not"
" in descending order of severity."
)
else:
return self

@property
def criteria(self):
"""Attributes used for validation (as specified in the file)."""
return self.model_dump(
exclude_none=True, exclude_unset=True, exclude=["validation"]
)


class DataValidator(Processor):
"""Processor for validating IAMC datapoints"""

criteria_items: list[DataValidationCriteriaBounds | DataValidationCriteriaValue]
criteria_items: list[DataValidationCriteriaMultiple]
file: Path

@field_validator("criteria_items", mode="before")
def check_criteria(cls, v):
for criterion in v:
has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
has_values = any(c in criterion for c in ["value", "atol", "rtol"])
for item in v:
for criterion in item["validation"]:
has_bounds = any(c in criterion for c in ["upper_bound", "lower_bound"])
has_values = any(c in criterion for c in ["value", "atol", "rtol"])
if has_bounds and has_values:
raise ValueError(
f"Cannot use bounds and value-criteria simultaneously: {criterion}"
Expand All @@ -106,26 +132,55 @@ def check_criteria(cls, v):
def from_file(cls, file: Path | str) -> "DataValidator":
with open(file, "r", encoding="utf-8") as f:
content = yaml.safe_load(f)
return cls(file=file, criteria_items=content)
criteria_items = []
for item in content:
filter_args = {k: item[k] for k in item if k in IamcDataFilter.model_fields}
criteria_args = {
k: item[k]
for k in item
if k not in IamcDataFilter.model_fields and k != "validation"
}
if "validation" in item:
for criterion in item["validation"]:
criterion.update(filter_args)
else:
item["validation"] = [{**filter_args, **criteria_args}]
criteria_items.append({k: item[k] for k in item if k not in criteria_args})
return cls(file=file, criteria_items=criteria_items)

def apply(self, df: IamDataFrame) -> IamDataFrame:
fail_list = []
error = False

with adjust_log_level():
for item in self.criteria_items:
failed_validation = df.validate(**item.validation_args)
if failed_validation is not None:
criteria_msg = " Criteria: " + ", ".join(
[f"{key}: {value}" for key, value in item.criteria.items()]
)
failed_validation["warning_level"] = item.warning_level.value
if item.warning_level == WarningEnum.error:
error = True
fail_list.append(criteria_msg)
fail_list.append(
textwrap.indent(str(failed_validation), prefix=" ") + "\n"
per_item_df = df
for criterion in item.validation:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Too much for this PR but as a future refactoring, I'd suggest to add an apply function (or something along those lines) to DataValidationCriteriaMultiple. This way this code would look something like this:

for item in self.criteria_items:
    item.apply(df)

and the rest is handled by DataValidationCriteriaMultiple.

failed_validation = per_item_df.validate(
**criterion.validation_args
)
if failed_validation is not None:
per_item_df = IamDataFrame(
concat([df.data, failed_validation]).drop_duplicates(
keep=False
)
)
criteria_msg = " Criteria: " + ", ".join(
[
f"{key}: {value}"
for key, value in criterion.criteria.items()
]
)
failed_validation["warning_level"] = (
criterion.warning_level.value
)
if criterion.warning_level == WarningEnum.error:
error = True
fail_list.append(criteria_msg)
fail_list.append(
textwrap.indent(str(failed_validation), prefix=" ")
+ "\n"
)
fail_msg = "(file %s):\n" % get_relative_path(self.file)
if error:
fail_msg = (
Expand Down
12 changes: 12 additions & 0 deletions tests/data/validation/validate_data/validate_warning_joined.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- variable: Primary Energy
year: 2010
validation:
- upper_bound: 5
lower_bound: 1
- warning_level: low
upper_bound: 2.5
lower_bound: 1
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
lower_bound: 1
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
- variable: Primary Energy
year: 2010
validation:
- warning_level: low
danielhuppmann marked this conversation as resolved.
Show resolved Hide resolved
upper_bound: 2.5
lower_bound: 1
- upper_bound: 5
lower_bound: 1
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
lower_bound: 1
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
- variable: Primary Energy
year: 2010
upper_bound: 2.5
upper_bound: 5
lower_bound: 1
warning_level: low
- variable: Primary Energy
year: 2010
upper_bound: 5
upper_bound: 2.5
lower_bound: 1
warning_level: low
- variable: Primary Energy|Coal
year: 2010
upper_bound: 5
Expand Down
2 changes: 1 addition & 1 deletion tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def test_cli_validate_data_fails():
)

assert cli_result.exit_code == 1
assert "Collected 5 errors" in str(cli_result.exception)
assert "Collected 6 errors" in str(cli_result.exception)
assert "Asia" in str(cli_result.exception)
assert "Final Energy|Industry" in str(cli_result.exception)

Expand Down
1 change: 0 additions & 1 deletion tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ def test_config_with_filter(config_file):


def test_config_external_repo_mapping_filter():

config = NomenclatureConfig.from_file(
TEST_DATA_DIR / "config" / "filter_mappings.yaml"
)
Expand Down
59 changes: 47 additions & 12 deletions tests/test_validate_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,14 @@ def test_DataValidator_from_file():
{
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
"lower_bound": 1.0, # test that integer in yaml is cast to float
"validation": [
{
"variable": "Final Energy",
"year": [2010],
"upper_bound": 2.5,
"lower_bound": 1.0, # test that integer in yaml is cast to float
}
],
}
],
"file": DATA_VALIDATION_TEST_DIR / "simple_validation.yaml",
Expand Down Expand Up @@ -129,26 +135,55 @@ def test_DataValidator_apply_fails(simple_df, file, item_1, item_2, item_3, capl
assert failed_validation_message in caplog.text


def test_DataValidator_validate_with_warning(simple_df, caplog):
@pytest.mark.parametrize(
"file, value",
[("joined", 6.0), ("joined", 3.0), ("legacy", 6.0)],
)
def test_DataValidator_validate_with_warning(file, value, simple_df, caplog):
"""Checks that failed validation rows are printed in log."""
simple_df._data.iloc[1] = value
data_validator = DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / "validate_warning.yaml"
DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml"
)
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)

failed_validation_message = (
"Data validation with error(s)/warning(s) "
f"""(file {(DATA_VALIDATION_TEST_DIR / "validate_warning.yaml").relative_to(Path.cwd())}):
f"""(file {(DATA_VALIDATION_TEST_DIR / f"validate_warning_{file}.yaml").relative_to(Path.cwd())}):
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error"""
dc-almeida marked this conversation as resolved.
Show resolved Hide resolved
)
if file == "legacy":
# prints both error and low warning levels for legacy format
# because these are treated as independent validation-criteria
failed_validation_message += """

Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 low
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 low"""

if value == 3.0:
# prints each warning level when each is triggered by different rows
failed_validation_message = """
Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 5.0, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 6.0 error
1 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error"""
)
0 model_a scen_b World Primary Energy EJ/yr 2010 7.0 error

Criteria: variable: ['Primary Energy'], year: [2010], upper_bound: 2.5, lower_bound: 1.0
model scenario region variable unit year value warning_level
0 model_a scen_a World Primary Energy EJ/yr 2010 3.0 low"""

# only prints two of three criteria in df to be validated
with pytest.raises(ValueError, match="Data validation failed"):
data_validator.apply(simple_df)
assert failed_validation_message in caplog.text


def test_DataValidator_warning_order_fail():
"""Raises validation error if warnings for same criteria not in descending order."""
match = "Validation criteria for .* not in descending order of severity."
with pytest.raises(ValueError, match=match):
DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / "validate_warning_joined_asc.yaml"
)