From b3767028fc11a1ce3d39941a6a72c6a00ed27d2e Mon Sep 17 00:00:00 2001 From: David Almeida Date: Fri, 31 Jan 2025 12:57:26 +0100 Subject: [PATCH] Add docstrings and docs pages for `RequiredDataValidator` and `DataValidator` --- docs/user_guide.rst | 1 + docs/user_guide/data-validation.rst | 101 ++++++++++++++++++ nomenclature/processor/data_validator.py | 18 ++++ nomenclature/processor/required_data.py | 17 +++ .../required_data/requiredData.yaml | 1 - 5 files changed, 137 insertions(+), 1 deletion(-) create mode 100644 docs/user_guide/data-validation.rst diff --git a/docs/user_guide.rst b/docs/user_guide.rst index a78b134a..ba6cf4f5 100644 --- a/docs/user_guide.rst +++ b/docs/user_guide.rst @@ -49,3 +49,4 @@ the RegionProcessor and validated using DataStructureDefinition. user_guide/model-registration user_guide/config user_guide/local-usage + user_guide/data-validation diff --git a/docs/user_guide/data-validation.rst b/docs/user_guide/data-validation.rst new file mode 100644 index 00000000..c960bc68 --- /dev/null +++ b/docs/user_guide/data-validation.rst @@ -0,0 +1,101 @@ +.. _data-validation: + +.. currentmodule:: nomenclature + +Data validation +=============== + +The **nomenclature** package allows users to validate IAMC data in two ways. + +For this, validation requirements and criteria can be specified in YAML configuration +files. + +Required data validation +------------------------ + +**Required data validation** checks if certain models, variables, regions and/or +periods of time are covered in the datapoints. + +For this, a configuration file specifies the model(s) and dimension(s) expected +in the dataset. These are ``variable``, ``region`` and/or ``year``. +Alternatively, instead of using ``variable``, it's possible to declare measurands, +which can also specify units. + +.. 
code:: yaml + + description: Required variables for running MAGICC + model: model_a + required_data: + - measurand: + Emissions|CO2: + unit: Mt CO2/yr + region: World + year: [2020, 2030, 2040, 2050] + +In the example above, for *model_a*, the dataset must include datapoints of the +variable *Emissions|CO2* (measured in *Mt CO2/yr*), in the region *World*, for the +years 2020, 2030, 2040 and 2050. + +Data validation +--------------- + +**Data validation** checks if data values are within reasonable ranges. + +Consider the example below: + +.. code:: yaml + + - variable: Primary Energy + year: 2010 + validation: + - upper_bound: 5 + lower_bound: 1 + - warning_level: low + upper_bound: 2.5 + lower_bound: 1 + - variable: Primary Energy|Coal + year: 2010 + value: 5 + rtol: 2 + atol: 1 + +Each criteria item contains **data filter arguments** and **validation arguments**. + +Data filter arguments include: ``model``, ``scenario``, ``region``, ``variable``, +``unit``, and ``year``. +For the first criteria item, the data is filtered for variable *Primary Energy* +and year 2010. + +The ``validation`` arguments include: ``upper_bound``/``lower_bound`` *or* +``value``/``rtol``/``atol`` (relative tolerance, absolute tolerance). Only one +of these two forms can be set for each ``warning_level``. +The possible levels are: ``error``, ``high``, ``medium``, or ``low``. +For the same data filters, multiple warning levels, each with its own criteria, +can be set. These must be listed in descending order of severity, otherwise a +``ValidationError`` is raised. +In the example, for the first criteria item, the validation arguments are set +for warning level ``error`` (by default, in case of omission) and ``low``, +using bounds. +Flagged datapoints are skipped for lower severity warnings in the same criteria +item (e.g., if datapoints are flagged for the ``error`` level, they will not be +checked again for ``low``). 
+ +The second criteria item (for variable *Primary Energy|Coal*) uses the old notation. +Its use is deprecated for being more verbose (requires each warning level to be +a separate criteria item) and slower to process. + +Standard usage +-------------- + +Run the following in a Python script to check that an IAMC dataset has valid +(required) data. + +.. code-block:: python + + from nomenclature import RequiredDataValidator + from nomenclature.processor import DataValidator + + # ...setting directory/file paths and loading dataset + + RequiredDataValidator.from_file(req_data_yaml).apply(df) + DataValidator.from_file(data_val_yaml).apply(df) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index dda02f9c..60507c9d 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -149,6 +149,24 @@ def from_file(cls, file: Path | str) -> "DataValidator": return cls(file=file, criteria_items=criteria_items) def apply(self, df: IamDataFrame) -> IamDataFrame: + """Validates data in IAMC format according to specified criteria. + + Logs warning/error messages for each criterion that is not met. 
+ + Parameters + ---------- + df : IamDataFrame + Data in IAMC format to be validated + + Returns + ------- + IamDataFrame + + Raises + ------ + `ValueError` if any criterion with warning level `error` is not met + """ + fail_list = [] error = False diff --git a/nomenclature/processor/required_data.py b/nomenclature/processor/required_data.py index 68826085..eeae1607 100644 --- a/nomenclature/processor/required_data.py +++ b/nomenclature/processor/required_data.py @@ -147,6 +147,8 @@ def _wrong_unit_variables( class RequiredDataValidator(Processor): + """Processor for validating required dimensions in IAMC datapoints""" + description: str | None = None model: list[str] | None = None required_data: list[RequiredData] @@ -164,6 +166,21 @@ def from_file(cls, file: Path | str) -> "RequiredDataValidator": return cls(file=file, **content) def apply(self, df: IamDataFrame) -> IamDataFrame: + """Validates data in IAMC format according to required models and dimensions. + + Parameters + ---------- + df : IamDataFrame + Data in IAMC format to be validated + + Returns + ------- + IamDataFrame + + Raises + ------ + `ValueError` if any required dimension is not found in the data + """ if self.model is not None: models_to_check = [model for model in df.model if model in self.model] else: diff --git a/tests/data/required_data/required_data/requiredData.yaml b/tests/data/required_data/required_data/requiredData.yaml index 721cafd9..e9b2be3a 100644 --- a/tests/data/required_data/required_data/requiredData.yaml +++ b/tests/data/required_data/required_data/requiredData.yaml @@ -6,4 +6,3 @@ required_data: unit: Mt CO2/yr region: World year: [2020, 2030, 2040, 2050] -