Add docstrings and docs pages for RequiredDataValidator and `DataVa…

…lidator`
IAMconsortium · Jan 31, 2025 · b376702 · b376702
1 parent 6ec1e19
commit b376702
Show file tree

Hide file tree

Showing 5 changed files with 137 additions and 1 deletion.
diff --git a/docs/user_guide.rst b/docs/user_guide.rst
@@ -49,3 +49,4 @@ the RegionProcessor and validated using DataStructureDefinition.
   user_guide/model-registration
   user_guide/config
   user_guide/local-usage
+  user_guide/data-validation
diff --git a/docs/user_guide/data-validation.rst b/docs/user_guide/data-validation.rst
@@ -0,0 +1,101 @@
+.. _data-validation:
+
+.. currentmodule:: nomenclature
+
+Data validation
+===============
+
+The **nomenclature** package allows users to validate IAMC data in two ways:
+checking that required data is present, and checking that values are within bounds.
+
+For both, the requirements and criteria are specified in YAML configuration files.
+
+Required data validation
+------------------------
+
+**Required data validation** checks if certain models, variables, regions and/or
+years are covered in the datapoints.
+
+For this, a configuration file specifies the model(s) and dimension(s) expected
+in the dataset. The dimensions are ``variable``, ``region`` and/or ``year``.
+Instead of ``variable``, it is possible to declare a ``measurand``, which can
+also specify the unit.
+
+.. code:: yaml
+
+  description: Required variables for running MAGICC
+  model: model_a
+  required_data:
+    - measurand:
+        Emissions|CO2:
+          unit: Mt CO2/yr
+      region: World
+      year: [2020, 2030, 2040, 2050]
+
+In the example above, for *model_a*, the dataset must include datapoints of the
+variable *Emissions|CO2* (measured in *Mt CO2/yr*), in the region *World*, for the
+years 2020, 2030, 2040 and 2050.
+
+Data validation
+---------------
+
+**Data validation** checks if data values are within reasonable ranges.
+
+Consider the example below:
+
+.. code:: yaml
+
+  - variable: Primary Energy
+    year: 2010
+    validation:
+      - upper_bound: 5
+        lower_bound: 1
+      - warning_level: low
+        upper_bound: 2.5
+        lower_bound: 1
+  - variable: Primary Energy|Coal
+    year: 2010
+    value: 5
+    rtol: 2
+    atol: 1
+
+Each criteria item contains **data filter arguments** and **validation arguments**.
+
+Data filter arguments include: ``model``, ``scenario``, ``region``, ``variable``,
+``unit``, and ``year``.
+For the first criteria item, the data is filtered for variable *Primary Energy*
+and year 2010.
+
+The ``validation`` arguments include: ``upper_bound``/``lower_bound`` *or*
+``value``/``rtol``/``atol`` (relative tolerance, absolute tolerance). Only one
+of these two forms can be set for each ``warning_level``.
+The possible levels are: ``error``, ``high``, ``medium``, or ``low``.
+For the same data filters, multiple warning levels, each with different
+criteria, can be set. These must be listed in descending order of severity,
+otherwise a ``ValidationError`` is raised.
+In the example, for the first criteria item, the validation arguments are set
+for warning levels ``error`` (the default when ``warning_level`` is omitted)
+and ``low``, using bounds.
+Flagged datapoints are skipped for lower severity warnings in the same criteria
+item (e.g., if datapoints are flagged for the ``error`` level, they will not be
+checked again for ``low``).
+
+The second criteria item (for variable *Primary Energy|Coal*) uses the old notation.
+It is deprecated because it is more verbose (each warning level requires
+a separate criteria item) and slower to process.
+
+Standard usage
+--------------
+
+Run the following in a Python script to check that an IAMC dataset contains the
+required data and that its values are valid.
+
+.. code-block:: python
+
+  from nomenclature import RequiredDataValidator
+  from nomenclature.processor import DataValidator
+
+  # ...setting directory/file paths and loading dataset
+
+  RequiredDataValidator.from_file(req_data_yaml).apply(df)
+  DataValidator.from_file(data_val_yaml).apply(df)
diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
@@ -149,6 +149,24 @@ def from_file(cls, file: Path | str) -> "DataValidator":
         return cls(file=file, criteria_items=criteria_items)
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
+        """Validates data in IAMC format according to specified criteria.
+
+        Logs warning/error messages for each criterion that is not met.
+
+        Parameters
+        ----------
+        df : IamDataFrame
+            Data in IAMC format to be validated
+
+        Returns
+        -------
+        IamDataFrame
+
+        Raises
+        ------
+            `ValueError` if any criterion has a warning level of `error`
+        """
+
         fail_list = []
         error = False
 

diff --git a/nomenclature/processor/required_data.py b/nomenclature/processor/required_data.py
@@ -147,6 +147,8 @@ def _wrong_unit_variables(
 
 
 class RequiredDataValidator(Processor):
+    """Processor for validating required dimensions in IAMC datapoints"""
+
     description: str | None = None
     model: list[str] | None = None
     required_data: list[RequiredData]
@@ -164,6 +166,21 @@ def from_file(cls, file: Path | str) -> "RequiredDataValidator":
         return cls(file=file, **content)
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
+        """Validates data in IAMC format according to required models and dimensions.
+
+        Parameters
+        ----------
+        df : IamDataFrame
+            Data in IAMC format to be validated
+
+        Returns
+        -------
+        IamDataFrame
+
+        Raises
+        ------
+            `ValueError` if any required dimension is not found in the data
+        """
         if self.model is not None:
             models_to_check = [model for model in df.model if model in self.model]
         else:

diff --git a/tests/data/required_data/required_data/requiredData.yaml b/tests/data/required_data/required_data/requiredData.yaml
@@ -6,4 +6,3 @@ required_data:
         unit: Mt CO2/yr
     region: World
     year: [2020, 2030, 2040, 2050]
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -6,4 +6,3 @@ required_data:
		unit: Mt CO2/yr
		region: World
		year: [2020, 2030, 2040, 2050]