From b3767028fc11a1ce3d39941a6a72c6a00ed27d2e Mon Sep 17 00:00:00 2001
From: David Almeida <almeida@iiasa.ac.at>
Date: Fri, 31 Jan 2025 12:57:26 +0100
Subject: [PATCH] Add docstrings and docs pages for `RequiredDataValidator` and
 `DataValidator`

---
 docs/user_guide.rst                           |   1 +
 docs/user_guide/data-validation.rst           | 101 ++++++++++++++++++
 nomenclature/processor/data_validator.py      |  18 ++++
 nomenclature/processor/required_data.py       |  17 +++
 .../required_data/requiredData.yaml           |   1 -
 5 files changed, 137 insertions(+), 1 deletion(-)
 create mode 100644 docs/user_guide/data-validation.rst

diff --git a/docs/user_guide.rst b/docs/user_guide.rst
index a78b134a..ba6cf4f5 100644
--- a/docs/user_guide.rst
+++ b/docs/user_guide.rst
@@ -49,3 +49,4 @@ the RegionProcessor and validated using DataStructureDefinition.
   user_guide/model-registration
   user_guide/config
   user_guide/local-usage
+  user_guide/data-validation
diff --git a/docs/user_guide/data-validation.rst b/docs/user_guide/data-validation.rst
new file mode 100644
index 00000000..c960bc68
--- /dev/null
+++ b/docs/user_guide/data-validation.rst
@@ -0,0 +1,101 @@
+.. _data-validation:
+
+.. currentmodule:: nomenclature
+
+Data validation
+===============
+
+The **nomenclature** package allows users to validate IAMC data in two ways:
+checking that required data is present, and checking that data values are within
+reasonable ranges. The validation requirements and criteria for both are specified
+in YAML configuration files.
+
+Required data validation
+------------------------
+
+**Required data validation** checks if certain models, variables, regions and/or
+periods of time are covered in the datapoints.
+
+For this, a configuration file specifies the model(s) and dimension(s) expected
+in the dataset. These are ``variable``, ``region`` and/or ``year``.
+Alternatively, instead of using ``variable``, it's possible to declare measurands,
+which can also specify units.
+
+.. code:: yaml
+
+  description: Required variables for running MAGICC
+  model: model_a
+  required_data:
+    - measurand:
+        Emissions|CO2:
+          unit: Mt CO2/yr
+      region: World
+      year: [2020, 2030, 2040, 2050]
+
+In the example above, for *model_a*, the dataset must include datapoints of the
+variable *Emissions|CO2* (measured in *Mt CO2/yr*), in the region *World*, for the
+years 2020, 2030, 2040 and 2050.
+
+Data validation
+---------------
+
+**Data validation** checks if data values are within reasonable ranges.
+
+Consider the example below:
+
+.. code:: yaml
+
+  - variable: Primary Energy
+    year: 2010
+    validation:
+      - upper_bound: 5
+        lower_bound: 1
+      - warning_level: low
+        upper_bound: 2.5
+        lower_bound: 1
+  - variable: Primary Energy|Coal
+    year: 2010
+    value: 5
+    rtol: 2
+    atol: 1
+
+Each criteria item contains **data filter arguments** and **validation arguments**.
+
+Data filter arguments include: ``model``, ``scenario``, ``region``, ``variable``,
+``unit``, and ``year``.
+For the first criteria item, the data is filtered for variable *Primary Energy*
+and year 2010.
+
+The ``validation`` arguments include: ``upper_bound``/``lower_bound`` *or*
+``value``/``rtol``/``atol`` (relative tolerance, absolute tolerance). Only one
+of these two options can be set for each ``warning_level``.
+The possible levels are: ``error``, ``high``, ``medium``, or ``low``.
+For the same data filters, multiple warning levels with different criteria each
+can be set. These must be listed in descending order of severity, otherwise a
+``ValidationError`` is raised.
+In the example, for the first criteria item, the validation arguments are set
+for warning level ``error`` (by default, in case of omission) and ``low``,
+using bounds.
+Flagged datapoints are skipped for lower severity warnings in the same criteria
+item (e.g., if datapoints are flagged for the ``error`` level, they will not be
+checked again for ``low``).
+
+The second criteria item (for variable *Primary Energy|Coal*) uses the old notation
+(validation arguments set directly on the item). It is deprecated for being more
+verbose (each warning level needs a separate criteria item) and slower to process.
+
+Standard usage
+--------------
+
+Run the following in a Python script to check that an IAMC dataset contains the
+required data and that its values are valid.
+
+.. code-block:: python
+
+  from nomenclature import RequiredDataValidator
+  from nomenclature.processor import DataValidator
+
+  # ...setting directory/file paths and loading dataset
+
+  RequiredDataValidator.from_file(req_data_yaml).apply(df)
+  DataValidator.from_file(data_val_yaml).apply(df)
diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py
index dda02f9c..60507c9d 100644
--- a/nomenclature/processor/data_validator.py
+++ b/nomenclature/processor/data_validator.py
@@ -149,6 +149,24 @@ def from_file(cls, file: Path | str) -> "DataValidator":
         return cls(file=file, criteria_items=criteria_items)
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
+        """Validates data in IAMC format according to specified criteria.
+
+        Logs warning/error messages for each criterion that is not met.
+
+        Parameters
+        ----------
+        df : IamDataFrame
+            Data in IAMC format to be validated
+
+        Returns
+        -------
+        IamDataFrame
+
+        Raises
+        ------
+        ValueError
+            If any criterion that is not met has a warning level of `error`
+        """
         fail_list = []
         error = False
 
diff --git a/nomenclature/processor/required_data.py b/nomenclature/processor/required_data.py
index 68826085..eeae1607 100644
--- a/nomenclature/processor/required_data.py
+++ b/nomenclature/processor/required_data.py
@@ -147,6 +147,8 @@ def _wrong_unit_variables(
 
 
 class RequiredDataValidator(Processor):
+    """Processor for validating required dimensions in IAMC datapoints"""
+
     description: str | None = None
     model: list[str] | None = None
     required_data: list[RequiredData]
@@ -164,6 +166,21 @@ def from_file(cls, file: Path | str) -> "RequiredDataValidator":
         return cls(file=file, **content)
 
     def apply(self, df: IamDataFrame) -> IamDataFrame:
+        """Validates data in IAMC format according to required models and dimensions.
+
+        Parameters
+        ----------
+        df : IamDataFrame
+            Data in IAMC format to be validated
+
+        Returns
+        -------
+        IamDataFrame
+
+        Raises
+        ------
+            `ValueError` if any required dimension is not found in the data
+        """
         if self.model is not None:
             models_to_check = [model for model in df.model if model in self.model]
         else:
diff --git a/tests/data/required_data/required_data/requiredData.yaml b/tests/data/required_data/required_data/requiredData.yaml
index 721cafd9..e9b2be3a 100644
--- a/tests/data/required_data/required_data/requiredData.yaml
+++ b/tests/data/required_data/required_data/requiredData.yaml
@@ -6,4 +6,3 @@ required_data:
         unit: Mt CO2/yr
     region: World
     year: [2020, 2030, 2040, 2050]
-