diff --git a/docs/user_guide/config.rst b/docs/user_guide/config.rst index 558bb471..f000d7b1 100644 --- a/docs/user_guide/config.rst +++ b/docs/user_guide/config.rst @@ -57,11 +57,48 @@ multiple external repositories can be used as the example below illustrates for mappings: repository: common-definitions -The value in *definitions.region.repository* needs to reference the repository in the -*repositories* section. +The value in *definitions.region.repository* can be a list or a single value. For model mappings the process is analogous using *mappings.repository*. +Filter code lists imported from external repositories +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Since importing the entirety of, for example, common-definitions is too much for most +projects, the list can be filtered using ``include`` and ``exclude`` keywords. Under +these keywords, lists of filters can be given that will be applied to the code list from +the given repository. + +The filtering can be done by any attribute: + +.. code:: yaml + + repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + definitions: + variable: + repository: + name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Final Energy|Industry*" + depth: 2 + +If a filter is being used for repositories, the *name* attribute **must be used** +for the repository. + +In the example above we are including: + +1. All variables starting with *Primary Energy* or *Final Energy* +2. All variables starting with *Population* **and** with the tier attribute equal to 1 + +From this list we are then **excluding** all variables that match "Final +Energy|Industry\*" and have a depth of 2 (meaning that they contain two pipe "|" +characters). 
Adding countries to the region codelist --------------------------------------- diff --git a/nomenclature/code.py b/nomenclature/code.py index f983edc4..d92e3f9b 100644 --- a/nomenclature/code.py +++ b/nomenclature/code.py @@ -106,6 +106,10 @@ def flattened_dict_serialized(self): for key, value in self.flattened_dict.items() } + @property + def depth(self) -> int: + return self.name.count("|") + def replace_tag(self, tag: str, target: "Code") -> "Code": """Return a new instance with tag applied @@ -188,7 +192,7 @@ class VariableCode(Code): ) method: str | None = None check_aggregate: bool | None = Field(default=False, alias="check-aggregate") - components: Union[List[str], List[Dict[str, List[str]]]] | None = None + components: Union[List[str], Dict[str, list[str]]] | None = None drop_negative_weights: bool | None = None model_config = ConfigDict(populate_by_name=True) @@ -204,6 +208,18 @@ def deserialize_json(cls, v): def convert_none_to_empty_string(cls, v): return v if v is not None else "" + @field_validator("components", mode="before") + def cast_variable_components_args(cls, v): + """Cast "components" list of dicts to a codelist""" + + # translate a non-empty list of single-key dictionaries to a simple dictionary + if isinstance(v, list) and v and isinstance(v[0], dict): + comp = {} + for val in v: + comp.update(val) + return comp + return v + @field_serializer("unit") def convert_str_to_none_for_writing(self, v): return v if v != "" else None diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index 712481e9..1c177a0c 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -233,13 +233,12 @@ def from_directory( for repo in getattr( config.definitions, name.lower(), CodeListConfig() ).repositories: - code_list.extend( - cls._parse_codelist_dir( - config.repositories[repo].local_path / "definitions" / name, - file_glob_pattern, - repo, - ) + repository_code_list = cls._parse_codelist_dir( 
config.repositories[repo.name].local_path / "definitions" / name, + file_glob_pattern, + repo.name, ) + code_list.extend(repo.filter_list_of_codes(repository_code_list)) errors = ErrorCollector() mapping: Dict[str, Code] = {} for code in code_list: @@ -591,21 +590,6 @@ def check_weight_in_vars(cls, v): ) return v - @field_validator("mapping") - @classmethod - def cast_variable_components_args(cls, v): - """Cast "components" list of dicts to a codelist""" - - # translate a list of single-key dictionaries to a simple dictionary - for var in v.values(): - if var.components and isinstance(var.components[0], dict): - comp = {} - for val in var.components: - comp.update(val) - v[var.name].components = comp - - return v - def vars_default_args(self, variables: List[str]) -> List[VariableCode]: """return subset of variables which does not feature any special pyam aggregation arguments and where skip_region_aggregation is False""" @@ -758,21 +742,25 @@ def from_directory( # importing from an external repository for repo in config.definitions.region.repositories: - repo_path = config.repositories[repo].local_path / "definitions" / "region" + repo_path = ( + config.repositories[repo.name].local_path / "definitions" / "region" + ) - code_list = cls._parse_region_code_dir( - code_list, + repo_list_of_codes = cls._parse_region_code_dir( repo_path, file_glob_pattern, - repository=repo, + repository=repo.name, ) - code_list = cls._parse_and_replace_tags( - code_list, repo_path, file_glob_pattern + repo_list_of_codes = cls._parse_and_replace_tags( + repo_list_of_codes, repo_path, file_glob_pattern ) + code_list.extend(repo.filter_list_of_codes(repo_list_of_codes)) # parse from current repository - code_list = cls._parse_region_code_dir(code_list, path, file_glob_pattern) - code_list = cls._parse_and_replace_tags(code_list, path, file_glob_pattern) + local_code_list = cls._parse_region_code_dir(path, file_glob_pattern) + code_list.extend( + 
cls._parse_and_replace_tags(local_code_list, path, file_glob_pattern) + ) # translate to mapping mapping: Dict[str, RegionCode] = {} @@ -808,13 +796,12 @@ def hierarchy(self) -> List[str]: @classmethod def _parse_region_code_dir( cls, - code_list: List[Code], path: Path, file_glob_pattern: str = "**/*", repository: str | None = None, ) -> List[RegionCode]: """""" - + code_list: List[RegionCode] = [] for yaml_file in ( f for f in path.glob(file_glob_pattern) diff --git a/nomenclature/config.py b/nomenclature/config.py index b5a59b7a..ae2c74d3 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -1,6 +1,7 @@ from enum import Enum from pathlib import Path -from typing import Annotated, Optional +from typing import Any +import re import yaml from git import Repo @@ -11,29 +12,94 @@ field_validator, model_validator, ConfigDict, - BeforeValidator, ) +from nomenclature.code import Code +from pyam.str import escape_regexp + + +class CodeListFromRepository(BaseModel): + name: str + include: list[dict[str, Any]] = [{"name": "*"}] + exclude: list[dict[str, Any]] = Field(default_factory=list) + + def filter_function(self, code: Code, filter: dict[str, Any], keep: bool): + # if is list -> recursive + # if is str -> escape all special characters except "*" and use a regex (a missing or non-str code value never matches) + # if is int -> match exactly + # if is None -> Attribute does not exist therefore does not match + def check_attribute_match(code_value, filter_value): + if isinstance(filter_value, int): + return code_value == filter_value + if isinstance(filter_value, str): + pattern = re.compile(escape_regexp(filter_value) + "$") + return isinstance(code_value, str) and re.match(pattern, code_value) is not None + if isinstance(filter_value, list): + return any( + check_attribute_match(code_value, value) for value in filter_value + ) + if filter_value is None: + return False + raise ValueError("Something went wrong with the filtering") + + 
filter_match = all( + check_attribute_match(getattr(code, attribute, None), value) + for attribute, 
value in filter.items() + ) + if keep: + return filter_match + else: + return not filter_match + + def filter_list_of_codes(self, list_of_codes: list[Code]) -> list[Code]: + # include first + filter_result = [ + code + for code in list_of_codes + if any( + self.filter_function( + code, + filter, + keep=True, + ) + for filter in self.include + ) + ] + + if self.exclude: + filter_result = [ + code + for code in filter_result + if all( # keep a code only if it matches none of the exclude filters + self.filter_function(code, filter, keep=False) + for filter in self.exclude + ) + ] - -def convert_to_set(v: str | list[str] | set[str]) -> set[str]: - match v: - case set(v): - return v - case list(v): - return set(v) - case str(v): - return {v} - case _: - raise TypeError("`repositories` must be of type str, list or set.") + return filter_result class CodeListConfig(BaseModel): dimension: str | None = None - repositories: Annotated[set[str], BeforeValidator(convert_to_set)] = Field( - default_factory=set, alias="repository" + repositories: list[CodeListFromRepository] = Field( + default_factory=list, alias="repository" ) model_config = ConfigDict(populate_by_name=True) + @field_validator("repositories", mode="before") + @classmethod + def add_name_if_necessary(cls, v: list): + return [ + {"name": repository} if isinstance(repository, str) else repository + for repository in v + ] + + @field_validator("repositories", mode="before") + @classmethod + def convert_to_list_of_repos(cls, v): + if not isinstance(v, list): + return [v] + return v + @property def repository_dimension_path(self) -> str: return f"definitions/{self.dimension}" @@ -122,10 +188,10 @@ class DataStructureConfig(BaseModel): """ - model: Optional[CodeListConfig] = Field(default_factory=CodeListConfig) - scenario: Optional[CodeListConfig] = Field(default_factory=CodeListConfig) - region: Optional[RegionCodeListConfig] = Field(default_factory=RegionCodeListConfig) - variable: 
Optional[CodeListConfig] = Field(default_factory=CodeListConfig) + model: CodeListConfig = 
Field(default_factory=CodeListConfig) + scenario: CodeListConfig = Field(default_factory=CodeListConfig) + region: RegionCodeListConfig = Field(default_factory=RegionCodeListConfig) + variable: CodeListConfig = Field(default_factory=CodeListConfig) @field_validator("model", "scenario", "region", "variable", mode="before") @classmethod @@ -141,12 +207,30 @@ def repos(self) -> dict[str, str]: } +class MappingRepository(BaseModel): + name: str + + class RegionMappingConfig(BaseModel): - repositories: Annotated[set[str], BeforeValidator(convert_to_set)] = Field( - default_factory=set, alias="repository" + repositories: list[MappingRepository] = Field( + default_factory=list, alias="repository" ) model_config = ConfigDict(populate_by_name=True) + @field_validator("repositories", mode="before") + @classmethod + def add_name_if_necessary(cls, v: list): + return [ + {"name": repository} if isinstance(repository, str) else repository + for repository in v + ] + + @field_validator("repositories", mode="before") + def convert_to_set_of_repos(cls, v): + if not isinstance(v, list): + return [v] + return v + class DimensionEnum(str, Enum): model = "model" @@ -172,8 +256,9 @@ def check_definitions_repository( mapping_repos = {"mappings": v.mappings.repositories} if v.mappings else {} repos = {**v.definitions.repos, **mapping_repos} for use, repositories in repos.items(): - if repositories - v.repositories.keys(): - raise ValueError((f"Unknown repository {repositories} in '{use}'.")) + repository_names = [repository.name for repository in repositories] + if unknown_repos := repository_names - v.repositories.keys(): + raise ValueError((f"Unknown repository {unknown_repos} in '{use}'.")) return v def fetch_repos(self, target_folder: Path): diff --git a/nomenclature/processor/region.py b/nomenclature/processor/region.py index ce7ba43d..92ac5248 100644 --- a/nomenclature/processor/region.py +++ b/nomenclature/processor/region.py @@ -486,7 +486,7 @@ def from_directory(cls, path: 
DirectoryPath, dsd: DataStructureDefinition): mapping_files.extend( f for f in ( - dsd.config.repositories[repository].local_path / "mappings" + dsd.config.repositories[repository.name].local_path / "mappings" ).glob("**/*") if f.suffix in {".yaml", ".yml"} ) diff --git a/poetry.lock b/poetry.lock index 208977e7..45bd6595 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "alabaster" diff --git a/tests/data/config/external_repo_filters.yaml b/tests/data/config/external_repo_filters.yaml new file mode 100644 index 00000000..bb77dd7b --- /dev/null +++ b/tests/data/config/external_repo_filters.yaml @@ -0,0 +1,20 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ +definitions: + variable: + repository: + name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Final Energy|*|*" + region: + repository: + name: common-definitions + include: + - hierarchy: R5 + exclude: + - name: Other (R5) diff --git a/tests/data/config/filter/definitions/region/.gitkeep b/tests/data/config/filter/definitions/region/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/config/filter/definitions/variable/.gitkeep b/tests/data/config/filter/definitions/variable/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/config/filter/nomenclature.yaml b/tests/data/config/filter/nomenclature.yaml new file mode 100644 index 00000000..32344089 --- /dev/null +++ b/tests/data/config/filter/nomenclature.yaml @@ -0,0 +1,17 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ 
+definitions: + variable: + repository: + - name: common-definitions + filters: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + - legacy-definitions + region: + repository: common-definitions + country: true diff --git a/tests/data/config/multiple_external_repos_filters.yaml b/tests/data/config/multiple_external_repos_filters.yaml new file mode 100644 index 00000000..939e17b7 --- /dev/null +++ b/tests/data/config/multiple_external_repos_filters.yaml @@ -0,0 +1,16 @@ +repositories: + common-definitions: + url: https://github.com/IAMconsortium/common-definitions.git/ + legacy-definitions: + url: https://github.com/IAMconsortium/legacy-definitions.git/ +definitions: + variable: + repository: + - name: common-definitions + include: + - name: [Primary Energy*, Final Energy*] + - name: "Population*" + tier: 1 + exclude: + - name: "Primary Energy|*|*" # exclude all third tier variables + - legacy-definitions diff --git a/tests/data/nomenclature_configs/regions/.gitkeep b/tests/data/nomenclature_configs/regions/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/region_processing/external_repo_test_missing_region/definitions/.keep b/tests/data/region_processing/external_repo_test_missing_region/definitions/.keep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/region_processing/external_repo_test_missing_region/mappings/.keep b/tests/data/region_processing/external_repo_test_missing_region/mappings/.keep new file mode 100644 index 00000000..e69de29b diff --git a/tests/data/region_processing/external_repo_test_missing_region/nomenclature.yaml b/tests/data/region_processing/external_repo_test_missing_region/nomenclature.yaml new file mode 100644 index 00000000..1dacf528 --- /dev/null +++ b/tests/data/region_processing/external_repo_test_missing_region/nomenclature.yaml @@ -0,0 +1,17 @@ +dimensions: + - region + - variable +repositories: + common-definitions: + url: 
https://github.com/IAMconsortium/common-definitions.git/ + hash: cb85704 +definitions: + region: + repository: + name: common-definitions + include: + - name: "World" + variable: + repository: common-definitions +mappings: + repository: common-definitions diff --git a/tests/test_codelist.py b/tests/test_codelist.py index 85b01218..6efa7f99 100644 --- a/tests/test_codelist.py +++ b/tests/test_codelist.py @@ -488,3 +488,56 @@ def test_variablecodelist_list_missing_variables_to_new_file(simple_df, tmp_path } assert obs.mapping == exp + + +def test_variable_code_list_external_repo_with_filters(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "config" / "external_repo_filters.yaml" + ) + try: + variable_code_list = VariableCodeList.from_directory( + "variable", + TEST_DATA_DIR / "nomenclature_configs" / "variable", + nomenclature_config, + ) + exp_included_variables = [ + "Final Energy", + "Population", + "Primary Energy|Oil|Hydrogen|w/ CCS", + ] + exp_excluded_variables = [ + "Final Energy|Agriculture|Electricity", # no third level Final Energy + "Population|Clean Cooking Access", # only tier 1 Population + ] + assert all( + variable in variable_code_list for variable in exp_included_variables + ) + assert all( + variable not in variable_code_list for variable in exp_excluded_variables + ) + finally: + clean_up_external_repos(nomenclature_config.repositories) + + +def test_region_code_list_external_repo_with_filters(): + nomenclature_config = NomenclatureConfig.from_file( + TEST_DATA_DIR / "config" / "external_repo_filters.yaml" + ) + try: + region_code_list = RegionCodeList.from_directory( + "region", + TEST_DATA_DIR / "config" / "variable", + nomenclature_config, + ) + R5_regions = [ + "OECD & EU (R5)", + "Reforming Economies (R5)", + "Asia (R5)", + "Middle East & Africa (R5)", + "Latin America (R5)", + ] + assert len(region_code_list) == 5 + assert all(r5_region in region_code_list for r5_region in R5_regions) + assert "Other (R5)" not in 
region_code_list + finally: + clean_up_external_repos(nomenclature_config.repositories) diff --git a/tests/test_config.py b/tests/test_config.py index 70449073..1385bfa5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,11 @@ from pathlib import Path +import pytest from pytest import raises -from nomenclature.config import Repository, NomenclatureConfig, CodeListConfig +from nomenclature.config import ( + Repository, + NomenclatureConfig, +) from conftest import TEST_DATA_DIR, clean_up_external_repos @@ -32,24 +36,16 @@ def test_multiple_definition_repos(): try: exp_repos = {"common-definitions", "legacy-definitions"} assert nomenclature_config.repositories.keys() == exp_repos - assert nomenclature_config.definitions.variable.repositories == exp_repos finally: clean_up_external_repos(nomenclature_config.repositories) -def test_codelist_config_set_input(): - exp_repos = {"repo1", "repo2"} - code_list_config = CodeListConfig(dimension="variable", repositories=exp_repos) - assert code_list_config.repositories == exp_repos - - def test_multiple_mapping_repos(): nomenclature_config = NomenclatureConfig.from_file( MODULE_TEST_DATA_DIR / "multiple_repos_for_mapping.yaml" ) try: exp_repos = {"common-definitions", "legacy-definitions"} - assert nomenclature_config.mappings.repositories == exp_repos assert nomenclature_config.repositories.keys() == exp_repos finally: clean_up_external_repos(nomenclature_config.repositories) @@ -85,3 +81,15 @@ def test_invalid_config_dimensions_raises(): ), ): NomenclatureConfig(dimensions=["year"]) + + +@pytest.mark.parametrize( + "config_file", + ["external_repo_filters.yaml", "multiple_external_repos_filters.yaml"], +) +def test_config_with_filter(config_file): + config = NomenclatureConfig.from_file(TEST_DATA_DIR / "config" / config_file) + try: + assert isinstance(config.definitions.variable.repositories, list) + finally: + clean_up_external_repos(config.repositories) diff --git a/tests/test_region_aggregation.py 
b/tests/test_region_aggregation.py index 31711225..80ed54cb 100644 --- a/tests/test_region_aggregation.py +++ b/tests/test_region_aggregation.py @@ -256,6 +256,26 @@ def test_mapping_from_external_repository(): clean_up_external_repos(dsd.config.repositories) +def test_mapping_from_external_repository_missing_regions_raises(): + try: + with pytest.raises( + pydantic.ValidationError, + match="validation errors for RegionProcessor", + ): + RegionProcessor.from_directory( + TEST_FOLDER_REGION_PROCESSING + / "external_repo_test_missing_region" + / "mappings", + dsd := DataStructureDefinition( + TEST_FOLDER_REGION_PROCESSING + / "external_repo_test_missing_region" + / "definitions" + ), + ) + finally: + clean_up_external_repos(dsd.config.repositories) + + def test_reverse_region_aggregation(): processor = RegionProcessor.from_directory( TEST_FOLDER_REGION_PROCESSING / "complete_processing_list",