diff --git a/src/pyobo/__init__.py b/src/pyobo/__init__.py index adaf461d..9bb1f30c 100644 --- a/src/pyobo/__init__.py +++ b/src/pyobo/__init__.py @@ -6,6 +6,8 @@ get_children, get_definition, get_descendants, + get_edges, + get_edges_df, get_filtered_properties_df, get_filtered_properties_mapping, get_filtered_properties_multimapping, @@ -20,11 +22,15 @@ get_id_synonyms_mapping, get_id_to_alts, get_ids, + get_literal_properties, + get_literal_properties_df, get_mappings_df, get_metadata, get_name, get_name_by_curie, get_name_id_mapping, + get_object_properties, + get_object_properties_df, get_obsolete, get_primary_curie, get_primary_identifier, @@ -80,6 +86,8 @@ "get_children", "get_definition", "get_descendants", + "get_edges", + "get_edges_df", "get_filtered_properties_df", "get_filtered_properties_mapping", "get_filtered_properties_multimapping", @@ -94,11 +102,15 @@ "get_id_synonyms_mapping", "get_id_to_alts", "get_ids", + "get_literal_properties", + "get_literal_properties_df", "get_mappings_df", "get_metadata", "get_name", "get_name_by_curie", "get_name_id_mapping", + "get_object_properties", + "get_object_properties_df", "get_obsolete", "get_ontology", "get_primary_curie", diff --git a/src/pyobo/api/__init__.py b/src/pyobo/api/__init__.py index b534612d..b2991d0d 100644 --- a/src/pyobo/api/__init__.py +++ b/src/pyobo/api/__init__.py @@ -6,6 +6,7 @@ get_primary_curie, get_primary_identifier, ) +from .edges import get_edges, get_edges_df, get_graph from .hierarchy import ( get_ancestors, get_children, @@ -32,13 +33,16 @@ get_filtered_properties_df, get_filtered_properties_mapping, get_filtered_properties_multimapping, + get_literal_properties, + get_literal_properties_df, + get_object_properties, + get_object_properties_df, get_properties, get_properties_df, get_property, ) from .relations import ( get_filtered_relations_df, - get_graph, get_id_multirelations_mapping, get_relation, get_relation_mapping, @@ -61,6 +65,8 @@ "get_children", "get_definition", 
"get_descendants", + "get_edges", + "get_edges_df", "get_equivalent", "get_filtered_properties_df", "get_filtered_properties_mapping", @@ -76,11 +82,15 @@ "get_id_synonyms_mapping", "get_id_to_alts", "get_ids", + "get_literal_properties", + "get_literal_properties_df", "get_mappings_df", "get_metadata", "get_name", "get_name_by_curie", "get_name_id_mapping", + "get_object_properties", + "get_object_properties_df", "get_obsolete", "get_ontology", "get_primary_curie", diff --git a/src/pyobo/api/edges.py b/src/pyobo/api/edges.py new file mode 100644 index 00000000..1469b83d --- /dev/null +++ b/src/pyobo/api/edges.py @@ -0,0 +1,68 @@ +"""High-level API for edges.""" + +import networkx as nx +import pandas as pd +from tqdm import tqdm +from typing_extensions import Unpack + +from pyobo.api.names import get_ids +from pyobo.api.utils import get_version_from_kwargs +from pyobo.constants import ( + GetOntologyKwargs, + check_should_cache, + check_should_force, + check_should_use_tqdm, +) +from pyobo.getters import get_ontology +from pyobo.struct import Reference +from pyobo.utils.path import prefix_cache_join + +from ..utils.cache import cached_df + +__all__ = [ + "get_edges", + "get_edges_df", + "get_graph", +] + + +def get_graph(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> nx.DiGraph: + """Get the relation graph.""" + rv = nx.MultiDiGraph() + for s in get_ids(prefix, **kwargs): + rv.add_node(f"{prefix}:{s}") + df = get_edges_df(prefix=prefix, **kwargs) + for s, p, o in df.values: + rv.add_edge(s, p, key=o) + return rv + + +def get_edges_df(prefix, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame: + """Get a dataframe of edges triples.""" + version = get_version_from_kwargs(prefix, kwargs) + path = prefix_cache_join(prefix, name="object_properties.tsv", version=version) + + @cached_df( + path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs) + ) + def _df_getter() -> pd.DataFrame: + return get_ontology(prefix, 
**kwargs).get_edges_df(use_tqdm=check_should_use_tqdm(kwargs)) + + return _df_getter() + + +def get_edges( + prefix, **kwargs: Unpack[GetOntologyKwargs] +) -> list[tuple[Reference, Reference, Reference]]: + """Get a list of edge triples.""" + df = get_edges_df(prefix, **kwargs) + return [ + (Reference.from_curie(s), Reference.from_curie(p), Reference.from_curie(o)) + for s, p, o in tqdm( + df.values, + desc=f"[{prefix}] parsing edges", + unit="edge", + unit_scale=True, + disable=not check_should_use_tqdm(kwargs), + ) + ] diff --git a/src/pyobo/api/hierarchy.py b/src/pyobo/api/hierarchy.py index 08409430..39ebdaaf 100644 --- a/src/pyobo/api/hierarchy.py +++ b/src/pyobo/api/hierarchy.py @@ -5,12 +5,15 @@ from functools import lru_cache import networkx as nx - -from .names import get_name -from .properties import get_filtered_properties_mapping -from .relations import get_filtered_relations_df -from ..identifier_utils import wrap_norm_prefix -from ..struct import has_member, is_a, part_of +from curies import ReferenceTuple +from typing_extensions import Unpack + +from .edges import get_edges_df +from .names import get_ids, get_name +from .properties import get_literal_properties +from .utils import _get_pi +from ..constants import GetOntologyKwargs +from ..struct import has_member, has_part, is_a, member_of, part_of from ..struct.reference import Reference from ..struct.struct_utils import ReferenceHint, _ensure_ref @@ -24,23 +27,22 @@ "is_descendent", ] - logger = logging.getLogger(__name__) +class HierarchyKwargs(GetOntologyKwargs): + """Keyword argument hints for hierarchy getter functions.""" + + include_part_of: bool + include_has_member: bool + + def get_hierarchy( prefix: str, *, - include_part_of: bool = True, - include_has_member: bool = False, extra_relations: Iterable[ReferenceHint] | None = None, properties: Iterable[ReferenceHint] | None = None, - use_tqdm: bool = False, - force: bool = False, - force_process: bool = False, - version: str | None = None, - 
strict: bool = True, - cache: bool = True, + **kwargs: Unpack[HierarchyKwargs], ) -> nx.DiGraph: """Get hierarchy of parents as a directed graph. @@ -54,135 +56,72 @@ def get_hierarchy( example, it might be useful to include the positively_regulates :param properties: Properties to include in the data part of each node. For example, might want to include SMILES strings with the ChEBI tree. - :param use_tqdm: Show a progress bar :param force: should the resources be reloaded when extracting relations? :returns: A directional graph representing the hierarchy This function thinly wraps :func:`_get_hierarchy_helper` to make it easier to work with the lru_cache mechanism. """ - extra_relations_ = tuple( - sorted(_ensure_ref(r, ontology_prefix=prefix) for r in extra_relations or []) - ) - properties_ = tuple( - sorted(_ensure_ref(prop, ontology_prefix=prefix) for prop in properties or []) - ) - return _get_hierarchy_helper( prefix=prefix, - include_part_of=include_part_of, - include_has_member=include_has_member, - extra_relations=extra_relations_, - properties=properties_, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, - cache=cache, + extra_relations=_tp(prefix, extra_relations), + properties=_tp(prefix, properties), + **kwargs, + ) + + +def _tp(prefix: str, references: Iterable[ReferenceHint] | None) -> tuple[Reference, ...]: + return tuple( + sorted(_ensure_ref(reference, ontology_prefix=prefix) for reference in references or []) ) @lru_cache -@wrap_norm_prefix def _get_hierarchy_helper( prefix: str, *, extra_relations: tuple[Reference, ...], properties: tuple[Reference, ...], - include_part_of: bool, - include_has_member: bool, - use_tqdm: bool, - force: bool = False, - force_process: bool = False, - version: str | None = None, - strict: bool = True, - cache: bool = True, + include_part_of: bool = False, + include_has_member: bool = False, + **kwargs: Unpack[GetOntologyKwargs], ) -> nx.DiGraph: - rv = 
nx.DiGraph() - - is_a_df = get_filtered_relations_df( - prefix=prefix, - relation=is_a, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, + predicates, reverse_predicates = _get_predicate_sets( + extra_relations, include_part_of, include_has_member ) - for source_id, target_ns, target_id in is_a_df.values: - rv.add_edge(f"{prefix}:{source_id}", f"{target_ns}:{target_id}", relation="is_a") - if include_has_member: - has_member_df = get_filtered_relations_df( - prefix=prefix, - relation=has_member, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, - ) - for target_id, source_ns, source_id in has_member_df.values: - rv.add_edge(f"{source_ns}:{source_id}", f"{prefix}:{target_id}", relation="is_a") + rv = nx.DiGraph() + for s in get_ids(prefix, **kwargs): + rv.add_node(f"{prefix}:{s}") - if include_part_of: - part_of_df = get_filtered_relations_df( - prefix=prefix, - relation=part_of, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, - ) - for source_id, target_ns, target_id in part_of_df.values: - rv.add_edge(f"{prefix}:{source_id}", f"{target_ns}:{target_id}", relation="part_of") - - has_part_df = get_filtered_relations_df( - prefix=prefix, - relation=part_of, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, - ) - for target_id, source_ns, source_id in has_part_df.values: - rv.add_edge(f"{source_ns}:{source_id}", f"{prefix}:{target_id}", relation="part_of") - - for relation in extra_relations: - relation_df = get_filtered_relations_df( - prefix=prefix, - relation=relation, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - version=version, - strict=strict, - ) - for source_id, target_ns, target_id in relation_df.values: - rv.add_edge( - f"{prefix}:{source_id}", f"{target_ns}:{target_id}", relation=relation.identifier - ) - - for prop in 
properties: - props = get_filtered_properties_mapping( - prefix=prefix, - prop=prop, - use_tqdm=use_tqdm, - force=force, - force_process=force_process, - strict=strict, - version=version, - ) - for identifier, value in props.items(): - curie = f"{prefix}:{identifier}" - if curie in rv: - rv.nodes[curie][prop] = value + edges_df = get_edges_df(prefix, **kwargs) + for s, p, o in edges_df.values: + if p in predicates: + rv.add_edge(s, o, relation=p) + elif p in reverse_predicates: + rv.add_edge(o, s, relation=p) + + properties_ = set(properties) + for s, p, op in get_literal_properties(prefix, **kwargs): + if s.curie in rv and p in properties_: + rv.nodes[s.curie][p] = op.value return rv +def _get_predicate_sets( + extra_relations: Iterable[Reference], include_part_of: bool, include_has_member: bool +) -> tuple[set[str], set[str]]: + predicates: set[Reference] = {is_a.reference, *extra_relations} + reverse_predicates: set[Reference] = set() + if include_part_of: + predicates.add(part_of.reference) + reverse_predicates.add(has_part.reference) + if include_has_member: + predicates.add(has_member.reference) + reverse_predicates.add(member_of.reference) + return {p.curie for p in predicates}, {p.curie for p in reverse_predicates} + + def is_descendent( prefix, identifier, ancestor_prefix, ancestor_identifier, *, version: str | None = None ) -> bool: @@ -198,61 +137,32 @@ def is_descendent( @lru_cache def get_descendants( - prefix: str, + prefix: str | Reference | ReferenceTuple, identifier: str | None = None, - include_part_of: bool = True, - include_has_member: bool = False, - use_tqdm: bool = False, - force: bool = False, - **kwargs, + /, + **kwargs: Unpack[HierarchyKwargs], ) -> set[str] | None: """Get all the descendants (children) of the term as CURIEs.""" - curie, prefix, identifier = _pic(prefix, identifier) - hierarchy = get_hierarchy( - prefix=prefix, - include_has_member=include_has_member, - include_part_of=include_part_of, - use_tqdm=use_tqdm, - force=force, - **kwargs, - 
) - if curie not in hierarchy: + t = _get_pi(prefix, identifier) + hierarchy = get_hierarchy(prefix=t.prefix, **kwargs) + if t.curie not in hierarchy: return None - return nx.ancestors(hierarchy, curie) # note this is backwards - - -def _pic(prefix, identifier=None) -> tuple[str, str, str]: - if identifier is None: - curie = prefix - prefix, identifier = prefix.split(":") - else: - curie = f"{prefix}:{identifier}" - return curie, prefix, identifier + return nx.ancestors(hierarchy, t.curie) # note this is backwards @lru_cache def get_children( - prefix: str, + prefix: str | Reference | ReferenceTuple, identifier: str | None = None, - include_part_of: bool = True, - include_has_member: bool = False, - use_tqdm: bool = False, - force: bool = False, - **kwargs, + /, + **kwargs: Unpack[HierarchyKwargs], ) -> set[str] | None: """Get all the descendants (children) of the term as CURIEs.""" - curie, prefix, identifier = _pic(prefix, identifier) - hierarchy = get_hierarchy( - prefix=prefix, - include_has_member=include_has_member, - include_part_of=include_part_of, - use_tqdm=use_tqdm, - force=force, - **kwargs, - ) - if curie not in hierarchy: + t = _get_pi(prefix, identifier) + hierarchy = get_hierarchy(prefix=t.prefix, **kwargs) + if t.curie not in hierarchy: return None - return set(hierarchy.predecessors(curie)) + return set(hierarchy.predecessors(t.curie)) def has_ancestor( @@ -269,52 +179,30 @@ def has_ancestor( @lru_cache def get_ancestors( - prefix: str, + prefix: str | Reference | ReferenceTuple, identifier: str | None = None, - include_part_of: bool = True, - include_has_member: bool = False, - use_tqdm: bool = False, - force: bool = False, - **kwargs, + /, + **kwargs: Unpack[HierarchyKwargs], ) -> set[str] | None: """Get all the ancestors (parents) of the term as CURIEs.""" - curie, prefix, identifier = _pic(prefix, identifier) - hierarchy = get_hierarchy( - prefix=prefix, - include_has_member=include_has_member, - include_part_of=include_part_of, - 
use_tqdm=use_tqdm, - force=force, - **kwargs, - ) - if curie not in hierarchy: + t = _get_pi(prefix, identifier) + hierarchy = get_hierarchy(prefix=t.prefix, **kwargs) + if t.curie not in hierarchy: return None - return nx.descendants(hierarchy, curie) # note this is backwards + return nx.descendants(hierarchy, t.curie) # note this is backwards def get_subhierarchy( - prefix: str, + prefix: str | Reference | ReferenceTuple, identifier: str | None = None, - include_part_of: bool = True, - include_has_member: bool = False, - use_tqdm: bool = False, - force: bool = False, - **kwargs, + /, + **kwargs: Unpack[HierarchyKwargs], ) -> nx.DiGraph: """Get the subhierarchy for a given node.""" - curie, prefix, identifier = _pic(prefix, identifier) - hierarchy = get_hierarchy( - prefix=prefix, - include_has_member=include_has_member, - include_part_of=include_part_of, - use_tqdm=use_tqdm, - force=force, - **kwargs, - ) - logger.info( - "getting descendants of %s:%s ! %s", prefix, identifier, get_name(prefix, identifier) - ) - curies = nx.ancestors(hierarchy, curie) # note this is backwards + t = _get_pi(prefix, identifier) + hierarchy = get_hierarchy(prefix=t.prefix, **kwargs) + logger.info("getting descendants of %s ! 
%s", t.curie, get_name(t)) + curies = set(nx.ancestors(hierarchy, t.curie)) | {t.curie} # note this is backwards logger.info("inducing subgraph") sg = hierarchy.subgraph(curies).copy() logger.info("subgraph has %d nodes/%d edges", sg.number_of_nodes(), sg.number_of_edges()) diff --git a/src/pyobo/api/names.py b/src/pyobo/api/names.py index ba8b3414..297512bc 100644 --- a/src/pyobo/api/names.py +++ b/src/pyobo/api/names.py @@ -12,7 +12,7 @@ from typing_extensions import Unpack from .alts import get_primary_identifier -from .utils import get_version, get_version_from_kwargs +from .utils import _get_pi, get_version, get_version_from_kwargs from ..constants import GetOntologyKwargs, check_should_cache, check_should_force from ..getters import NoBuildError, get_ontology from ..identifier_utils import wrap_norm_prefix @@ -87,12 +87,8 @@ def get_name( **kwargs: Unpack[GetOntologyKwargs], ) -> str | None: """Get the name for an entity.""" - if isinstance(prefix, ReferenceTuple | Reference): - identifier = prefix.identifier - prefix = prefix.prefix - if identifier is None: - raise ValueError("identifier is None") - return _help_get(get_id_name_mapping, prefix=prefix, identifier=identifier, **kwargs) + t = _get_pi(prefix, identifier) + return _help_get(get_id_name_mapping, prefix=t.prefix, identifier=t.identifier, **kwargs) @lru_cache diff --git a/src/pyobo/api/properties.py b/src/pyobo/api/properties.py index 5bcc77fd..9f6c0382 100644 --- a/src/pyobo/api/properties.py +++ b/src/pyobo/api/properties.py @@ -4,6 +4,7 @@ from collections.abc import Mapping import pandas as pd +from tqdm import tqdm from typing_extensions import Unpack from .utils import get_version_from_kwargs @@ -15,8 +16,9 @@ ) from ..getters import get_ontology from ..identifier_utils import wrap_norm_prefix -from ..struct.struct_utils import ReferenceHint, _ensure_ref -from ..utils.cache import cached_df, cached_mapping, cached_multidict +from ..struct.reference import Reference +from ..struct.struct_utils 
import OBOLiteral, ReferenceHint, _ensure_ref +from ..utils.cache import cached_df from ..utils.io import multidict from ..utils.path import prefix_cache_join @@ -24,6 +26,10 @@ "get_filtered_properties_df", "get_filtered_properties_mapping", "get_filtered_properties_multimapping", + "get_literal_properties", + "get_literal_properties_df", + "get_object_properties", + "get_object_properties_df", "get_properties", "get_properties_df", "get_property", @@ -32,6 +38,70 @@ logger = logging.getLogger(__name__) +def get_object_properties_df(prefix, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame: + """Get a dataframe of object property triples.""" + version = get_version_from_kwargs(prefix, kwargs) + path = prefix_cache_join(prefix, name="object_properties.tsv", version=version) + + @cached_df( + path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs) + ) + def _df_getter() -> pd.DataFrame: + return get_ontology(prefix, **kwargs).get_object_properties_df( + use_tqdm=check_should_use_tqdm(kwargs) + ) + + return _df_getter() + + +def get_object_properties( + prefix, **kwargs: Unpack[GetOntologyKwargs] +) -> list[tuple[Reference, Reference, Reference]]: + """Get a list of object property triples.""" + df = get_object_properties_df(prefix, **kwargs) + return [ + (Reference.from_curie(s), Reference.from_curie(p), Reference.from_curie(o)) + for s, p, o in df.values + ] + + +def get_literal_properties( + prefix: str, **kwargs: Unpack[GetOntologyKwargs] +) -> list[tuple[Reference, Reference, OBOLiteral]]: + """Get a list of literal property triples.""" + df = get_literal_properties_df(prefix, **kwargs) + return [ + ( + Reference.from_curie(s), + Reference.from_curie(p), + OBOLiteral(value, Reference.from_curie(datatype)), + ) + for s, p, value, datatype in tqdm( + df.values, + desc=f"[{prefix}] parsing properties", + unit_scale=True, + unit="triple", + disable=not check_should_use_tqdm(kwargs), + ) + ] + + +def 
get_literal_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame: + """Get a dataframe of literal property quads.""" + version = get_version_from_kwargs(prefix, kwargs) + path = prefix_cache_join(prefix, name="literal_properties.tsv", version=version) + + @cached_df( + path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs) + ) + def _df_getter() -> pd.DataFrame: + return get_ontology(prefix, **kwargs).get_literal_properties_df( + use_tqdm=check_should_use_tqdm(kwargs) + ) + + return _df_getter() + + @wrap_norm_prefix def get_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.DataFrame: """Extract properties. @@ -46,10 +116,9 @@ def get_properties_df(prefix: str, **kwargs: Unpack[GetOntologyKwargs]) -> pd.Da path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs) ) def _df_getter() -> pd.DataFrame: - ontology = get_ontology(prefix, **kwargs) - df = ontology.get_properties_df() - df.dropna(inplace=True) - return df + return get_ontology(prefix, **kwargs).get_properties_df( + use_tqdm=check_should_use_tqdm(kwargs) + ) return _df_getter() @@ -64,33 +133,8 @@ def get_filtered_properties_mapping( :param prop: the property to extract :returns: A mapping from identifier to property value """ - prop = _ensure_ref(prop, ontology_prefix=prefix) - prop_curie = prop.curie - version = get_version_from_kwargs(prefix, kwargs) - all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version) - if all_properties_path.is_file(): - logger.info("[%s] loading pre-cached properties", prefix) - df = pd.read_csv(all_properties_path, sep="\t") - logger.info("[%s] filtering pre-cached properties", prefix) - df = df.loc[df["property"] == prop_curie, [f"{prefix}_id", "value"]] - return dict(df.values) - - path = prefix_cache_join(prefix, "properties", name=f"{prop_curie}.tsv", version=version) - - @cached_mapping( - path=path, - header=[f"{prefix}_id", 
prop_curie], - force=check_should_force(kwargs), - cache=check_should_cache(kwargs), - ) - def _mapping_getter() -> Mapping[str, str]: - logger.info("[%s] no cached properties found. getting from OBO loader", prefix) - ontology = get_ontology(prefix, **kwargs) - return ontology.get_filtered_properties_mapping( - prop, use_tqdm=check_should_use_tqdm(kwargs) - ) - - return _mapping_getter() + df = get_filtered_properties_df(prefix, prop, **kwargs) + return dict(df.values) @wrap_norm_prefix @@ -103,34 +147,8 @@ def get_filtered_properties_multimapping( :param prop: the property to extract :returns: A mapping from identifier to property values """ - prop = _ensure_ref(prop, ontology_prefix=prefix) - prop_curie = prop.curie - version = get_version_from_kwargs(prefix, kwargs) - all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version) - - if all_properties_path.is_file(): - logger.info("[%s] loading pre-cached properties", prefix) - df = pd.read_csv(all_properties_path, sep="\t") - logger.info("[%s] filtering pre-cached properties", prefix) - df = df.loc[df["property"] == prop_curie, [f"{prefix}_id", "value"]] - return multidict(df.values) - - path = prefix_cache_join(prefix, "properties", name=f"{prop_curie}.tsv", version=version) - - @cached_multidict( - path=path, - header=[f"{prefix}_id", prop_curie], - force=check_should_force(kwargs), - cache=check_should_cache(kwargs), - ) - def _mapping_getter() -> Mapping[str, list[str]]: - logger.info("[%s] no cached properties found. 
getting from OBO loader", prefix) - ontology = get_ontology(prefix, **kwargs) - return ontology.get_filtered_properties_multimapping( - prop, use_tqdm=check_should_use_tqdm(kwargs) - ) - - return _mapping_getter() + df = get_filtered_properties_df(prefix, prop, **kwargs) + return multidict(df.values) def get_property( @@ -154,7 +172,10 @@ def get_property( def get_properties( - prefix: str, identifier: str, prop: str, **kwargs: Unpack[GetOntologyKwargs] + prefix: str, + identifier: str, + prop: ReferenceHint, + **kwargs: Unpack[GetOntologyKwargs], ) -> list[str] | None: """Extract a set of properties for the given entity. @@ -171,7 +192,7 @@ def get_properties( @wrap_norm_prefix def get_filtered_properties_df( - prefix: str, prop: str, **kwargs: Unpack[GetOntologyKwargs] + prefix: str, prop: ReferenceHint, **kwargs: Unpack[GetOntologyKwargs] ) -> pd.DataFrame: """Extract a single property for each term. @@ -179,21 +200,7 @@ def get_filtered_properties_df( :param prop: the property to extract :returns: A dataframe from identifier to property value. Columns are [_id, value]. 
""" - version = get_version_from_kwargs(prefix, kwargs) - all_properties_path = prefix_cache_join(prefix, name="properties.tsv", version=version) - if all_properties_path.is_file(): - logger.info("[%s] loading pre-cached properties", prefix) - df = pd.read_csv(all_properties_path, sep="\t") - logger.info("[%s] filtering pre-cached properties", prefix) - return df.loc[df["property"] == prop, [f"{prefix}_id", "value"]] - - path = prefix_cache_join(prefix, "properties", name=f"{prop}.tsv", version=version) - - @cached_df( - path=path, dtype=str, force=check_should_force(kwargs), cache=check_should_cache(kwargs) - ) - def _df_getter() -> pd.DataFrame: - ontology = get_ontology(prefix, **kwargs) - return ontology.get_filtered_properties_df(prop, use_tqdm=check_should_use_tqdm(kwargs)) - - return _df_getter() + prop = _ensure_ref(prop, ontology_prefix=prefix) + df = get_properties_df(prefix, **kwargs) + df = df.loc[df["property"] == prop.curie, [f"{prefix}_id", "value"]] + return df diff --git a/src/pyobo/api/relations.py b/src/pyobo/api/relations.py index 5072e393..d19c2210 100644 --- a/src/pyobo/api/relations.py +++ b/src/pyobo/api/relations.py @@ -4,7 +4,6 @@ from collections.abc import Mapping from functools import lru_cache -import networkx as nx import pandas as pd from typing_extensions import Unpack @@ -31,18 +30,32 @@ __all__ = [ "get_filtered_relations_df", - "get_graph", "get_id_multirelations_mapping", "get_relation", "get_relation_mapping", + "get_relations", "get_relations_df", ] -# TODO get_relation, get_relations - logger = logging.getLogger(__name__) +@wrap_norm_prefix +def get_relations( + prefix: str, **kwargs: Unpack[GetOntologyKwargs] +) -> list[tuple[Reference, Reference, Reference]]: + """Get relations.""" + df = get_relations_df(prefix, wide=False, **kwargs) + return [ + ( + Reference(prefix=prefix, identifier=source_id), + Reference(prefix=relation_prefix, identifier=relation_id), + Reference(prefix=target_prefix, identifier=target_id), + ) + for 
source_id, relation_prefix, relation_id, target_prefix, target_id in df.values + ] + + @wrap_norm_prefix def get_relations_df( prefix: str, *, wide: bool = False, **kwargs: Unpack[GetOntologyKwargs] @@ -171,18 +184,3 @@ def get_relation( **kwargs, ) return relation_mapping.get(source_identifier) - - -def get_graph( - prefix: str, *, wide: bool = False, **kwargs: Unpack[GetOntologyKwargs] -) -> nx.DiGraph: - """Get the relation graph.""" - rv = nx.MultiDiGraph() - df = get_relations_df(prefix=prefix, wide=wide, **kwargs) - for source_id, relation_prefix, relation_id, target_ns, target_id in df.values: - rv.add_edge( - f"{prefix}:{source_id}", - f"{target_ns}:{target_id}", - key=f"{relation_prefix}:{relation_id}", - ) - return rv diff --git a/src/pyobo/api/utils.py b/src/pyobo/api/utils.py index c5ce3b73..e3a6df80 100644 --- a/src/pyobo/api/utils.py +++ b/src/pyobo/api/utils.py @@ -3,10 +3,13 @@ import json import logging import os +import warnings from functools import lru_cache from typing import Literal, overload import bioversions +import curies +from curies import ReferenceTuple from ..constants import GetOntologyKwargs from ..utils.path import prefix_directory_join @@ -129,3 +132,26 @@ def get_version_pins() -> dict[str, str]: f"name." ) return version_pins + + +def _get_pi( + prefix: str | curies.Reference | ReferenceTuple, identifier: str | None = None, / +) -> ReferenceTuple: + if isinstance(prefix, ReferenceTuple): + if identifier is not None: + raise ValueError("unexpected non-none value passed as second positional argument") + return prefix + if isinstance(prefix, curies.Reference): + if identifier is not None: + raise ValueError("unexpected non-none value passed as second positional argument") + return prefix.pair + if identifier is None: + raise ValueError( + "prefix was given as a string, so an identifier was expected to be passed as a string as well" + ) + warnings.warn( + "Passing a prefix and identifier as separate arguments is deprecated. 
Please pass a curies.Reference or curies.ReferenceTuple in the first positional-only argument instead.", + DeprecationWarning, + stacklevel=4, # this is 4 since this is (always?) called from inside a decorator + ) + return ReferenceTuple(prefix, identifier) diff --git a/src/pyobo/struct/reference.py b/src/pyobo/struct/reference.py index b1d30753..f66592a6 100644 --- a/src/pyobo/struct/reference.py +++ b/src/pyobo/struct/reference.py @@ -9,8 +9,8 @@ import bioontologies.upgrade import bioregistry import curies -from curies import ReferenceTuple -from curies.api import ExpansionError +from curies import Converter, ReferenceTuple +from curies.api import ExpansionError, _split from pydantic import Field, field_validator, model_validator from .utils import obo_escape @@ -79,6 +79,24 @@ def bioregistry_link(self) -> str: """Get the bioregistry link.""" return f"https://bioregistry.io/{self.curie}" + # override from_curie to get typing right + @classmethod + def from_curie( + cls, curie: str, *, sep: str = ":", converter: Converter | None = None + ) -> Reference: + """Parse a CURIE string and populate a reference. 
+ + :param curie: A string representation of a compact URI (CURIE) + :param sep: The separator + :param converter: The converter to use as context when parsing + :return: A reference object + + >>> Reference.from_curie("chebi:1234") + Reference(prefix='CHEBI', identifier='1234') + """ + prefix, identifier = _split(curie, sep=sep) + return cls.model_validate({"prefix": prefix, "identifier": identifier}, context=converter) + @classmethod def from_curie_or_uri( cls, diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py index c26014b7..52502a99 100644 --- a/src/pyobo/struct/struct.py +++ b/src/pyobo/struct/struct.py @@ -1005,6 +1005,14 @@ def _relations_path(self) -> Path: def _properties_path(self) -> Path: return self._cache(name="properties.tsv") + @property + def _literal_properties_path(self) -> Path: + return self._cache(name="literal_properties.tsv") + + @property + def _object_properties_path(self) -> Path: + return self._cache(name="object_properties.tsv") + @property def _root_metadata_path(self) -> Path: return prefix_directory_join(self.ontology, name="metadata.json") @@ -1074,7 +1082,19 @@ def _get_cache_config(self) -> list[tuple[str, Path, Sequence[str], Callable]]: "properties", self._properties_path, self.properties_header, - self.iter_property_rows, + self._iter_property_rows, + ), + ( + "object_properties", + self._object_properties_path, + self.object_properties_header, + self.iter_object_properties, + ), + ( + "literal_properties", + self._literal_properties_path, + self.literal_properties_header, + self.iter_literal_properties, ), ] @@ -1415,9 +1435,18 @@ def properties_header(self): """Property dataframe header.""" return [f"{self.ontology}_id", "property", "value", "datatype"] - def iter_property_rows(self, *, use_tqdm: bool = False) -> Iterable[tuple[str, str, str, str]]: + @property + def object_properties_header(self): + """Property dataframe header.""" + return ["source", "predicate", "target"] + + @property + def 
literal_properties_header(self): + """Property dataframe header.""" + return ["source", "predicate", "target", "datatype"] + + def _iter_property_rows(self, *, use_tqdm: bool = False) -> Iterable[tuple[str, str, str, str]]: """Iterate property rows.""" - tuple[Term, Reference, Reference, None] | tuple[Term, Reference, str, Reference] for term, t in self.iterate_properties(use_tqdm=use_tqdm): pred = term._reference(t.predicate, ontology_prefix=self.ontology) match t.value: @@ -1428,12 +1457,39 @@ def iter_property_rows(self, *, use_tqdm: bool = False) -> Iterable[tuple[str, s case _: raise TypeError(f"got: {type(t)} - {t}") - def get_properties_df(self, *, use_tqdm: bool = False) -> pd.DataFrame: + def get_properties_df(self, *, use_tqdm: bool = False, drop_na: bool = True) -> pd.DataFrame: """Get all properties as a dataframe.""" - return pd.DataFrame( - list(self.iter_property_rows(use_tqdm=use_tqdm)), + df = pd.DataFrame( + self._iter_property_rows(use_tqdm=use_tqdm), columns=self.properties_header, ) + if drop_na: + df.dropna(inplace=True) + return df + + def iter_object_properties(self, *, use_tqdm: bool = False) -> Iterable[tuple[str, str, str]]: + """Iterate over object property triples.""" + for term in self._iter_terms(use_tqdm=use_tqdm): + for predicate, target in term.iterate_object_properties(): + yield term.curie, predicate.curie, target.curie + + def get_object_properties_df(self, *, use_tqdm: bool = False) -> pd.DataFrame: + """Get all properties as a dataframe.""" + return pd.DataFrame( + self.iter_object_properties(use_tqdm=use_tqdm), columns=self.object_properties_header + ) + + def iter_literal_properties( + self, *, use_tqdm: bool = False + ) -> Iterable[tuple[str, str, str, str]]: + """Iterate over literal properties quads.""" + for term in self._iter_terms(use_tqdm=use_tqdm): + for predicate, target in term.iterate_literal_properties(): + yield term.curie, predicate.curie, target.value, target.datatype.curie + + def 
get_literal_properties_df(self, *, use_tqdm: bool = False) -> pd.DataFrame: + """Get all properties as a dataframe.""" + return pd.DataFrame(self.iter_literal_properties(), columns=self.literal_properties_header) def iterate_filtered_properties( self, prop: ReferenceHint, *, use_tqdm: bool = False @@ -1513,10 +1569,14 @@ def iterate_relations( if td := self._get_typedef(term, predicate, _warned, typedefs): yield term, td, reference + def get_edges_df(self, *, use_tqdm: bool = False) -> pd.DataFrame: + """Get an edges dataframe.""" + return pd.DataFrame(self.iterate_edge_rows(use_tqdm=use_tqdm), columns=self.edges_header) + def iterate_edge_rows(self, use_tqdm: bool = False) -> Iterable[tuple[str, str, str]]: """Iterate the edge rows.""" for term, typedef, reference in self.iterate_edges(use_tqdm=use_tqdm): - yield (term.curie, typedef.curie, reference.curie) + yield term.curie, typedef.curie, reference.curie def _get_typedef( self, diff --git a/src/pyobo/struct/struct_utils.py b/src/pyobo/struct/struct_utils.py index bd684cab..163b7885 100644 --- a/src/pyobo/struct/struct_utils.py +++ b/src/pyobo/struct/struct_utils.py @@ -421,6 +421,20 @@ def iterate_relations(self) -> Iterable[tuple[Reference, Reference]]: for target in sorted(targets): yield typedef, target + def iterate_object_properties(self) -> Iterable[tuple[Reference, Reference]]: + """Iterate over properties with references as their targets.""" + for predicate, values in self.properties.items(): + for value in values: + if isinstance(value, Reference): + yield predicate, value + + def iterate_literal_properties(self) -> Iterable[tuple[Reference, OBOLiteral]]: + """Iterate over properties with literals as their targets.""" + for predicate, values in self.properties.items(): + for value in values: + if isinstance(value, OBOLiteral): + yield predicate, value + def get_relationships(self, typedef: ReferenceHint) -> list[Reference]: """Get relationships from the given type.""" return 
self.relationships.get(_ensure_ref(typedef), []) @@ -511,25 +525,37 @@ def get_edges(self) -> list[tuple[Reference, Reference]]: """Get edges.""" return list(self._iter_edges()) - def _iter_edges(self) -> Iterable[tuple[Reference, Reference]]: - yield from self.iterate_relations() + def _iter_parents(self) -> Iterable[tuple[Reference, Reference]]: parent_prop = stanza_type_to_prop[self.type] for parent in itt.chain(self.parents, self.union_of): yield parent_prop, parent - for subset in self.subsets: - yield v.in_subset, subset - for k, values in self.properties.items(): - for value in values: - if isinstance(value, Reference): - yield k, value + + def _iter_intersections(self) -> Iterable[tuple[Reference, Reference]]: + parent_prop = stanza_type_to_prop[self.type] for intersection_of in self.intersection_of: match intersection_of: case Reference(): yield parent_prop, intersection_of case (predicate, target): yield predicate, target + + def _iter_edges(self) -> Iterable[tuple[Reference, Reference]]: + # The following are "object" properties, meaning + # they're part of the definition of the object + yield from self.iterate_relations() + yield from self._iter_parents() + yield from self._iter_intersections() + for equivalent_to in self.equivalent_to: + yield v.equivalent_class, equivalent_to + + # The following are "annotation" properties + for subset in self.subsets: + yield v.in_subset, subset + yield from self.iterate_object_properties() + for xref_reference in self.xrefs: + yield v.has_dbxref, xref_reference + # TODO disjoint_from - yield from self.get_mappings(include_xrefs=True, add_context=False) # docstr-coverage:excused `overload` @overload diff --git a/tests/test_alt_ids.py b/tests/test_api.py similarity index 77% rename from tests/test_alt_ids.py rename to tests/test_api.py index 840a64ed..2d41870f 100644 --- a/tests/test_alt_ids.py +++ b/tests/test_api.py @@ -9,9 +9,16 @@ import pyobo from pyobo import Reference as PyOBOReference -from pyobo import 
get_name, get_name_by_curie, get_primary_curie, get_primary_identifier +from pyobo import ( + default_reference, + get_name, + get_name_by_curie, + get_primary_curie, + get_primary_identifier, +) from pyobo.mocks import get_mock_id_alts_mapping, get_mock_id_name_mapping -from pyobo.struct.struct import Obo, Term, make_ad_hoc_ontology +from pyobo.struct import vocabulary as v +from pyobo.struct.struct import Obo, Term, TypeDef, make_ad_hoc_ontology mock_id_alts_mapping = get_mock_id_alts_mapping( { @@ -133,15 +140,28 @@ def test_no_alts(self, _, __): def test_api(self) -> None: """Test getting the hierarchy.""" + tr1 = default_reference(TEST_P1, "r1") + td1 = TypeDef(reference=tr1) r1 = PyOBOReference(prefix=TEST_P1, identifier="1", name="test name") r2 = PyOBOReference(prefix=TEST_P1, identifier="2") + r3 = PyOBOReference(prefix=TEST_P1, identifier="3") t1 = Term(reference=r1).append_alt(r2) + t1.append_comment("test comment") t2 = Term(reference=r2) - ontology = make_ad_hoc_ontology(TEST_P1, terms=[t1, t2]) + t3 = Term(reference=r3).append_parent(r1) + terms = [t1, t2, t3] + ontology = make_ad_hoc_ontology(TEST_P1, terms=terms, _typedefs=[td1]) + + targets = [ + "pyobo.api.names.get_ontology", + "pyobo.api.alts.get_ontology", + "pyobo.api.properties.get_ontology", + "pyobo.api.relations.get_ontology", + "pyobo.api.edges.get_ontology", + ] + with patch_ontologies(ontology, targets): + # Alts - with patch_ontologies( - ontology, ["pyobo.api.names.get_ontology", "pyobo.api.alts.get_ontology"] - ): ids_alts = pyobo.get_id_to_alts(TEST_P1, cache=False) self.assertEqual({"1": ["2"]}, ids_alts) @@ -154,8 +174,10 @@ def test_api(self) -> None: self.assertEqual("test:1", pyobo.get_primary_curie(r1.curie, cache=False)) self.assertEqual("test:1", pyobo.get_primary_curie(r2.curie, cache=False)) + # Names + ids = pyobo.get_ids(TEST_P1, cache=False) - self.assertEqual({"1", "2"}, ids) + self.assertEqual({t.identifier for t in terms}, ids) id_name = 
pyobo.get_id_name_mapping(TEST_P1, cache=False) self.assertEqual({t1.identifier: t1.name}, id_name) @@ -165,3 +187,20 @@ def test_api(self) -> None: self.assertEqual(t1.name, pyobo.get_name(r1, cache=False)) self.assertEqual(t1.name, pyobo.get_name(r2, cache=False)) + + # Properties + + value = pyobo.get_property( + r1.prefix, r1.identifier, prop=v.comment, cache=False, use_tqdm=False + ) + self.assertEqual("test comment", value) + + edges = pyobo.get_edges(TEST_P1, cache=False, use_tqdm=False) + self.assertEqual({(r3, v.is_a, r1), (r1, v.alternative_term, r2)}, set(edges)) + + graph = pyobo.get_hierarchy(TEST_P1, cache=False, use_tqdm=False) + self.assertEqual(3, graph.number_of_nodes()) + self.assertIn(r1.curie, graph) + self.assertIn(r2.curie, graph) + self.assertIn(r3.curie, graph) + self.assertEqual(1, graph.number_of_edges())