Additional improvements for OFN output (#301)

1. It seems like the debio issue is getting closer to being solved, but somehow the chemrof predicates work for InChI and SMILES where the debio ones don't. The debio predicates do seem to work in the miRBase exporter when used for object properties.
2. Skip more bad IDs from BiGG.
3. Output the oboInOwl prefix to OFN if auto-generated-by is used, but not to OBO, since it's built in there.
biopragmatics · Jan 13, 2025 · 8cc1c97 · 8cc1c97
1 parent 27e7708
commit 8cc1c97
Show file tree

Hide file tree

Showing 8 changed files with 60 additions and 40 deletions.
diff --git a/src/pyobo/sources/bigg/bigg_metabolite.py b/src/pyobo/sources/bigg/bigg_metabolite.py
@@ -1,5 +1,6 @@
 """Converter for metabolites in BiGG."""
 
+import logging
 import re
 from collections.abc import Iterable
 
@@ -15,6 +16,8 @@
     "BiGGMetaboliteGetter",
 ]
 
+logger = logging.getLogger(__name__)
+
 PREFIX = "bigg.metabolite"
 URL = "http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt"
 PATTERN = re.compile("^[a-z_A-Z0-9]+$")
@@ -105,7 +108,7 @@ def iterate_terms(force: bool = False, version: str | None = None) -> Iterable[T
         )
         if pd.notna(bigg_compartmental_id):
             if not PATTERN.match(bigg_compartmental_id):
-                tqdm.write(
+                logger.debug(
                     f"[{PREFIX}:{universal_bigg_id}] invalid compartment ID: {bigg_compartmental_id}"
                 )
             else:
@@ -116,7 +119,7 @@ def iterate_terms(force: bool = False, version: str | None = None) -> Iterable[T
             if not PATTERN.match(old_bigg_id):
                 if not old_bigg_id.endswith("]"):
                     # if it ends with ']' then it's a compartment identifier
-                    tqdm.write(f"[{PREFIX}:{universal_bigg_id}] invalid alt ID: {old_bigg_id}")
+                    logger.debug(f"[{PREFIX}:{universal_bigg_id}] invalid alt ID: {old_bigg_id}")
                 continue
             term.append_alt(Reference(prefix=PREFIX, identifier=old_bigg_id))
         _parse_model_links(term, model_list)

diff --git a/src/pyobo/sources/bigg/bigg_reaction.py b/src/pyobo/sources/bigg/bigg_reaction.py
@@ -47,6 +47,10 @@ def iterate_terms(force: bool = False, version: str | None = None) -> Iterable[T
     for bigg_id, name, reaction_string, model_list, database_links, old_bigg_ids in tqdm(
         bigg_reaction_df.values, unit_scale=True, unit="reaction", desc=f"[{PREFIX}] processing"
     ):
+        if "(" in bigg_id:
+            tqdm.write(f"[{PREFIX}] identifier has open paren. can't encode in OWL: {bigg_id}")
+            continue
+
         term = Term(
             reference=Reference(
                 prefix=PREFIX, identifier=bigg_id, name=name if pd.notna(name) else None
@@ -56,6 +60,8 @@ def iterate_terms(force: bool = False, version: str | None = None) -> Iterable[T
         for old_bigg_id in _split(old_bigg_ids):
             if old_bigg_id == bigg_id:
                 continue
+            if "(" in old_bigg_id:
+                continue
             term.append_alt(Reference(prefix=PREFIX, identifier=old_bigg_id))
         _parse_model_links(term, model_list)
 

diff --git a/src/pyobo/sources/rhea.py b/src/pyobo/sources/rhea.py
@@ -7,17 +7,8 @@
 import pystow
 
 from pyobo.api.utils import get_version
-from pyobo.struct import Obo, Reference, Term
-from pyobo.struct.typedef import (
-    enabled_by,
-    has_bidirectional_reaction,
-    has_input,
-    has_left_to_right_reaction,
-    has_output,
-    has_participant,
-    has_right_to_left_reaction,
-    reaction_enabled_by_molecular_function,
-)
+from pyobo.struct import Obo, Reference, Term, TypeDef
+from pyobo.struct import typedef as v
 from pyobo.utils.path import ensure_df
 
 if TYPE_CHECKING:
@@ -31,6 +22,16 @@
 PREFIX = "rhea"
 RHEA_RDF_GZ_URL = "ftp://ftp.expasy.org/databases/rhea/rdf/rhea.rdf.gz"
 
+has_left_to_right_reaction = TypeDef.default(
+    PREFIX, "hasLeftToRightReaction", name="has left to right reaction", is_metadata_tag=True
+).append_xref(v.has_left_to_right_reaction)
+has_right_to_left_reaction = TypeDef.default(
+    PREFIX, "hasRightToLeftReaction", name="has right to left reaction", is_metadata_tag=True
+).append_xref(v.has_right_to_left_reaction)
+has_bidirectional_reaction = TypeDef.default(
+    PREFIX, "hasBidirectionalReaction", name="has bidirectional reaction", is_metadata_tag=True
+).append_xref(v.has_bidirectional_reaction)
+
 
 class RheaGetter(Obo):
     """An ontology representation of Rhea's chemical reaction database."""
@@ -40,11 +41,11 @@ class RheaGetter(Obo):
         has_left_to_right_reaction,
         has_bidirectional_reaction,
         has_right_to_left_reaction,
-        enabled_by,
-        has_input,
-        has_output,
-        has_participant,
-        reaction_enabled_by_molecular_function,
+        v.enabled_by,
+        v.has_input,
+        v.has_output,
+        v.has_participant,
+        v.reaction_enabled_by_molecular_function,
     ]
 
     def iter_terms(self, force: bool = False) -> Iterable[Term]:
@@ -159,10 +160,10 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
             right_rhea_id = master_to_left[master_rhea_id]
         else:
             raise ValueError(f"Invalid side: {side_uri}")
-        terms[master_rhea_id].annotate_object(has_participant, chebi_reference)
-        terms[master_to_bi[master_rhea_id]].annotate_object(has_participant, chebi_reference)
-        terms[left_rhea_id].append_relationship(has_input, chebi_reference)
-        terms[right_rhea_id].append_relationship(has_output, chebi_reference)
+        terms[master_rhea_id].annotate_object(v.has_participant, chebi_reference)
+        terms[master_to_bi[master_rhea_id]].annotate_object(v.has_participant, chebi_reference)
+        terms[left_rhea_id].append_relationship(v.has_input, chebi_reference)
+        terms[right_rhea_id].append_relationship(v.has_output, chebi_reference)
 
     hierarchy = ensure_df(
         PREFIX,
@@ -181,8 +182,8 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         ("reactome", "rhea2reactome", None),
         ("macie", "rhea2macie", None),
         ("metacyc", "rhea2metacyc", None),
-        ("go", "rhea2go", reaction_enabled_by_molecular_function),
-        ("uniprot", "rhea2uniprot", enabled_by),
+        ("go", "rhea2go", v.reaction_enabled_by_molecular_function),
+        ("uniprot", "rhea2uniprot", v.enabled_by),
     ]:
         xref_df = ensure_df(
             PREFIX,
@@ -223,11 +224,11 @@ def iter_terms(version: str, force: bool = False) -> Iterable[Term]:
         _iubmb,
     ) in ec_df.values:
         terms[directional_rhea_id].append_relationship(
-            enabled_by, Reference(prefix="eccode", identifier=ec)
+            v.enabled_by, Reference(prefix="eccode", identifier=ec)
         )
 
     yield from terms.values()
 
 
 if __name__ == "__main__":
-    RheaGetter.cli()
+    RheaGetter.cli(["--owl"])
diff --git a/src/pyobo/struct/functional/dsl.py b/src/pyobo/struct/functional/dsl.py
@@ -164,6 +164,8 @@ def to_funowl(self) -> str:
         """Represent this identifier for functional OWL."""
         if isinstance(self.identifier, term.URIRef):
             return f"<{self.identifier}>"
+        if any(c in self.identifier.identifier for c in "()"):
+            raise ValueError(f"Can't encode CURIE with parentheses to OFN: {self.identifier}")
         return self.identifier.curie
 
     def to_funowl_args(self) -> str:  # pragma: no cover

diff --git a/src/pyobo/struct/struct.py b/src/pyobo/struct/struct.py
@@ -658,6 +658,8 @@ def _get_prefixes(self) -> set[str]:
         # _iterate_property_pairs covers metadata, root terms,
         # and properties in self.property_values
         prefixes.update(_get_prefixes_from_annotations(self._iterate_property_pairs() or []))
+        if self.auto_generated_by:
+            prefixes.add("oboInOwl")
         return prefixes
 
     def _get_version(self) -> str | None:
@@ -808,12 +810,16 @@ def iterate_obo_lines(
         # 10 TODO namespace-id-rule
         # 11
         for prefix, url in sorted(self._get_clean_idspaces().items()):
-            if prefix in DEFAULT_PREFIX_MAP or prefix == "obo":
+            if prefix in DEFAULT_PREFIX_MAP:
                 # we don't need to write out the 4 default prefixes from
                 # table 2 in https://www.w3.org/TR/owl2-syntax/#IRIs since
                 # they're considered to always be builtin
                 continue
 
+            # additional assumptions about built in
+            if prefix in {"obo", "oboInOwl"}:
+                continue
+
             # ROBOT assumes that all OBO foundry prefixes are builtin,
             # so don't re-declare them
             if bioregistry.is_obo_foundry(prefix):

diff --git a/src/pyobo/struct/typedef.py b/src/pyobo/struct/typedef.py
@@ -7,7 +7,7 @@
 from curies import ReferenceTuple
 
 from . import vocabulary as v
-from .reference import Reference
+from .reference import Reference, default_reference
 from .struct import TypeDef
 from ..resources.ro import load_ro
 
@@ -73,10 +73,12 @@
 has_left_to_right_reaction = TypeDef(v.has_left_to_right_reaction, is_metadata_tag=True)
 has_right_to_left_reaction = TypeDef(v.has_right_to_left_reaction, is_metadata_tag=True)
 has_bidirectional_reaction = TypeDef(
-    Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"),
+    reference=default_reference("RO", "hasBiDirectionalReaction"),
     is_metadata_tag=True,
-)
+).append_xref(Reference(prefix="debio", identifier="0000009", name="has bi-directional reaction"))
 reaction_enabled_by_molecular_function = TypeDef(
+    reference=default_reference("RO", "reactionEnabledByMolecularFunction")
+).append_xref(
     Reference(prefix="debio", identifier="0000047", name="reaction enabled by molecular function")
 )
 
@@ -249,15 +251,9 @@
     range=Reference(prefix="IAO", identifier="0000013", name="journal article"),
 )
 
-has_smiles = TypeDef(
-    reference=Reference(prefix="debio", identifier="0000022", name="has SMILES"),
-    is_metadata_tag=True,
-)
+has_smiles = TypeDef(reference=v.has_smiles, is_metadata_tag=True).append_xref(v.debio_has_smiles)
 
-has_inchi = TypeDef(
-    reference=Reference(prefix="debio", identifier="0000020", name="has InChI"),
-    is_metadata_tag=True,
-)
+has_inchi = TypeDef(reference=v.has_inchi, is_metadata_tag=True).append_xref(v.debio_has_inchi)
 
 has_homepage = TypeDef(
     reference=Reference(prefix="foaf", identifier="homepage", name="homepage"), is_metadata_tag=True

diff --git a/src/pyobo/struct/vocabulary.py b/src/pyobo/struct/vocabulary.py
@@ -69,6 +69,12 @@ def _c(c: curies.NamedReference) -> Reference:
 has_right_to_left_reaction = Reference(
     prefix="debio", identifier="0000008", name="has right-to-left reaction"
 )
+debio_has_inchi = Reference(prefix="debio", identifier="0000020", name="has InChI")
+has_inchi = Reference(prefix="chemrof", identifier="inchi_string")
+
+debio_has_smiles = Reference(prefix="debio", identifier="0000022", name="has SMILES")
+has_smiles = Reference(prefix="chemrof", identifier="smiles_string")
+
 # TODO update to use debio, or put in RO
 has_citation = default_reference(prefix="RO", identifier="hasCitation", name="has citation")
 has_description = Reference(prefix="dcterms", identifier="description", name="description")

diff --git a/tests/test_struct/test_obo/test_typedef.py b/tests/test_struct/test_obo/test_typedef.py
@@ -373,15 +373,15 @@ def test_11_property_value(self) -> None:
             [Typedef]
             id: RO:0000087
             property_value: dcterms:contributor orcid:0000-0003-4423-4370 ! contributor Charles Tapley Hoyt
-            property_value: debio:0000020 "abc" xsd:string
+            property_value: ChEMROF:inchi_string "abc" xsd:string
             """,
             typedef,
         )
         self.assert_funowl_lines(
             """\
             Declaration(ObjectProperty(RO:0000087))
             AnnotationAssertion(dcterms:contributor RO:0000087 orcid:0000-0003-4423-4370)
-            AnnotationAssertion(debio:0000020 RO:0000087 "abc"^^xsd:string)
+            AnnotationAssertion(ChEMROF:inchi_string RO:0000087 "abc"^^xsd:string)
             """,
             typedef,
         )