Update parsing and standardization

biopragmatics · May 28, 2022 · 854d009 · 854d009
1 parent 8ad40c5
commit 854d009
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 15 deletions.
diff --git a/src/bioontologies/obograph.py b/src/bioontologies/obograph.py
@@ -5,7 +5,7 @@
 
 from collections import defaultdict
 from operator import attrgetter
-from typing import Any, Iterable, List, Mapping, Optional, Set, Union
+from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union
 
 from bioregistry import normalize_curie
 from pydantic import BaseModel
@@ -81,6 +81,10 @@ class Edge(BaseModel):
     obj: str
     meta: Optional[Meta]
 
+    def as_tuple(self) -> Tuple[str, str, str]:
+        """Get the edge as a tuple."""
+        return self.sub, self.pred, self.obj
+
 
 class Node(BaseModel):
     """Represents a node in an OBO Graph."""
@@ -189,18 +193,22 @@ def _get_properties(self, pred: Union[str, List[str]]) -> List[str]:
             if any(prop.pred == p for p in pred)
         ]
 
-    def standardize(self, keep_invalid: bool = False) -> "Graph":
+    def standardize(self, keep_invalid: bool = False, use_tqdm: bool = True) -> "Graph":
         """Standardize the OBO graph.
 
         :param keep_invalid: Should CURIEs/IRIs that aren't handled
             by the Bioregistry be kept? Defaults to false.
+        :param use_tqdm:
+            Should a progress bar be used?
         :returns: This OBO graph, modified in place as follows:
 
             1. Convert IRIs to CURIEs (in many places) using :mod:`bioregistry`
             2. Add alternative identifiers to :class:`Node` objects
         """
         # Convert URIs to CURIEs
-        for node in tqdm(self.nodes, desc="standardizing nodes", unit_scale=True):
+        for node in tqdm(
+            self.nodes, desc="standardizing nodes", unit_scale=True, disable=not use_tqdm
+        ):
             if node.id.startswith(OBO_URI_PREFIX):
                 node.id = _clean_uri(node.id, keep_invalid=True)  # type:ignore
             if node.meta:
@@ -231,7 +239,9 @@ def standardize(self, keep_invalid: bool = False) -> "Graph":
                     xrefs.append(xref)
                 node.meta.xrefs = sorted(xrefs, key=attrgetter("val"))
 
-        for edge in tqdm(self.edges, desc="standardizing edges", unit_scale=True):
+        for edge in tqdm(
+            self.edges, desc="standardizing edges", unit_scale=True, disable=not use_tqdm
+        ):
             edge.sub = _clean_uri(edge.sub, keep_invalid=True)
             edge.pred = _clean_uri(edge.pred, keep_invalid=True)
             edge.obj = _clean_uri(edge.obj, keep_invalid=True)
@@ -274,12 +284,18 @@ def _clean_uri(s: str, *, keep_invalid: bool) -> Optional[str]:
         return None
 
 
-IS_A_STRINGS = {"is_a", "isa"}
+IS_A_STRINGS = {
+    "is_a",
+    "isa",
+    "type",  # used for instance to class
+}
 
 
 def _compress_uri(s: str) -> str:
     if s in IS_A_STRINGS:
         return "rdfs:subClassOf"
+    if s == "subPropertyOf":
+        return "rdfs:subPropertyOf"
     if s.startswith(OBO_URI_PREFIX):
         s = s[len(OBO_URI_PREFIX) :]
         if "_" in s and s.split("_")[1].isnumeric():  # best guess that it's an identifier
@@ -291,9 +307,13 @@ def _compress_uri(s: str) -> str:
                 return s
             else:
                 return s.replace("/", ":", 1)
-    if s.startswith("http://www.geneontology.org/formats/oboInOwl#"):
-        s = s[len("http://www.geneontology.org/formats/oboInOwl#") :]
-        s = "oboinowl:" + s
+    for uri_prefix, prefix in [
+        ("http://www.geneontology.org/formats/oboInOwl#", "oboinowl"),
+        ("http://www.w3.org/2002/07/owl#", "owl"),
+    ]:
+        if s.startswith(uri_prefix):
+            s = s[len(uri_prefix) :]
+            s = f"{prefix}:{s}"
     return s
 
 

diff --git a/src/bioontologies/robot.py b/src/bioontologies/robot.py
@@ -39,11 +39,13 @@
 class ParseResults:
     """A dataclass containing an OBO Graph JSON and text output from ROBOT."""
 
-    graph_document: GraphDocument
+    graph_document: Optional[GraphDocument]
     messages: List[str] = dataclasses.field(default_factory=list)
 
     def squeeze(self) -> Graph:
         """Get the first graph."""
+        if self.graph_document is None:
+            raise ValueError(f"graph document was not successfully parsed: {self.messages}")
         return self.graph_document.graphs[0]
 
 
@@ -63,29 +65,37 @@ def get_obograph_by_prefix(
     if prefix != bioregistry.normalize_prefix(prefix):
         raise ValueError("this function requires bioregistry canonical prefixes")
 
+    messages = []
     json_iri = bioregistry.get_json_download(prefix)
 
     if json_iri is not None:
-        res_json = requests.get(json_iri).json()
-        graph_document = GraphDocument(**res_json)
-        return ParseResults(graph_document=graph_document)
+        try:
+            return get_obograph_by_iri(json_iri)
+        except (IOError, ValueError):
+            msg = f"could not parse JSON for {prefix} from {json_iri}"
+            messages.append(msg)
+            logger.warning(msg)
 
     owl_iri = bioregistry.get_owl_download(prefix)
     obo_iri = bioregistry.get_obo_download(prefix)
 
-    for iri in [owl_iri, obo_iri]:
+    for label, iri in [("OWL", owl_iri), ("OBO", obo_iri)]:
         if iri is None:
             continue
 
         try:
             parse_results = convert_to_obograph_remote(iri, json_path=json_path)
         except subprocess.CalledProcessError:
-            logger.warning("could not parse OBO for %s from %s", prefix, iri)
+            msg = f"could not parse {label} for {prefix} from {iri}"
+            messages.append(msg)
+            logger.warning(msg)
             continue
         else:
+            # prepend messages from earlier failed attempts to this result's messages
+            parse_results.messages = [*messages, *parse_results.messages]
             return parse_results
 
-    raise RuntimeError(f"no IRI available for Bioregistry prefix {prefix}")
+    return ParseResults(graph_document=None, messages=messages)
 
 
 def convert_to_obograph_local(