Skip to content

Commit

Permalink
Update parsing and standardization
Browse files (browse the repository at this point in the history)
  • Loading branch information
cthoyt committed May 28, 2022
1 parent 8ad40c5 commit 854d009
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 15 deletions.
36 changes: 28 additions & 8 deletions src/bioontologies/obograph.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from collections import defaultdict
from operator import attrgetter
from typing import Any, Iterable, List, Mapping, Optional, Set, Union
from typing import Any, Iterable, List, Mapping, Optional, Set, Tuple, Union

from bioregistry import normalize_curie
from pydantic import BaseModel
Expand Down Expand Up @@ -81,6 +81,10 @@ class Edge(BaseModel):
obj: str
meta: Optional[Meta]

def as_tuple(self) -> Tuple[str, str, str]:
    """Get the edge as a tuple.

    :returns: A three-tuple of this edge's ``sub``, ``pred``, and ``obj``
        attributes, in that order.
    """
    return self.sub, self.pred, self.obj


class Node(BaseModel):
"""Represents a node in an OBO Graph."""
Expand Down Expand Up @@ -189,18 +193,22 @@ def _get_properties(self, pred: Union[str, List[str]]) -> List[str]:
if any(prop.pred == p for p in pred)
]

def standardize(self, keep_invalid: bool = False) -> "Graph":
def standardize(self, keep_invalid: bool = False, use_tqdm: bool = True) -> "Graph":
"""Standardize the OBO graph.
:param keep_invalid: Should CURIEs/IRIs that aren't handled
by the Bioregistry be kept? Defaults to false.
:param use_tqdm:
Should a progress bar be used?
:returns: This OBO graph, modified in place as follows:
1. Convert IRIs to CURIEs (in many places) using :mod:`bioregistry`
2. Add alternative identifiers to :class:`Node` objects
"""
# Convert URIs to CURIEs
for node in tqdm(self.nodes, desc="standardizing nodes", unit_scale=True):
for node in tqdm(
self.nodes, desc="standardizing nodes", unit_scale=True, disable=not use_tqdm
):
if node.id.startswith(OBO_URI_PREFIX):
node.id = _clean_uri(node.id, keep_invalid=True) # type:ignore
if node.meta:
Expand Down Expand Up @@ -231,7 +239,9 @@ def standardize(self, keep_invalid: bool = False) -> "Graph":
xrefs.append(xref)
node.meta.xrefs = sorted(xrefs, key=attrgetter("val"))

for edge in tqdm(self.edges, desc="standardizing edges", unit_scale=True):
for edge in tqdm(
self.edges, desc="standardizing edges", unit_scale=True, disable=not use_tqdm
):
edge.sub = _clean_uri(edge.sub, keep_invalid=True)
edge.pred = _clean_uri(edge.pred, keep_invalid=True)
edge.obj = _clean_uri(edge.obj, keep_invalid=True)
Expand Down Expand Up @@ -274,12 +284,18 @@ def _clean_uri(s: str, *, keep_invalid: bool) -> Optional[str]:
return None


IS_A_STRINGS = {"is_a", "isa"}
IS_A_STRINGS = {
"is_a",
"isa",
"type", # used for instance to class
}


def _compress_uri(s: str) -> str:
if s in IS_A_STRINGS:
return "rdfs:subClassOf"
if s == "subPropertyOf":
return "rdfs:subPropertyOf"
if s.startswith(OBO_URI_PREFIX):
s = s[len(OBO_URI_PREFIX) :]
if "_" in s and s.split("_")[1].isnumeric(): # best guess that it's an identifier
Expand All @@ -291,9 +307,13 @@ def _compress_uri(s: str) -> str:
return s
else:
return s.replace("/", ":", 1)
if s.startswith("http://www.geneontology.org/formats/oboInOwl#"):
s = s[len("http://www.geneontology.org/formats/oboInOwl#") :]
s = "oboinowl:" + s
for uri_prefix, prefix in [
("http://www.geneontology.org/formats/oboInOwl#", "oboinowl"),
("http://www.w3.org/2002/07/owl#", "owl"),
]:
if s.startswith(uri_prefix):
s = s[len(uri_prefix) :]
s = f"{prefix}:{s}"
return s


Expand Down
24 changes: 17 additions & 7 deletions src/bioontologies/robot.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@
class ParseResults:
"""A dataclass containing an OBO Graph JSON and text output from ROBOT."""

graph_document: GraphDocument
graph_document: Optional[GraphDocument]
messages: List[str] = dataclasses.field(default_factory=list)

def squeeze(self) -> Graph:
    """Get the first graph.

    :returns: The first element of ``graph_document.graphs``.
    :raises ValueError: If ``graph_document`` is ``None``; the accumulated
        ``messages`` are included in the error text.
    """
    # A missing document means no parse attempt succeeded, so surface the
    # collected messages instead of failing on attribute access below.
    if self.graph_document is None:
        raise ValueError(f"graph document was not successfully parsed: {self.messages}")
    return self.graph_document.graphs[0]


Expand All @@ -63,29 +65,37 @@ def get_obograph_by_prefix(
if prefix != bioregistry.normalize_prefix(prefix):
raise ValueError("this function requires bioregistry canonical prefixes")

messages = []
json_iri = bioregistry.get_json_download(prefix)

if json_iri is not None:
res_json = requests.get(json_iri).json()
graph_document = GraphDocument(**res_json)
return ParseResults(graph_document=graph_document)
try:
return get_obograph_by_iri(json_iri)
except (IOError, ValueError):
msg = f"could not parse JSON for {prefix} from {json_iri}"
messages.append(msg)
logger.warning(msg)

owl_iri = bioregistry.get_owl_download(prefix)
obo_iri = bioregistry.get_obo_download(prefix)

for iri in [owl_iri, obo_iri]:
for label, iri in [("OWL", owl_iri), ("OBO", obo_iri)]:
if iri is None:
continue

try:
parse_results = convert_to_obograph_remote(iri, json_path=json_path)
except subprocess.CalledProcessError:
logger.warning("could not parse OBO for %s from %s", prefix, iri)
msg = f"could not parse {label} for {prefix} from {iri}"
messages.append(msg)
logger.warning(msg)
continue
else:
# stick all messages before
parse_results.messages = [*messages, *parse_results.messages]
return parse_results

raise RuntimeError(f"no IRI available for Bioregistry prefix {prefix}")
return ParseResults(graph_document=None, messages=messages)


def convert_to_obograph_local(
Expand Down

0 comments on commit 854d009

Please sign in to comment.