biopragmatics · cthoyt · Dec 5, 2023 · Dec 5, 2023 · Dec 5, 2023 · Jan 22, 2024
diff --git a/src/semra/evaluate_prediction.py b/src/semra/evaluate_prediction.py
@@ -0,0 +1,168 @@
+import itertools as itt
+import logging
+from collections import defaultdict
+from collections.abc import Iterable
+from typing import TYPE_CHECKING
+
+import click
+from tqdm import tqdm
+
+from semra.api import assemble_evidences, get_index
+from semra.io import from_pyobo
+from semra.rules import EXACT_MATCH, LEXICAL_MAPPING
+from semra.struct import Mapping, MappingSet, Reference, SimpleEvidence
+
+if TYPE_CHECKING:
+    import gilda
+
+# Module-level logger, named after this module via ``__name__``
+logger = logging.getLogger(__name__)
+
+
+def _get_v1(positive_set, negative_set, predicted_set):
+    """Count the confusion-matrix cells for a set of predictions.
+
+    :param positive_set: The set of curated items known to be correct
+    :param negative_set: The set of curated items known to be incorrect
+    :param predicted_set: The set of items produced by the predictor
+    :returns: A 4-tuple of counts, in the order true positives, false positives,
+        false negatives, true negatives
+    """
+    true_positives = positive_set.intersection(predicted_set)
+    false_positives = negative_set.intersection(predicted_set)
+    false_negatives = positive_set - predicted_set
+    true_negatives = negative_set - predicted_set
+    return (
+        len(true_positives),
+        len(false_positives),
+        len(false_negatives),
+        len(true_negatives),
+    )
+
+
+def _safe_ratio(numerator: float, denominator: float) -> float:
+    """Divide two numbers, returning NaN instead of raising when the denominator is zero."""
+    if not denominator:
+        return float("nan")
+    return numerator / denominator
+
+
+def evaluate_predictions(
+    *,
+    positive: Iterable[Mapping],
+    negative: Iterable[Mapping],
+    predicted: Iterable[Mapping],
+    tag: str | None = None,
+) -> tuple[float, float, float, float, float]:
+    """Evaluate predicted mappings using ground truth positive and negative mappings.
+
+    Each collection is indexed with :func:`semra.api.get_index` and the mappings
+    are compared through the keys of those indexes.
+
+    :param positive: Curated mappings that are considered correct
+    :param negative: Curated mappings that are considered incorrect
+    :param predicted: Mappings produced by the prediction workflow
+    :param tag: An optional label that is prepended (in square brackets) to the
+        logged summary message
+    :returns: A 5-tuple of ``(completion, accuracy, precision, recall, f1)``.
+        *Completion* is the fraction of predictions that appear in either
+        curated set. Any metric whose denominator is zero (e.g., precision
+        when no prediction overlaps the curated sets) is undefined and
+        is returned as ``nan`` rather than raising :class:`ZeroDivisionError`.
+    """
+    positive_index = get_index(positive, progress=False)
+    negative_index = get_index(negative, progress=False)
+    predicted_index = get_index(predicted, progress=False)
+
+    positive_set = set(positive_index)
+    negative_set = set(negative_index)
+    predicted_set = set(predicted_index)
+
+    tp, fp, fn, tn = _get_v1(positive_set, negative_set, predicted_set)
+
+    # predictions that have not been curated as either positive or negative
+    predicted_only = len(predicted_set - positive_set - negative_set)
+    union_len = len(positive_set.union(predicted_set).union(negative_set))
+
+    msg = f"union={union_len:,}, intersection={tp:,}, curated={fn:,}, predicted={predicted_only:,}"
+    if tag is not None:
+        msg = f"[{tag}] {msg}"
+    logger.info(msg)
+
+    # Every denominator below can be zero for sparse inputs (no curated
+    # mappings, no predictions, or no overlap between them), so all ratios
+    # go through the guarded division.
+    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
+    recall = _safe_ratio(tp, tp + fn)
+    precision = _safe_ratio(tp, tp + fp)
+    f1 = _safe_ratio(2 * tp, 2 * tp + fp + fn)
+    completion = 1 - _safe_ratio(predicted_only, len(predicted_set))
+
+    return completion, accuracy, precision, recall, f1
+
+
+def _index_text(grounder: "gilda.Grounder") -> dict[str, list["gilda.Term"]]:
+    """Group all terms stored in a grounder by their ``text`` attribute.
+
+    :param grounder: A grounder whose ``entries`` attribute maps to lists of terms
+    :returns: A dictionary from each term text to the list of terms having that text,
+        in the order they are encountered in ``grounder.entries``
+    """
+    text_to_terms: dict[str, list["gilda.Term"]] = {}
+    for entry_terms in grounder.entries.values():
+        for entry_term in entry_terms:
+            text_to_terms.setdefault(entry_term.text, []).append(entry_term)
+    return text_to_terms
+
+
+def _grounder_to_mappings(grounders: dict[str, "gilda.Grounder"]) -> Iterable[tuple["gilda.Term", "gilda.Term", float]]:
+    """Yield lexical matches between the terms of every pair of grounders.
+
+    For each unordered pair of grounders, every text indexed from the second
+    grounder is grounded with the first grounder, and each resulting scored
+    match is combined with every term that has that text.
+
+    :param grounders: A dictionary from prefix to the grounder for that prefix
+    :yields: Triples of (term from the second grounder, matched term from the
+        first grounder, score of the match)
+    """
+    text_indexes = {
+        prefix: _index_text(grounder) for prefix, grounder in tqdm(grounders.items(), desc="Indexing texts")
+    }
+    for (p1, g1), (p2, _g2) in tqdm(
+        itt.combinations(grounders.items(), 2), unit_scale=True, desc="Generating mappings"
+    ):
+        text_to_terms = text_indexes[p2]
+        # The loop variable must not reuse the name of the per-prefix index:
+        # rebinding it would break the lookup above on the next pair of grounders.
+        for text, subject_terms in tqdm(text_to_terms.items(), unit_scale=True, desc=f"{p1}-{p2} lexical"):
+            scored_matches = g1.ground(text)
+            # there are lots of ways to do this, now we do all-by-all
+            for subject_term, scored_match in itt.product(subject_terms, scored_matches):
+                yield subject_term, scored_match.term, scored_match.score
+
+
+#: The default confidence given to the mapping set of Gilda-generated mappings
+GILDA_CONFIDENCE = 0.9
+
+
+def grounder_to_mappings(grounders: dict[str, "gilda.Grounder"]) -> list[Mapping]:
+    """Predict exact match mappings between the terms of the given grounders.
+
+    :param grounders: A dictionary from prefix to the grounder for that prefix
+    :returns: The result of :func:`semra.api.assemble_evidences` applied to one
+        mapping per lexical match, each carrying a single lexical mapping
+        evidence whose confidence is the score of the match
+    """
+    prefixes_text = ", ".join(sorted(grounders))
+    gilda_mapping_set = MappingSet(
+        name=f"Gilda predicted mappings for {prefixes_text}",
+        confidence=GILDA_CONFIDENCE,
+    )
+    predictions = [
+        Mapping(
+            s=Reference(prefix=subject_term.db, identifier=subject_term.id),
+            p=EXACT_MATCH,
+            o=Reference(prefix=object_term.db, identifier=object_term.id),
+            evidence=[
+                SimpleEvidence(justification=LEXICAL_MAPPING, mapping_set=gilda_mapping_set, confidence=score),
+            ],
+        )
+        for subject_term, object_term, score in _grounder_to_mappings(grounders)
+    ]
+    return assemble_evidences(predictions, progress=False)
+
+
+@click.command()
+def main():
+    # Evaluate Gilda-predicted mappings between MeSH and several ontologies
+    # against curated and ontology-provided mappings, then print a summary table.
+    # NOTE: documented with comments rather than a docstring, since click would
+    # otherwise show the docstring as the command's --help text.
+    import pyobo.gilda_utils
+    import pystow
+    from tabulate import tabulate
+
+    from semra.api import infer_reversible, keep_prefixes
+    from semra.io import from_sssom, write_sssom
+    from semra.sources import from_biomappings_negative, get_biomappings_positive_mappings, get_clo_mappings
+
+    curated_positive = infer_reversible(get_biomappings_positive_mappings(), progress=False)
+    click.echo(f"Got {len(curated_positive):,} positive mappings")
+
+    curated_negative = infer_reversible(from_biomappings_negative(), progress=False)
+    click.echo(f"Got {len(curated_negative):,} negative mappings")
+
+    table_rows = []
+    mesh_grounder = pyobo.gilda_utils.get_grounder("mesh", versions="2023")
+    for prefix in sorted(["chebi", "maxo", "cl", "doid", "go", "uberon", "vo", "clo"]):
+        cache_path = pystow.join("semra", "evaluation_prediction", name=f"evaluation_prediction_sample_{prefix}.tsv")
+
+        # Predictions are generated once and then reloaded from the cached file
+        if not cache_path.is_file():
+            pair_grounders = {"mesh": mesh_grounder, prefix: pyobo.gilda_utils.get_grounder(prefix)}
+            predictions = grounder_to_mappings(pair_grounders)
+            click.echo(f"Got {len(predictions):,} predicted mappings")
+            predictions = infer_reversible(predictions, progress=False)
+            write_sssom(predictions, cache_path)
+        else:
+            predictions = from_sssom(cache_path, mapping_set_name="gilda predictions")
+
+        # CLO mappings come from a dedicated source; all others are loaded via PyOBO
+        if prefix != "clo":
+            from_ontology = from_pyobo(prefix, "mesh")
+        else:
+            from_ontology = keep_prefixes(get_clo_mappings(), [prefix, "mesh"], progress=False)
+        from_ontology = infer_reversible(from_ontology, progress=False)
+        click.echo(f"[{prefix}] got {len(from_ontology):,} mappings from the ontology")
+
+        positive_subset = keep_prefixes(curated_positive, [prefix, "mesh"], progress=False)
+        negative_subset = keep_prefixes(curated_negative, [prefix, "mesh"], progress=False)
+        metrics = evaluate_predictions(
+            positive=itt.chain(positive_subset, from_ontology),
+            negative=negative_subset,
+            predicted=predictions,
+            tag=prefix,
+        )
+        table_rows.append((f"[{prefix}](https://bioregistry.io/{prefix})", *metrics))
+
+    summary = tabulate(
+        table_rows,
+        headers=["prefix", "completion", "accuracy", "precision", "recall", "f1"],
+        floatfmt=".1%",
+        tablefmt="github",
+    )
+    click.echo(summary)
+
+
+# Run the evaluation command when this module is executed as a script
+if __name__ == "__main__":
+    main()
diff --git a/src/semra/io.py b/src/semra/io.py
@@ -194,7 +194,7 @@ def _from_df(
 
 def from_pyobo(prefix: str, target_prefix: str | None = None, *, standardize: bool = False, **kwargs) -> list[Mapping]:
     if target_prefix:
-        return _from_pyobo_pair(prefix, target_prefix, standardize=standardize, **kwargs)
+        return _from_pyobo_pair(prefix, target_prefix, **kwargs)
     return _from_pyobo_prefix(prefix, standardize=standardize, **kwargs)