From 4c4f169058b870978ef8f6a2b2b6052ec7708805 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 1 Feb 2022 11:12:16 +0100 Subject: [PATCH 1/4] Add discrete weighted ORA --- .../client/enrichment/discrete_weighted.py | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 src/indra_cogex/client/enrichment/discrete_weighted.py diff --git a/src/indra_cogex/client/enrichment/discrete_weighted.py b/src/indra_cogex/client/enrichment/discrete_weighted.py new file mode 100644 index 000000000..b97f35b94 --- /dev/null +++ b/src/indra_cogex/client/enrichment/discrete_weighted.py @@ -0,0 +1,208 @@ +# -*- coding: utf-8 -*- + +"""Weighted ORA.""" + +import pickle +from functools import lru_cache +from typing import Iterable, List, Mapping, Tuple + +import numpy as np +import pandas as pd +import pystow +from scipy.stats import fisher_exact + +from indra_cogex.client.enrichment.discrete import EXAMPLE_GENE_IDS, count_human_genes +from indra_cogex.client.neo4j_client import Neo4jClient + +ENTITY_TO_TARGETS_CYPHER = """\ +MATCH (regulator:BioEntity)-[r:indra_rel]->(gene:BioEntity) +WHERE + gene.id STARTS WITH "hgnc" // Collecting human genes only + AND r.stmt_type <> "Complex" // Ignore complexes since they are non-directional + AND NOT regulator.id STARTS WITH "uniprot" // This is a simple way to ignore non-human proteins +RETURN + regulator.id, + regulator.name, + collect({gene:gene.id, evidence_count:r.evidence_count}) +""" + +TEST_RESULTS_PATH = pystow.join("indra", "cogex", name="weighted_ora_test_results.tsv") + + +@lru_cache(maxsize=1) +def _get_data( + *, + client: Neo4jClient, + reload: bool = False, + cutoff: int = 1, +) -> List[Tuple[str, str, Mapping[str, int]]]: + cache_path = pystow.join( + "indra", "cogex", name=f"weighted_ora_test_{cutoff:03d}.pkl" + ) + if cache_path.exists() and not reload: + with cache_path.open("rb") as file: + return pickle.load(file) + rv = [ + ( + curie, + name, + { + collection_row["gene"]: collection_row["evidence_count"] + for collection_row in collection_rows + if cutoff <= collection_row["evidence_count"] + }, + ) + for curie, name, collection_rows in client.query_tx(ENTITY_TO_TARGETS_CYPHER) + ] + with cache_path.open("wb") as file: + pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL) + return rv + + +def indra_upstream_weighted_ora( + gene_ids: Iterable[str], + *, + client: Neo4jClient, + minimum_evidence_count: int = 1, +): + gene_universe = count_human_genes(client=client) + query_weights = { + # TODO need some kind of pre-calculated global adjustment here + gene_id: 1 + for gene_id in gene_ids + } + rows = [] + debug_rows = [] + for curie, name, pathway_weights in _get_data(client=client): + # print(pathway_curie, pathway_name) + # print(pathway_weights) + # The weight for all remaining pathways is estimated by this. + # Lots of room for improvemnt here. Maybe use label smoothing ideas? 
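+        # Rows of the 2x2 table built below correspond to query membership and
+        # columns to membership in this regulator's target set. Genes with an
+        # INDRA edge contribute their evidence count; genes without one
+        # contribute the estimated average weight, so the four cells sum to
+        # gene_universe * estimated_average_weight.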
+ estimated_average_weight = sum(pathway_weights.values()) / gene_universe + print(curie, estimated_average_weight) + intersection = sum( + pathway_weights[gene_id] + for gene_id in set(query_weights).intersection(pathway_weights) + ) + pathway_minus_query = sum( + pathway_weights[gene_id] + for gene_id in set(pathway_weights).difference(query_weights) + ) + query_minus_pathway = sum( + estimated_average_weight + for _ in set(query_weights).difference(pathway_weights) + ) + union = sum((intersection, pathway_minus_query, query_minus_pathway)) + total = gene_universe * estimated_average_weight + bottom_right = total - union + table = np.array( + [ + [ + intersection, + query_minus_pathway, + ], + [ + pathway_minus_query, + bottom_right, + ], + ] + ) + debug_rows.append( + ( + curie, + intersection, + estimated_average_weight, + query_minus_pathway, + pathway_minus_query, + union, + total, + bottom_right, + ) + ) + _, pvalue = fisher_exact(table, alternative="greater") + rows.append((curie, name, pvalue)) + + df = pd.DataFrame(rows, columns=["curie", "name", "p"]).sort_values( + "p", ascending=True + ) + df["mlp"] = -np.log10(df["p"]) + return df + + +def indra_upstream_weighted_ora( + gene_ids: Iterable[str], + *, + client: Neo4jClient, + minimum_evidence_count: int = 1, +): + gene_universe = count_human_genes(client=client) + query_weights = { + # TODO need some kind of pre-calculated global adjustment here + gene_id: 1 + for gene_id in gene_ids + } + rows = [] + debug_rows = [] + for curie, name, pathway_weights in _get_data(client=client): + estimated_average_weight = np.mean( + np.fromiter(pathway_weights.values(), dtype=int) + ).item() + print(curie, estimated_average_weight) + intersection = sum( + pathway_weights[gene_id] + for gene_id in set(query_weights).intersection(pathway_weights) + ) + pathway_minus_query = sum( + pathway_weights[gene_id] + for gene_id in set(pathway_weights).difference(query_weights) + ) + query_minus_pathway = sum( + estimated_average_weight + for _ in set(query_weights).difference(pathway_weights) + ) + union = sum((intersection, pathway_minus_query, query_minus_pathway)) + total = gene_universe * estimated_average_weight + bottom_right = total - union + table = np.array( + [ + [ + intersection, + query_minus_pathway, + ], + [ + pathway_minus_query, + bottom_right, + ], + ] + ) + debug_rows.append( + ( + curie, + intersection, + estimated_average_weight, + query_minus_pathway, + pathway_minus_query, + union, + total, + bottom_right, + ) + ) + _, pvalue = fisher_exact(table, alternative="greater") + rows.append((curie, name, pvalue)) + + df = pd.DataFrame(rows, columns=["curie", "name", "p"]).sort_values( + "p", ascending=True + ) + df["mlp"] = -np.log10(df["p"]) + return df + + +def _main(): + client = Neo4jClient() + rv = indra_upstream_weighted_ora(gene_ids=EXAMPLE_GENE_IDS, client=client) + rv.to_csv(TEST_RESULTS_PATH, sep="\t", index=False) + print(TEST_RESULTS_PATH) + + +if __name__ == "__main__": + _main() From d7f49e30415a362d64cbebe6619a386c5e6b6fc3 Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 15 Mar 2022 15:03:40 +0100 Subject: [PATCH 2/4] Update discrete weighted ORA implementation --- src/indra_cogex/client/__init__.py | 5 + .../client/enrichment/discrete_weighted.py | 345 +++++++++--------- 2 files changed, 177 insertions(+), 173 deletions(-) create mode 100644 src/indra_cogex/client/__init__.py diff --git a/src/indra_cogex/client/__init__.py b/src/indra_cogex/client/__init__.py new file mode 100644 index 000000000..bd65c7e6d 
--- /dev/null +++ b/src/indra_cogex/client/__init__.py @@ -0,0 +1,5 @@ +"""INDRA CoGEx Client.""" + +from .neo4j_client import * +from .queries import * +from .subnetwork import * diff --git a/src/indra_cogex/client/enrichment/discrete_weighted.py b/src/indra_cogex/client/enrichment/discrete_weighted.py index b97f35b94..a6783ed54 100644 --- a/src/indra_cogex/client/enrichment/discrete_weighted.py +++ b/src/indra_cogex/client/enrichment/discrete_weighted.py @@ -2,207 +2,206 @@ """Weighted ORA.""" -import pickle -from functools import lru_cache -from typing import Iterable, List, Mapping, Tuple +from typing import Mapping, Tuple import numpy as np import pandas as pd import pystow -from scipy.stats import fisher_exact -from indra_cogex.client.enrichment.discrete import EXAMPLE_GENE_IDS, count_human_genes -from indra_cogex.client.neo4j_client import Neo4jClient +from .utils import get_wikipathways +from ..neo4j_client import Neo4jClient, autoclient -ENTITY_TO_TARGETS_CYPHER = """\ -MATCH (regulator:BioEntity)-[r:indra_rel]->(gene:BioEntity) -WHERE - gene.id STARTS WITH "hgnc" // Collecting human genes only - AND r.stmt_type <> "Complex" // Ignore complexes since they are non-directional - AND NOT regulator.id STARTS WITH "uniprot" // This is a simple way to ignore non-human proteins -RETURN - regulator.id, - regulator.name, - collect({gene:gene.id, evidence_count:r.evidence_count}) -""" +__all__ = [ + "get_weighted_contingency", + "get_lookup", + "get_gene_universe", + # Pathway database functions + "wikipathways_weighted_downstream_ora", + "wikipathways_weighted_upstream_ora", +] -TEST_RESULTS_PATH = pystow.join("indra", "cogex", name="weighted_ora_test_results.tsv") +LOOKUP_CACHE_PATH = pystow.join("indra", name="weighted_ora_belief_cache.tsv") +ALL_BELIEFS_CYPHER = """\ +MATCH (h:BioEntity)-[r:indra_rel]->(t:BioEntity) +WHERE + h.id STARTS WITH "hgnc" // Collecting human genes only + AND t.id STARTS WITH "hgnc" + AND r.stmt_type <> "Complex" // Ignore complexes since they are non-directional +RETURN DISTINCT + h.id, t.id, r.stmt_hash, r.belief +""" -@lru_cache(maxsize=1) -def _get_data( - *, - client: Neo4jClient, - reload: bool = False, - cutoff: int = 1, -) -> List[Tuple[str, str, Mapping[str, int]]]: - cache_path = pystow.join( - "indra", "cogex", name=f"weighted_ora_test_{cutoff:03d}.pkl" - ) - if cache_path.exists() and not reload: - with cache_path.open("rb") as file: - return pickle.load(file) - rv = [ - ( - curie, - name, - { - collection_row["gene"]: collection_row["evidence_count"] - for collection_row in collection_rows - if cutoff <= collection_row["evidence_count"] - }, - ) - for curie, name, collection_rows in client.query_tx(ENTITY_TO_TARGETS_CYPHER) - ] - with cache_path.open("wb") as file: - pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL) - return rv +ALL_GENES_CYPHER = """\ +MATCH (n:BioEntity) +WHERE n.id STARTS WITH 'hgnc' +RETURN n.id +""" -def indra_upstream_weighted_ora( - gene_ids: Iterable[str], - *, - client: Neo4jClient, - minimum_evidence_count: int = 1, -): - gene_universe = count_human_genes(client=client) - query_weights = { - # TODO need some kind of pre-calculated global adjustment here - gene_id: 1 - for gene_id in gene_ids - } - rows = [] - debug_rows = [] - for curie, name, pathway_weights in _get_data(client=client): - # print(pathway_curie, pathway_name) - # print(pathway_weights) - # The weight for all remaining pathways is estimated by this. - # Lots of room for improvemnt here. Maybe use label smoothing ideas? 
- estimated_average_weight = sum(pathway_weights.values()) / gene_universe - print(curie, estimated_average_weight) - intersection = sum( - pathway_weights[gene_id] - for gene_id in set(query_weights).intersection(pathway_weights) - ) - pathway_minus_query = sum( - pathway_weights[gene_id] - for gene_id in set(pathway_weights).difference(query_weights) - ) - query_minus_pathway = sum( - estimated_average_weight - for _ in set(query_weights).difference(pathway_weights) - ) - union = sum((intersection, pathway_minus_query, query_minus_pathway)) - total = gene_universe * estimated_average_weight - bottom_right = total - union - table = np.array( +@autoclient(cache=True, maxsize=1) +def get_lookup( + *, client: Neo4jClient, force: bool = False +) -> Mapping[Tuple[str, str], float]: + """Get the source/target to belief lookup table.""" + if LOOKUP_CACHE_PATH.is_file() and not force: + df = pd.read_csv(LOOKUP_CACHE_PATH, sep="\t") + else: + res = client.query_tx(all_beliefs) + df = pd.DataFrame(res, columns=["source", "target", "stmt_hash", "belief"]) + df.to_csv(LOOKUP_CACHE_PATH, sep="\t", index=False) + return df.groupby(["source", "target"])["belief"].max().to_dict() + + +@autoclient(cache=True, maxsize=1) +def get_gene_universe(client: Neo4jClient) -> Set[str]: + return {row[0] for row in client.query_tx(ALL_GENES_CYPHER)} + + +def get_weighted_contingency( + query_genes: set[str], + pathway_genes: set[str], + universe: set[str], # all gene CURIEs + lookup: dict[tuple[str, str], float], + query_is_source: bool = True, +) -> np.ndarray: + a_11, a_12, a_21, a_22 = 0.0, 0.0, 0.0, 0.0 + + for gene in universe: + # TODO could also use mean or median + query_v = np.max( [ - [ - intersection, - query_minus_pathway, - ], - [ - pathway_minus_query, - bottom_right, - ], + lookup.get( + (query_gene, gene) if query_is_source else (gene, query_gene), 0.0 + ) + for query_gene in query_genes ] ) - debug_rows.append( - ( - curie, - intersection, - estimated_average_weight, - query_minus_pathway, - pathway_minus_query, - union, - total, - bottom_right, - ) - ) - _, pvalue = fisher_exact(table, alternative="greater") - rows.append((curie, name, pvalue)) + m_query_v = 1.0 - query_v - df = pd.DataFrame(rows, columns=["curie", "name", "p"]).sort_values( - "p", ascending=True - ) - df["mlp"] = -np.log10(df["p"]) - return df + if gene in pathway_genes: + pathway_v = 1.0 + m_pathway_v = 0.0 + else: + pathway_v = 0.0 + m_pathway_v = 1.0 + + a_11 += query_v * pathway_v + a_12 += query_v * m_pathway_v + a_21 += m_query_v * pathway_v + a_22 += m_query_v * m_pathway_v + return np.array([[a_11, a_12], [a_21, a_22]]) -def indra_upstream_weighted_ora( + +def _do_weighted_ora( + curie_to_hgnc_ids: Dict[Tuple[str, str], Set[str]], gene_ids: Iterable[str], - *, - client: Neo4jClient, - minimum_evidence_count: int = 1, -): - gene_universe = count_human_genes(client=client) - query_weights = { - # TODO need some kind of pre-calculated global adjustment here - gene_id: 1 - for gene_id in gene_ids - } + universe: Set[str], + method: Optional[str] = "fdr_bh", + alpha: Optional[float] = None, + keep_insignificant: bool = True, + query_is_source: bool = True, +) -> pd.DataFrame: + if alpha is None: + alpha = 0.05 + query_gene_set = set(gene_ids) rows = [] - debug_rows = [] - for curie, name, pathway_weights in _get_data(client=client): - estimated_average_weight = np.mean( - np.fromiter(pathway_weights.values(), dtype=int) - ).item() - print(curie, estimated_average_weight) - intersection = sum( - pathway_weights[gene_id] - for 
gene_id in set(query_weights).intersection(pathway_weights) - ) - pathway_minus_query = sum( - pathway_weights[gene_id] - for gene_id in set(pathway_weights).difference(query_weights) - ) - query_minus_pathway = sum( - estimated_average_weight - for _ in set(query_weights).difference(pathway_weights) - ) - union = sum((intersection, pathway_minus_query, query_minus_pathway)) - total = gene_universe * estimated_average_weight - bottom_right = total - union - table = np.array( - [ - [ - intersection, - query_minus_pathway, - ], - [ - pathway_minus_query, - bottom_right, - ], - ] - ) - debug_rows.append( - ( - curie, - intersection, - estimated_average_weight, - query_minus_pathway, - pathway_minus_query, - union, - total, - bottom_right, - ) + for (curie, name), pathway_hgnc_ids in curie_to_hgnc_ids.items(): + table = get_weighted_contingency( + query_gene_set=query_gene_set, + pathway_gene_set=pathway_hgnc_ids, + gene_universe=count, + query_is_source=query_is_source, ) _, pvalue = fisher_exact(table, alternative="greater") rows.append((curie, name, pvalue)) - df = pd.DataFrame(rows, columns=["curie", "name", "p"]).sort_values( "p", ascending=True ) df["mlp"] = -np.log10(df["p"]) + if method: + correction_results = multipletests( + df["p"], + method=method, + is_sorted=True, + alpha=alpha, + ) + df["q"] = correction_results[1] + df["mlq"] = -np.log10(df["q"]) + df = df.sort_values("q", ascending=True) + if not keep_insignificant: + df = df[df["q"] < alpha] return df -def _main(): - client = Neo4jClient() - rv = indra_upstream_weighted_ora(gene_ids=EXAMPLE_GENE_IDS, client=client) - rv.to_csv(TEST_RESULTS_PATH, sep="\t", index=False) - print(TEST_RESULTS_PATH) +def _ora(func, query_is_source, client: Neo4jClient, **kwargs): + universe = get_gene_universe(client=client) + return _do_weighted_ora( + func(client=client), + query_is_source=query_is_source, + universe=universe, + **kwargs, + ) + + +@autoclient() +def wikipathways_weighted_upstream_ora( + gene_ids: Iterable[str], *, client: Neo4jClient, **kwargs +) -> pd.DataFrame: + """Calculate weighted over-representation on all WikiPathway pathways. + + Parameters + ---------- + gene_ids : + List of gene identifiers + client : + Neo4jClient + **kwargs : + Additional keyword arguments to pass to _do_ora + + Returns + ------- + : + DataFrame with columns: + curie, name, p, q, mlp, mlq + """ + return _ora( + func=get_wikipathways, + client=client, + query_is_source=True, + gene_ids=gene_ids, + universe=universe, + **kwargs, + ) -if __name__ == "__main__": - _main() +@autoclient() +def wikipathways_weighted_downstream_ora( + gene_ids: Iterable[str], *, client: Neo4jClient, **kwargs +) -> pd.DataFrame: + """Calculate weighted over-representation on all WikiPathway pathways. 
+ + Parameters + ---------- + gene_ids : + List of gene identifiers + client : + Neo4jClient + **kwargs : + Additional keyword arguments to pass to _do_ora + + Returns + ------- + : + DataFrame with columns: + curie, name, p, q, mlp, mlq + """ + return _ora( + func=get_wikipathways, + client=client, + query_is_source=False, + gene_ids=gene_ids, + universe=universe, + **kwargs, + ) From bd6243638743ca903c1d4589acb2b43e9c86a8bb Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 15 Mar 2022 15:22:15 +0100 Subject: [PATCH 3/4] Add lookup cache --- src/indra_cogex/client/enrichment/discrete_weighted.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/indra_cogex/client/enrichment/discrete_weighted.py b/src/indra_cogex/client/enrichment/discrete_weighted.py index a6783ed54..8990723b8 100644 --- a/src/indra_cogex/client/enrichment/discrete_weighted.py +++ b/src/indra_cogex/client/enrichment/discrete_weighted.py @@ -3,6 +3,7 @@ """Weighted ORA.""" from typing import Mapping, Tuple +import pickle import numpy as np import pandas as pd @@ -21,6 +22,7 @@ ] LOOKUP_CACHE_PATH = pystow.join("indra", name="weighted_ora_belief_cache.tsv") +LOOKUP_DICT_PATH = pystow.join("indra", name="weighted_ora_belief_cache.pkl") ALL_BELIEFS_CYPHER = """\ MATCH (h:BioEntity)-[r:indra_rel]->(t:BioEntity) @@ -44,13 +46,17 @@ def get_lookup( *, client: Neo4jClient, force: bool = False ) -> Mapping[Tuple[str, str], float]: """Get the source/target to belief lookup table.""" + if LOOKUP_DICT_PATH.is_file() and not force: + return pickle.loads(LOOKUP_DICT_PATH.read_bytes()) if LOOKUP_CACHE_PATH.is_file() and not force: df = pd.read_csv(LOOKUP_CACHE_PATH, sep="\t") else: res = client.query_tx(all_beliefs) df = pd.DataFrame(res, columns=["source", "target", "stmt_hash", "belief"]) df.to_csv(LOOKUP_CACHE_PATH, sep="\t", index=False) - return df.groupby(["source", "target"])["belief"].max().to_dict() + rv = df.groupby(["source", "target"])["belief"].max().to_dict() + LOOKUP_DICT_PATH.write_bytes(pickle.dumps(rv)) + return rv @autoclient(cache=True, maxsize=1) From f22768b8155d3a0776e510d311b521b44372d7ee Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Tue, 15 Mar 2022 15:22:24 +0100 Subject: [PATCH 4/4] Cleanup arguments --- .../client/enrichment/discrete_weighted.py | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/indra_cogex/client/enrichment/discrete_weighted.py b/src/indra_cogex/client/enrichment/discrete_weighted.py index 8990723b8..4bd736107 100644 --- a/src/indra_cogex/client/enrichment/discrete_weighted.py +++ b/src/indra_cogex/client/enrichment/discrete_weighted.py @@ -2,12 +2,15 @@ """Weighted ORA.""" -from typing import Mapping, Tuple import pickle +from typing import Iterable, Mapping, Optional, Set, Tuple import numpy as np import pandas as pd import pystow +from scipy.stats import fisher_exact +from statsmodels.stats.multitest import multipletests +from tqdm.auto import tqdm from .utils import get_wikipathways from ..neo4j_client import Neo4jClient, autoclient @@ -60,15 +63,15 @@ def get_lookup( @autoclient(cache=True, maxsize=1) -def get_gene_universe(client: Neo4jClient) -> Set[str]: +def get_gene_universe(*, client: Neo4jClient) -> Set[str]: return {row[0] for row in client.query_tx(ALL_GENES_CYPHER)} def get_weighted_contingency( - query_genes: set[str], - pathway_genes: set[str], - universe: set[str], # all gene CURIEs - lookup: dict[tuple[str, str], float], + query_gene_set: Set[str], + pathway_gene_set: Set[str], + 
universe: Set[str], # all gene CURIEs + lookup: Mapping[Tuple[str, str], float], query_is_source: bool = True, ) -> np.ndarray: a_11, a_12, a_21, a_22 = 0.0, 0.0, 0.0, 0.0 @@ -80,12 +83,12 @@ def get_weighted_contingency( lookup.get( (query_gene, gene) if query_is_source else (gene, query_gene), 0.0 ) - for query_gene in query_genes + for query_gene in query_gene_set ] ) m_query_v = 1.0 - query_v - if gene in pathway_genes: + if gene in pathway_gene_set: pathway_v = 1.0 m_pathway_v = 0.0 else: @@ -101,24 +104,31 @@ def get_weighted_contingency( def _do_weighted_ora( - curie_to_hgnc_ids: Dict[Tuple[str, str], Set[str]], + *, + curie_to_hgnc_ids: Mapping[Tuple[str, str], Set[str]], gene_ids: Iterable[str], universe: Set[str], + lookup: Mapping[Tuple[str, str], float], method: Optional[str] = "fdr_bh", alpha: Optional[float] = None, keep_insignificant: bool = True, query_is_source: bool = True, + use_tqdm: bool = True, ) -> pd.DataFrame: if alpha is None: alpha = 0.05 query_gene_set = set(gene_ids) rows = [] - for (curie, name), pathway_hgnc_ids in curie_to_hgnc_ids.items(): + + _tqdm_kwargs = dict(desc="Weighted ORA", unit="pathway", unit_scale=True) + it = tqdm(curie_to_hgnc_ids.items(), disable=not use_tqdm, **_tqdm_kwargs) + for (curie, name), pathway_hgnc_ids in it: table = get_weighted_contingency( query_gene_set=query_gene_set, pathway_gene_set=pathway_hgnc_ids, - gene_universe=count, + universe=universe, query_is_source=query_is_source, + lookup=lookup, ) _, pvalue = fisher_exact(table, alternative="greater") rows.append((curie, name, pvalue)) @@ -143,10 +153,12 @@ def _do_weighted_ora( def _ora(func, query_is_source, client: Neo4jClient, **kwargs): universe = get_gene_universe(client=client) + lookup = get_lookup(client=client) return _do_weighted_ora( - func(client=client), + curie_to_hgnc_ids=func(client=client), query_is_source=query_is_source, universe=universe, + lookup=lookup, **kwargs, ) @@ -177,7 +189,6 @@ def wikipathways_weighted_upstream_ora( client=client, query_is_source=True, gene_ids=gene_ids, - universe=universe, **kwargs, ) @@ -208,6 +219,5 @@ def wikipathways_weighted_downstream_ora( client=client, query_is_source=False, gene_ids=gene_ids, - universe=universe, **kwargs, )
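The patches above build the weighted contingency table from Neo4j-backed belief scores via get_lookup() and get_gene_universe(). The following is a minimal, self-contained sketch of the same idea on hypothetical toy data (the gene CURIEs, pathway names, and belief values are invented for illustration); it mirrors get_weighted_contingency() and _do_weighted_ora() from the final commit, except that the weighted table is rounded before calling scipy.stats.fisher_exact, since that function expects integer counts.

from typing import Mapping, Set, Tuple

import numpy as np
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests


def weighted_contingency(
    query_genes: Set[str],
    pathway_genes: Set[str],
    universe: Set[str],
    lookup: Mapping[Tuple[str, str], float],
    query_is_source: bool = True,
) -> np.ndarray:
    """Build a belief-weighted 2x2 table, mirroring get_weighted_contingency."""
    a_11 = a_12 = a_21 = a_22 = 0.0
    for gene in universe:
        # Soft query membership: the best belief over edges connecting any
        # query gene to this gene, in the requested direction.
        query_v = max(
            (
                lookup.get((q, gene) if query_is_source else (gene, q), 0.0)
                for q in query_genes
            ),
            default=0.0,
        )
        m_query_v = 1.0 - query_v
        # Hard pathway membership, as in the patch.
        pathway_v = 1.0 if gene in pathway_genes else 0.0
        m_pathway_v = 1.0 - pathway_v
        a_11 += query_v * pathway_v
        a_12 += query_v * m_pathway_v
        a_21 += m_query_v * pathway_v
        a_22 += m_query_v * m_pathway_v
    return np.array([[a_11, a_12], [a_21, a_22]])


def main() -> None:
    # Hypothetical stand-ins for the Neo4j-backed lookups.
    universe = {f"hgnc:{i}" for i in range(1, 21)}
    query_genes = {"hgnc:1", "hgnc:2", "hgnc:3"}
    pathways = {
        ("wikipathways:WP_A", "toy pathway A"): {"hgnc:1", "hgnc:2", "hgnc:5"},
        ("wikipathways:WP_B", "toy pathway B"): {"hgnc:10", "hgnc:11", "hgnc:12"},
    }
    lookup = {("hgnc:1", "hgnc:2"): 0.9, ("hgnc:2", "hgnc:5"): 0.7}

    results, pvalues = [], []
    for (curie, name), pathway_genes in pathways.items():
        table = weighted_contingency(query_genes, pathway_genes, universe, lookup)
        # scipy's fisher_exact expects integer counts, so this sketch rounds
        # the weighted cells; the patch passes the float-valued table directly.
        _, pvalue = fisher_exact(np.rint(table).astype(int), alternative="greater")
        results.append((curie, name))
        pvalues.append(pvalue)

    # Benjamini-Hochberg correction, as in _do_weighted_ora with method="fdr_bh".
    _, qvalues, _, _ = multipletests(pvalues, method="fdr_bh")
    for (curie, name), p, q in zip(results, pvalues, qvalues):
        print(curie, name, f"p={p:.3g}", f"q={q:.3g}")


if __name__ == "__main__":
    main()

As in _do_weighted_ora, the one-sided test (alternative="greater") asks whether the query is over-represented among the pathway's genes; the belief weights simply let poorly supported INDRA statements count for less than well-supported ones.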