From bb94e948269f09a4bfb84db04cd58380c86a6d2f Mon Sep 17 00:00:00 2001 From: Charles Tapley Hoyt Date: Wed, 24 Apr 2024 12:24:33 +0200 Subject: [PATCH] Improvements to full mapping database build 1. Add zenodo uploads 2. Write configuration to output directory 3. Demonstrate automated upload on protein complex landscape 4. Add automated upload to full database build --- setup.cfg | 1 + src/semra/database.py | 82 ++++++++++++++++++++++++-------- src/semra/landscape/complexes.py | 33 ++++++++++++- src/semra/pipeline.py | 7 +++ src/semra/rules.py | 1 + 5 files changed, 102 insertions(+), 22 deletions(-) diff --git a/setup.cfg b/setup.cfg index 98b319e..c0d66a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ install_requires = bioontologies pyobo typing_extensions + zenodo_client # Random options zip_safe = false diff --git a/src/semra/database.py b/src/semra/database.py index d684f64..a42cde9 100644 --- a/src/semra/database.py +++ b/src/semra/database.py @@ -1,7 +1,8 @@ """Assemble a database.""" -import pickle +import csv import time +import typing as t import bioregistry import click @@ -11,21 +12,31 @@ from bioontologies.robot import write_getter_warnings from tqdm.auto import tqdm from tqdm.contrib.logging import logging_redirect_tqdm - -from semra.io import from_bioontologies, from_pyobo, write_neo4j, write_pickle, write_sssom +from zenodo_client import Creator, Metadata, ensure_zenodo + +from semra import Mapping +from semra.io import ( + from_bioontologies, + from_pickle, + from_pyobo, + write_neo4j, + write_pickle, + write_sssom, +) +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID from semra.sources import SOURCE_RESOLVER MODULE = pystow.module("semra", "database") SOURCES = MODULE.module("sources") -DATABASE_PATH = MODULE.join(name="sssom.tsv") -WARNINGS_PATH = MODULE.join("logs", name="warnings.tsv") -ERRORS_PATH = MODULE.join("logs", name="errors.tsv") -SUMMARY_PATH = MODULE.join("logs", name="summary.tsv") -EMPTY_PATH = MODULE.join("logs", 
name="empty.txt") +LOGS = MODULE.module("logs") +SSSOM_PATH = MODULE.join(name="sssom.tsv") +WARNINGS_PATH = LOGS.join(name="warnings.tsv") +ERRORS_PATH = LOGS.join(name="errors.tsv") +SUMMARY_PATH = LOGS.join(name="summary.tsv") +EMPTY_PATH = LOGS.join(name="empty.txt") NEO4J_DIR = MODULE.join("neo4j") EMPTY = [] - summaries = [] @@ -45,6 +56,8 @@ def main(): "edam.format", "edam.operation", "edam.topic", + "gwascentral.phenotype", # added on 2024-04-24, service down + "gwascentral.study", # added on 2024-04-24, service down } #: A set of prefixes whose obo files need to be parsed without ROBOT checks loose = { @@ -79,7 +92,7 @@ def main(): continue _write_source(resource_mappings, resource.prefix) mappings.extend(resource_mappings) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "pyobo")) _write_summary() it = tqdm(list(SOURCE_RESOLVER), unit="source", desc="Custom sources") @@ -91,7 +104,7 @@ def main(): resource_mappings = func() _write_source(resource_mappings, resource_name) mappings.extend(resource_mappings) - summaries.append((resource_name, len(resource_mappings), time.time() - start)) + summaries.append((resource_name, len(resource_mappings), time.time() - start, "custom")) _write_summary() it = tqdm(ontology_resources, unit="ontology", desc="Ontology sources") @@ -99,7 +112,7 @@ def main(): it.set_postfix(prefix=resource.prefix) path = SOURCES.join(name=f"{resource.prefix}.pkl") if path.is_file(): - resource_mappings = pickle.loads(path.read_bytes()) + resource_mappings = from_pickle(path) else: start = time.time() try: @@ -112,18 +125,43 @@ def main(): # this outputs on each iteration to get faster insight write_warned(WARNINGS_PATH) write_getter_warnings(ERRORS_PATH) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, 
"bioontologies")) _write_summary() mappings.extend(resource_mappings) - click.echo(f"Writing SSSOM to {DATABASE_PATH}") - write_sssom(mappings, DATABASE_PATH) - click.echo(f"Writing Neo4j folder to {DATABASE_PATH}") + click.echo(f"Writing SSSOM to {SSSOM_PATH}") + write_sssom(mappings, SSSOM_PATH) + click.echo(f"Writing Neo4j folder to {NEO4J_DIR}") write_neo4j(mappings, NEO4J_DIR) - -def _write_source(mappings, key): + # Define the metadata that will be used on initial upload + zenodo_metadata = Metadata( + title="SeMRA Mapping Database", + upload_type="dataset", + description=f"A compendium of mappings extracted from {len(summaries)} databases/ontologies. " + f"Note that primary mappings are marked with the license of their source (when available). " + f"Inferred mappings are distributed under the CC0 license.", + creators=[ + Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier), + ], + ) + res = ensure_zenodo( + key="semra-database-test-1", + data=zenodo_metadata, + paths=[ + SSSOM_PATH, + WARNINGS_PATH, + ERRORS_PATH, + SUMMARY_PATH, + *NEO4J_DIR.iterdir(), + ], + sandbox=True, + ) + click.echo(res.json()["links"]["html"]) + + +def _write_source(mappings: t.List[Mapping], key: str) -> None: write_pickle(mappings, SOURCES.join(name=f"{key}.pkl")) if mappings: write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv")) @@ -132,8 +170,12 @@ def _write_source(mappings, key): EMPTY_PATH.write_text("\n".join(EMPTY)) -def _write_summary(): - SUMMARY_PATH.write_text("\n".join(f"{p}\t{n:,}\t{round(delta, 3)}" for p, n, delta in summaries)) +def _write_summary() -> None: + with SUMMARY_PATH.open("w") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerow(("prefix", "mappings", "time", "source_type")) + for prefix, n_mappings, time_delta, source_type in summaries: + writer.writerow((prefix, f"{n_mappings:,}", round(time_delta, 3), source_type)) if __name__ == "__main__": diff --git a/src/semra/landscape/complexes.py 
b/src/semra/landscape/complexes.py index 2724b52..c832caa 100644 --- a/src/semra/landscape/complexes.py +++ b/src/semra/landscape/complexes.py @@ -2,8 +2,10 @@ import click import pystow +from zenodo_client import Creator, Metadata, ensure_zenodo from semra.pipeline import Configuration, Input, Mutation +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID __all__ = [ "MODULE", @@ -51,20 +53,47 @@ ], raw_pickle_path=MODULE.join(name="raw.pkl"), raw_sssom_path=MODULE.join(name="raw.sssom.tsv"), - # raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_name="semra-complex", processed_pickle_path=MODULE.join(name="processed.pkl"), processed_sssom_path=MODULE.join(name="processed.sssom.tsv"), processed_neo4j_path=MODULE.join("neo4j"), processed_neo4j_name="semra-complex", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), +) + + +# Define the metadata that will be used on initial upload +ZENODO_METADATA = Metadata( + title="SeMRA Protein Complex Mapping Database", + upload_type="dataset", + description=CONFIGURATION.description, + creators=[ + Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier), + ], ) @click.command() def main(): """Build the mapping database for protein complex terms.""" - CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + # CONFIGURATION.get_mappings(refresh_raw=False, refresh_processed=False) + + res = ensure_zenodo( + key="semra-complex", + data=ZENODO_METADATA, + paths=[ + CONFIGURATION.raw_sssom_path, + CONFIGURATION.configuration_path, + CONFIGURATION.processed_sssom_path, + CONFIGURATION.priority_sssom_path, + *CONFIGURATION.raw_neo4j_path.iterdir(), + ], + sandbox=True, + ) + click.echo(res.json()["links"]["html"]) if __name__ == "__main__": diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 7c9514d..f29b5ac 100644 --- 
a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -138,6 +138,8 @@ class Configuration(BaseModel): add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?") + configuration_path: Optional[Path] = Field(None, description="The path where this configuration should be written.") + @root_validator(skip_on_failure=True) def infer_priority(cls, values): # noqa:N805 """Infer the priority from the input list of not given.""" @@ -214,6 +216,11 @@ def get_mappings_from_config( "loaded cached raw mappings from %s in %.2f seconds", configuration.raw_pickle_path, time.time() - start ) else: + if configuration.configuration_path is not None: + configuration.configuration_path.write_text( + configuration.model_dump_json(exclude_none=True, exclude_unset=True, indent=2) + ) + raw_mappings = get_raw_mappings(configuration) if configuration.validate_raw: validate_mappings(raw_mappings) diff --git a/src/semra/rules.py b/src/semra/rules.py index 1d44c18..7e92d93 100644 --- a/src/semra/rules.py +++ b/src/semra/rules.py @@ -44,4 +44,5 @@ KNOWLEDGE_MAPPING = Reference.from_curie("semapv:BackgroundKnowledgeBasedMatching") CHARLIE_ORCID = Reference.from_curie("orcid:0000-0003-4423-4370") +CHARLIE_NAME = "Charles Tapley Hoyt" BEN_ORCID = Reference.from_curie("orcid:0000-0001-9439-5346")