Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Track obsoletions in EC #182

Merged
merged 3 commits into from
Apr 17, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions src/pyobo/sources/expasy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

import logging
from collections import defaultdict
from typing import Dict, Iterable, Mapping, Optional, Set, Tuple
from typing import Any, Dict, Iterable, Mapping, Optional, Set, Tuple

from .utils import get_go_mapping
from ..struct import Obo, Reference, Synonym, Term
from ..struct.typedef import enables, has_member
from ..struct.typedef import enables, has_member, term_replaced_by
from ..utils.path import ensure_path

__all__ = [
Expand Down Expand Up @@ -93,12 +93,29 @@ def get_terms(version: str, force: bool = False) -> Iterable[Term]:

database_path = ensure_path(PREFIX, url=EXPASY_DATABASE_URL, version=version)
with open(database_path) as file:
_data = get_database(file)
id_to_data = get_database(file)

ec2go = get_ec2go(version=version)

ec_code_to_alt_ids = {}
for ec_code, data in _data.items():
for ec_code, data in id_to_data.items():
if data.get("deleted"):
terms[ec_code] = Term(
reference=Reference(prefix=PREFIX, identifier=ec_code), is_obsolete=True
)
continue

transfer_ids = data.get("transfer_id")
if transfer_ids:
term = terms[ec_code] = Term(
reference=Reference(prefix=PREFIX, identifier=ec_code), is_obsolete=True
)
for transfer_id in transfer_ids:
term.append_relationship(
term_replaced_by, Reference(prefix=PREFIX, identifier=transfer_id)
)
continue

parent_ec_code = data["parent"]["identifier"]
parent_term = terms[parent_ec_code]

Expand Down Expand Up @@ -210,7 +227,7 @@ def get_database(lines: Iterable[str]) -> Mapping:
for groups in _group_by_id(lines):
_, expasy_id = groups[0]

rv[expasy_id] = ec_data_entry = {
ec_data_entry: Dict[str, Any] = {
"concept": {
"namespace": PREFIX,
"identifier": expasy_id,
Expand All @@ -230,10 +247,10 @@ def get_database(lines: Iterable[str]) -> Mapping:
if descriptor == "//":
continue
elif descriptor == DE and value == "Deleted entry.":
continue
ec_data_entry["deleted"] = True
elif descriptor == DE and value.startswith("Transferred entry: "):
value = value[len("Transferred entry: ") :].rstrip()
ec_data_entry["transfer_id"] = value
value = value[len("Transferred entry: ") :].rstrip().rstrip(".")
ec_data_entry["transfer_id"] = value.split(" and ")
elif descriptor == DE:
ec_data_entry["concept"]["name"] = value.rstrip(".") # type:ignore
elif descriptor == AN:
Expand All @@ -259,11 +276,7 @@ def get_database(lines: Iterable[str]) -> Mapping:
)
)

for expasy_id, data in rv.items():
transfer_id = data.pop("transfer_id", None)
if transfer_id is not None:
rv[expasy_id]["alt_ids"].append(transfer_id) # type:ignore

rv[expasy_id] = ec_data_entry
return rv


Expand Down
Loading