Skip to content

Commit

Permalink
Updating template parser
Browse files Browse the repository at this point in the history
  • Loading branch information
pnrobinson committed Nov 3, 2024
1 parent 9e5d3ee commit 27565e3
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 25 deletions.
2 changes: 1 addition & 1 deletion src/pyphetools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from . import validation


__version__ = "0.9.111"
__version__ = "0.9.112"


__all__ = [
Expand Down
41 changes: 17 additions & 24 deletions src/pyphetools/creation/case_template_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
CellType = Enum('Celltype', ['DATA', 'HPO', 'NTR', 'MISC', 'NULL'])

class CellEncoder(metaclass=abc.ABCMeta):
"""Superclass for classes that help to decode contents of individuals cells in the pyphetools Excel template.
"""
def __init__(self, name):
self._name = name

Expand Down Expand Up @@ -181,24 +183,22 @@ def columntype(self) -> "CellType":

EXPECTED_HEADERS = {"PMID", "title", "individual_id", "comment", "disease_id", "disease_label",
"HGNC_id", "gene_symbol", "transcript", "allele_1", "allele_2",
"variant.comment", "age_of_onset", "age_at_last_encounter", "sex"}
"variant.comment", "age_of_onset", "age_at_last_encounter", "deceased", "sex"}

DATA_ITEMS = {"PMID", "title", "individual_id", "disease_id", "disease_label", "HGNC_id",
"gene_symbol", "transcript", "allele_1", "allele_2", "age_of_onset",
"age_at_last_encounter", "deceased", "sex"}

# note that the allele_2 field is option
REQUIRED_H1_FIELDS = ["PMID", "title", "individual_id", "comment", "disease_id", "disease_label",
"HGNC_id", "gene_symbol", "transcript", "allele_1", "allele_2", "variant.comment", "age_of_onset", "age_at_last_encounter", "sex", "HPO"]
"HGNC_id", "gene_symbol", "transcript", "allele_1", "allele_2", "variant.comment",
"age_of_onset", "age_at_last_encounter", "deceased", "sex", "HPO"]
ALLELE_2_IDX = 8

REQUIRED_H2_FIELDS = ["CURIE", "str", "str", "optional", "CURIE", "str", "CURIE", "str",
"str","str","str","optional", "age", "age", "M:F:O:U", "na"]
"str","str","str","optional", "age", "age", "yes/no/na", "M:F:O:U", "na"]


OPTIONAL_H1_FIELDS = REQUIRED_H1_FIELDS
OPTIONAL_H1_FIELDS.insert(14, "deceased")
OPTIONAL_H2_FIELDS = REQUIRED_H2_FIELDS
OPTIONAL_H2_FIELDS.insert(14, "yes/no/na")


class CaseTemplateEncoder:
Expand Down Expand Up @@ -229,13 +229,8 @@ def __init__(self, df:pd.DataFrame, hpo_cr:HpoConceptRecognizer, created_by:str,
raise ValueError("headers are different lengths. Check template file for correctness.")
# check headers are well formed
idx = 0
## The deceased field is optional
if "deceased" in header_1:
required_h1 = OPTIONAL_H1_FIELDS
required_h2 = OPTIONAL_H2_FIELDS
else:
required_h1 = REQUIRED_H1_FIELDS
required_h2 = REQUIRED_H2_FIELDS
required_h1 = REQUIRED_H1_FIELDS
required_h2 = REQUIRED_H2_FIELDS
for i in range(len(required_h1)):
if idx == ALLELE_2_IDX and header_1[idx] != required_h1[idx]:
idx += 1 # skip optional index
Expand Down Expand Up @@ -286,8 +281,6 @@ def _process_header(self,
elif not in_hpo_range:
if h1 in EXPECTED_HEADERS:
index_to_decoder_d[i] = DataEncoder(h1=h1, h2=h2)
elif h1 == "deceased":
index_to_decoder_d[i] = DataEncoder(h1=h1, h2=h2)
else:
raise ValueError(f"Malformed template header at column {i}: \"{h1}\"")
elif in_hpo_range:
Expand Down Expand Up @@ -417,14 +410,14 @@ def _parse_individual(self,
else:
encounter_age = None
vitStat = None
if "deceased" in data_items:
decsd = data_items.get("deceased")
if decsd == "yes" and encounter_age is not None:
vitStat = VitalStatus(status=VitalStatus.Status.DECEASED, time_of_death=encounter_age)
elif decsd == "no":
vitStat = VitalStatus(status=VitalStatus.Status.ALIVE)
else:
vitStat = VitalStatus(status=VitalStatus.Status.UNKNOWN_STATUS)
# deceased is a required field from version 0.9.112 on
decsd = data_items.get("deceased")
if decsd == "yes" and encounter_age is not None:
vitStat = VitalStatus(status=VitalStatus.Status.DECEASED, time_of_death=encounter_age)
elif decsd == "no":
vitStat = VitalStatus(status=VitalStatus.Status.ALIVE)
else:
vitStat = VitalStatus(status=VitalStatus.Status.UNKNOWN_STATUS)
disease_id = data_items.get("disease_id")
disease_label = data_items.get("disease_label")
# common error -- e.g. PMID: 3000312 or OMIM: 600123 (whitespace after colon)
Expand Down

0 comments on commit 27565e3

Please sign in to comment.