diff --git a/docs/user-guide/validation.md b/docs/user-guide/validation.md index 402d43a4..f2c04e69 100644 --- a/docs/user-guide/validation.md +++ b/docs/user-guide/validation.md @@ -42,6 +42,28 @@ Create a [ContentValidator](../api/validation/content_validator.md) for the phen checks that the generated phenopackets have a minimum number of HPO terms, alleles, and variants. ```python title="Generating GA4GH phenopackets from a pyphetools individual list" -validator = ContentValidator(min_var=1, min_hpo=3) -errors = validator.validate_phenopacket_list(ppkt_list) -``` \ No newline at end of file +cohort = [individual1, individual2, individual3] +cvalidator = CohortValidator(cohort=cohort, ontology=hpo_ontology, min_hpo=1, allelic_requirement=AllelicRequirement.MONO_ALLELIC) +validated_individuals = cvalidator.get_validated_individual_list() +qc = QcVisualizer(ontology=hpo_ontology) +display(HTML(qc.to_html(validated_individuals))) +``` + +This will either print a message that no errors were found or show a table with a summary of the errors. If errors were found +with incorrect HPO ids or labels, they need to be corrected in the previous part of the script. If redundancies or ontology conflicts are found, these can be corrected automatically by the following command: + + +```python title="Getting an individual list with corrected ontology errors (clean terms)" +cl_individuals = [vi.get_individual_with_clean_terms() for vi in validated_individuals] +``` + +Then the above analysis can be repeated to check the results. + +```python title="Note the 'cohort' argument is pointing to the corrected individual objects" +cvalidator = CohortValidator(cohort=cl_individuals, ontology=hpo_ontology, min_hpo=1, allelic_requirement=AllelicRequirement.MONO_ALLELIC) +qc = QcVisualizer(ontology=hpo_ontology) +display(HTML(qc.to_html(cvalidator.get_validated_individual_list()))) +``` + + +If this analysis shows no errors, then the script can proceed to [visualize](visualization.md) and output the phenopackets. 
\ No newline at end of file diff --git a/src/pyphetools/__init__.py b/src/pyphetools/__init__.py index 91da7ea6..de953997 100644 --- a/src/pyphetools/__init__.py +++ b/src/pyphetools/__init__.py @@ -4,7 +4,7 @@ from . import validation -__version__ = "0.8.9" +__version__ = "0.8.12" __all__ = [ diff --git a/src/pyphetools/creation/__init__.py b/src/pyphetools/creation/__init__.py index 6c7f6d20..8b18146f 100644 --- a/src/pyphetools/creation/__init__.py +++ b/src/pyphetools/creation/__init__.py @@ -1,5 +1,6 @@ from .age_column_mapper import AgeColumnMapper from .age_isoformater import AgeIsoFormater +from .allelic_requirement import AllelicRequirement from .case_encoder import CaseEncoder from .cohort_encoder import CohortEncoder from .column_mapper import ColumnMapper @@ -27,6 +28,7 @@ __all__ = [ "AgeColumnMapper", "AgeIsoFormater", + "AllelicRequirement", "CaseEncoder" , "CohortEncoder", "ColumnMapper", diff --git a/src/pyphetools/creation/allelic_requirement.py b/src/pyphetools/creation/allelic_requirement.py new file mode 100644 index 00000000..764d033f --- /dev/null +++ b/src/pyphetools/creation/allelic_requirement.py @@ -0,0 +1,5 @@ +from enum import Enum + +class AllelicRequirement(Enum): + MONO_ALLELIC = "monoallelic" + BI_ALLELIC = "biallelic" diff --git a/src/pyphetools/creation/case_encoder.py b/src/pyphetools/creation/case_encoder.py index 9687ceba..b5fae2c8 100644 --- a/src/pyphetools/creation/case_encoder.py +++ b/src/pyphetools/creation/case_encoder.py @@ -221,6 +221,7 @@ def get_individual(self)-> Individual: def get_phenopacket(self): """ :return: the GA4GH phenopacket corresponding to the current case report + :rtype: PPKt.Phenopacket """ individual = self.get_individual() pmid = individual.pmid diff --git a/src/pyphetools/creation/constants.py b/src/pyphetools/creation/constants.py index 519a7198..1a77f846 100644 --- a/src/pyphetools/creation/constants.py +++ b/src/pyphetools/creation/constants.py @@ -11,4 +11,5 @@ class Constants: FEMALE_SYMBOL = 
'FEMALE' OTHER_SEX_SYMBOL = 'OTHER' UNKOWN_SEX_SYMBOL = 'UNKNOWN' - NOT_PROVIDED = 'NOT_PROVIDED' \ No newline at end of file + NOT_PROVIDED = 'NOT_PROVIDED' + diff --git a/src/pyphetools/creation/individual.py b/src/pyphetools/creation/individual.py index c892a0e5..0bbbb209 100644 --- a/src/pyphetools/creation/individual.py +++ b/src/pyphetools/creation/individual.py @@ -30,12 +30,12 @@ class Individual: def __init__(self, individual_id:str, - hpo_terms:List[HpTerm]=[], + hpo_terms:List[HpTerm]=None, pmid:str=None, title:str=None, sex:str=Constants.NOT_PROVIDED, age:str=Constants.NOT_PROVIDED, - interpretation_list:List[PPKt.VariantInterpretation]=[], + interpretation_list:List[PPKt.VariantInterpretation]=None, disease:Disease=None): """Constructor """ @@ -50,8 +50,14 @@ def __init__(self, else: self._sex = sex self._age = age - self._hpo_terms = hpo_terms - self._interpretation_list = interpretation_list + if hpo_terms is None: + self._hpo_terms = list() + else: + self._hpo_terms = hpo_terms + if interpretation_list is None: + self._interpretation_list = list() + else: + self._interpretation_list = interpretation_list self._disease = disease self._pmid = pmid self._title = title @@ -245,15 +251,15 @@ def to_ga4gh_phenopacket(self, metadata, phenopacket_id=None): return php @staticmethod - def output_individuals_as_phenopackets(individual_list, metadata, pmid=None, outdir="phenopackets"): - """write a list of Individial objects to file in GA4GH Phenopacket format + def output_individuals_as_phenopackets(individual_list, metadata:MetaData, outdir="phenopackets"): + """write a list of Individual objects to file in GA4GH Phenopacket format + + This methods depends on the MetaData object having a PMID and will fail otherwise :param individual_list: List of individuals to be written to file as phenopackets :type individual_list: List[Individual] - :param metadata: GA4GH Phenopacket Schema MetaData object - :type metadata: PPKt.MetaData - :param pmid: A string such as 
PMID:3415687. Defaults to None. - :type pmid: str + :param metadata: pyphetools MetaData object + :type metadata: MetaData :param outdir: Path to output directory. Defaults to "phenopackets". Created if not exists. :type outdir: str """ @@ -262,6 +268,7 @@ def output_individuals_as_phenopackets(individual_list, metadata, pmid=None, out if not os.path.isdir(outdir): os.makedirs(outdir) written = 0 + pmid = metadata.get_pmid() for individual in individual_list: phenopckt = individual.to_ga4gh_phenopacket(metadata=metadata) json_string = MessageToJson(phenopckt) diff --git a/src/pyphetools/creation/metadata.py b/src/pyphetools/creation/metadata.py index d9e49b91..60e13849 100644 --- a/src/pyphetools/creation/metadata.py +++ b/src/pyphetools/creation/metadata.py @@ -105,11 +105,11 @@ def hpo(self, version): :type version: str """ self._resource_d["hp"] = Resource(resource_id="hp", - name="human phenotype ontology", - namespace_prefix="HP", - iriprefix="http://purl.obolibrary.org/obo/HP_", - url="http://purl.obolibrary.org/obo/hp.owl", - version=version) + name="human phenotype ontology", + namespace_prefix="HP", + iriprefix="http://purl.obolibrary.org/obo/HP_", + url="http://purl.obolibrary.org/obo/hp.owl", + version=version) def geno(self, version=default_versions.get('geno')): """_summary_ @@ -150,19 +150,19 @@ def mondo(self, version=default_versions.get('mondo')): :param version: the Mondo version """ self._resource_d["mondo"] = Resource(resource_id="mondo", - name="Mondo Disease Ontology", - namespace_prefix="MONDO", - iriprefix="http://purl.obolibrary.org/obo/MONDO_", - url="http://purl.obolibrary.org/obo/mondo.obo", - version=version) - + name="Mondo Disease Ontology", + namespace_prefix="MONDO", + iriprefix="http://purl.obolibrary.org/obo/MONDO_", + url="http://purl.obolibrary.org/obo/mondo.obo", + version=version) + def sequence_ontology(self, version=default_versions.get("so")): self._resource_d["so"] = Resource(resource_id="so", - name="Sequence types and 
features ontology", - namespace_prefix="SO", - iriprefix="http://purl.obolibrary.org/obo/SO_", - url="http://purl.obolibrary.org/obo/so.obo", - version=version) + name="Sequence types and features ontology", + namespace_prefix="SO", + iriprefix="http://purl.obolibrary.org/obo/SO_", + url="http://purl.obolibrary.org/obo/so.obo", + version=version) def set_external_reference(self, pmid, pubmed_title) -> None: """ @@ -178,7 +178,21 @@ def set_external_reference(self, pmid, pubmed_title) -> None: pm = pmid.replace("PMID:", "") self._extref.reference = f"https://pubmed.ncbi.nlm.nih.gov/{pm}" self._extref.description = pubmed_title - + + def get_pmid(self)->str: + """ + :returns: The PubMed identifier + :rtype: str: + :raises ValueError: Throw an error if no PMID is available + """ + if self._extref is not None: + if self._extref.id.startswith("PMID"): + return self._extref.id + else: + raise ValueError(f"Malformed PMID in external reference: {self._extref.id}") + else: + raise ValueError("Could not get PMID because MetaData._extref was None") + def to_ga4gh(self): """ diff --git a/src/pyphetools/validation/cohort_validator.py b/src/pyphetools/validation/cohort_validator.py index e6f2178b..6bf61b0b 100644 --- a/src/pyphetools/validation/cohort_validator.py +++ b/src/pyphetools/validation/cohort_validator.py @@ -1,16 +1,17 @@ from typing import List +from ..creation.allelic_requirement import AllelicRequirement from ..creation.individual import Individual from .validated_individual import ValidatedIndividual import hpotk class CohortValidator: - def __init__(self, cohort:List[Individual], ontology:hpotk.MinimalOntology, min_var:int, min_hpo:int, min_allele:int=None) -> None: + def __init__(self, cohort:List[Individual], ontology:hpotk.MinimalOntology, min_hpo:int, allelic_requirement:AllelicRequirement=None) -> None: self._cohort = cohort self._validated_individual_list = [] for indi in cohort: vindi = ValidatedIndividual(individual=indi) - 
vindi.validate(ontology=ontology, min_hpo=min_hpo, min_allele=min_allele, min_var=min_var) + vindi.validate(ontology=ontology, min_hpo=min_hpo, allelic_requirement=allelic_requirement) self._validated_individual_list.append(vindi) def get_validated_individual_list(self): diff --git a/src/pyphetools/validation/content_validator.py b/src/pyphetools/validation/content_validator.py index 6bb3386b..43b9e882 100644 --- a/src/pyphetools/validation/content_validator.py +++ b/src/pyphetools/validation/content_validator.py @@ -3,7 +3,8 @@ import os import phenopackets from .phenopacket_validator import PhenopacketValidator -from .validation_result import ValidationResult +from .validation_result import ValidationResult, ValidationResultBuilder +from ..creation.allelic_requirement import AllelicRequirement from ..creation.individual import Individual from typing import List, Union @@ -26,18 +27,15 @@ class ContentValidator(PhenopacketValidator): Note that this class does not test for all errors. Use phenopacket-tools to check for redundant or conflicting annotations. 
- :param min_var: minimum number of variants for this phenopacket to be considered valid - :type min_var: int :param min_hpo: minimum number of phenotypic features (HP terms) for this phenopacket to be considered valid :type min_hpo: int - :param min_allele: minimum number of alleles for this phenopacket to be considered valid - :type min_allele: int + :param allelic_requirement: used to check number of alleles and variants + :type allelic_requirement: AllelicRequirement """ - def __init__(self, min_var:int, min_hpo:int, min_allele:int=None) -> None: - self._min_var = min_var + def __init__(self, min_hpo:int, allelic_requirement:AllelicRequirement=None) -> None: self._min_hpo = min_hpo - self._min_allele = min_allele + self._allelic_requirement = allelic_requirement def validate_individual(self, individual:Individual) -> List[ValidationResult]: @@ -66,12 +64,26 @@ def validate_individual(self, individual:Individual) -> List[ValidationResult]: if n_pf < self._min_hpo: msg = f"Minimum HPO terms required {self._min_hpo} but only {n_pf} found" validation_results.append(ValidationResult.error(phenopacket_id=pp_id, message=msg)) - if n_var < self._min_var: - msg = f"Minimum variants required {self._min_var} but only {n_var} found" - validation_results.append(ValidationResult.error(phenopacket_id=pp_id, message=msg)) - if self._min_allele is not None and n_alleles < self._min_allele: - msg = f"Minimum alleles required {self._min_allele} but only {n_alleles} found" - validation_results.append(ValidationResult.error(phenopacket_id=pp_id, message=msg)) + if self._allelic_requirement is None: + return validation_results + if self._allelic_requirement == AllelicRequirement.MONO_ALLELIC: + if n_var != 1: + msg = f"Expected one variant for monoallelic but got {n_var} variants" + val_result = ValidationResultBuilder(phenopacket_id=pp_id).error().incorrect_variant_count().set_message(msg=msg).build() + validation_results.append(val_result) + if n_alleles != 1: + msg = f"Expected 
one allele for monoallelic but got {n_alleles} alleles" + val_result = ValidationResultBuilder(phenopacket_id=pp_id).error().incorrect_allele_count().set_message(msg=msg).build() + validation_results.append(val_result) + elif self._allelic_requirement == AllelicRequirement.BI_ALLELIC: + if n_var < 1 or n_var > 2: + msg = f"Expected one or two variant for biallelic but got {n_var} variants" + val_result = ValidationResultBuilder(phenopacket_id=pp_id).error().incorrect_variant_count().set_message(msg=msg).build() + validation_results.append(val_result) + if n_alleles != 2: + msg = f"Expected two alleles for biallelic but got {n_alleles} alleles" + val_result = ValidationResultBuilder(phenopacket_id=pp_id).error().incorrect_allele_count().set_message(msg=msg).build() + validation_results.append(val_result) return validation_results def validate_phenopacket(self, phenopacket) -> List[ValidationResult]: diff --git a/src/pyphetools/validation/validated_individual.py b/src/pyphetools/validation/validated_individual.py index d1d8ceeb..34569a42 100644 --- a/src/pyphetools/validation/validated_individual.py +++ b/src/pyphetools/validation/validated_individual.py @@ -1,4 +1,4 @@ - +from ..creation.allelic_requirement import AllelicRequirement from ..creation.individual import Individual from .content_validator import ContentValidator from typing import List @@ -13,19 +13,17 @@ def __init__(self, individual:Individual) -> None: self._clean_terms = [] self._validation_errors = [] - def validate(self, ontology:hpotk.MinimalOntology, min_var:int, min_hpo:int, min_allele:int=None) -> None: + def validate(self, ontology:hpotk.MinimalOntology, min_hpo:int, allelic_requirement:AllelicRequirement=None) -> None: """validate an Individual object for errors in the Ontology or the minimum number of HPO terms/alleles/variants :param ontology: HPO object :type ontology: hpotk.MinimalOntology - :param min_var: minimum number of variants for this phenopacket to be considered valid - :type 
min_var: int :param min_hpo: minimum number of phenotypic features (HP terms) for this phenopacket to be considered valid :type min_hpo: int - :param min_allele: minimum number of alleles for this phenopacket to be considered valid - :type min_allele: int + :param allelic_requirement: used to check number of alleles and variants + :type allelic_requirement: AllelicRequirement """ - cvalidator = ContentValidator(min_hpo=min_hpo, min_allele=min_allele, min_var=min_var) + cvalidator = ContentValidator(min_hpo=min_hpo, allelic_requirement=allelic_requirement) validation_results = cvalidator.validate_individual(individual=self._individual) self._validation_errors.extend(validation_results) qc = OntologyQC(individual=self._individual, ontology=ontology) diff --git a/src/pyphetools/validation/validation_result.py b/src/pyphetools/validation/validation_result.py index eb55462c..c1707eca 100644 --- a/src/pyphetools/validation/validation_result.py +++ b/src/pyphetools/validation/validation_result.py @@ -20,8 +20,8 @@ class Category(Enum): REDUNDANT = 1 CONFLICT = 2 INSUFFICIENT_HPOS = 3 - INSUFFICIENT_ALLELES = 4 - INSUFFICIENT_VARIANTS = 5 + INCORRECT_ALLELE_COUNT = 4 + INCORRECT_VARIANT_COUNT = 5 MALFORMED_ID = 6 MALFORMED_LABEL = 7 UNKNOWN = 8 @@ -126,12 +126,12 @@ def insufficient_hpos(self): self._category = Category.INSUFFICIENT_HPOS return self - def insufficient_alleles(self): - self._category = Category.INSUFFICIENT_ALLELES + def incorrect_allele_count(self): + self._category = Category.INCORRECT_ALLELE_COUNT return self - def insufficient_variants(self): - self._category = Category.INSUFFICIENT_VARIANTS + def incorrect_variant_count(self): + self._category = Category.INCORRECT_VARIANT_COUNT return self def set_message(self, msg): diff --git a/src/pyphetools/visualization/html_table_generator.py b/src/pyphetools/visualization/html_table_generator.py new file mode 100644 index 00000000..2717a01b --- /dev/null +++ b/src/pyphetools/visualization/html_table_generator.py 
@@ -0,0 +1,36 @@ +from typing import List + + + +class HtmlTableGenerator: + """ + Helper class to generator an HTML table. This class is not intended to be used by client code. + + """ + + def __init__(self, caption, header_items:List[str], rows:List[List[str]]) -> None: + self._html_rows = [] + self._n_columns = len(header_items) + self._html_rows.append('