From b54c5d1dacc691d8dc1f8e69c1f9c8d31f44c5c5 Mon Sep 17 00:00:00 2001 From: Peter Robinson Date: Fri, 10 Jan 2025 06:39:52 +0100 Subject: [PATCH] Updating documentation --- docs/api/creation/age_column_mapper.md | 4 - docs/api/creation/column_mapper.md | 3 - docs/api/creation/constant_column_mapper.md | 4 - docs/api/creation/disease_id_column_mapper.md | 4 - docs/api/creation/option_column_mapper.md | 3 - docs/api/creation/sex_column_mapper.md | 3 - docs/api/creation/simple_column_mapper.md | 3 - .../simple_column_mapper_generator.md | 3 - docs/index.md | 1 - docs/tabular/choosing_column_mapper.md | 18 -- docs/tabular/cohort_encoder.md | 38 --- docs/tabular/constant_column_mapper.md | 30 --- docs/tabular/jupyter.md | 222 ------------------ docs/tabular/option_column_mapper.md | 93 -------- docs/tabular/overview.md | 34 --- docs/tabular/simple_column_mapper.md | 57 ----- docs/tabular/threshold_column_mapper.md | 42 ---- docs/tabular/validation.md | 75 ------ docs/tabular/variant_column_mapper.md | 91 ------- docs/tabular/visualization.md | 5 - mkdocs.yml | 24 +- 21 files changed, 2 insertions(+), 755 deletions(-) delete mode 100644 docs/api/creation/age_column_mapper.md delete mode 100644 docs/api/creation/column_mapper.md delete mode 100644 docs/api/creation/constant_column_mapper.md delete mode 100644 docs/api/creation/disease_id_column_mapper.md delete mode 100644 docs/api/creation/option_column_mapper.md delete mode 100644 docs/api/creation/sex_column_mapper.md delete mode 100644 docs/api/creation/simple_column_mapper.md delete mode 100644 docs/api/creation/simple_column_mapper_generator.md delete mode 100644 docs/tabular/choosing_column_mapper.md delete mode 100644 docs/tabular/cohort_encoder.md delete mode 100644 docs/tabular/constant_column_mapper.md delete mode 100644 docs/tabular/jupyter.md delete mode 100644 docs/tabular/option_column_mapper.md delete mode 100644 docs/tabular/overview.md delete mode 100644 docs/tabular/simple_column_mapper.md delete mode 
100644 docs/tabular/threshold_column_mapper.md delete mode 100644 docs/tabular/validation.md delete mode 100644 docs/tabular/variant_column_mapper.md delete mode 100644 docs/tabular/visualization.md diff --git a/docs/api/creation/age_column_mapper.md b/docs/api/creation/age_column_mapper.md deleted file mode 100644 index 848d563a..00000000 --- a/docs/api/creation/age_column_mapper.md +++ /dev/null @@ -1,4 +0,0 @@ -# AgeColumnMapper - - -::: pyphetools.creation.AgeColumnMapper \ No newline at end of file diff --git a/docs/api/creation/column_mapper.md b/docs/api/creation/column_mapper.md deleted file mode 100644 index 37a18183..00000000 --- a/docs/api/creation/column_mapper.md +++ /dev/null @@ -1,3 +0,0 @@ -# ColumnMapper - -::: pyphetools.creation.ColumnMapper \ No newline at end of file diff --git a/docs/api/creation/constant_column_mapper.md b/docs/api/creation/constant_column_mapper.md deleted file mode 100644 index e2b3e95c..00000000 --- a/docs/api/creation/constant_column_mapper.md +++ /dev/null @@ -1,4 +0,0 @@ -# ConstantColumnMapper - -::: pyphetools.creation.ConstantColumnMapper - diff --git a/docs/api/creation/disease_id_column_mapper.md b/docs/api/creation/disease_id_column_mapper.md deleted file mode 100644 index 42f2e811..00000000 --- a/docs/api/creation/disease_id_column_mapper.md +++ /dev/null @@ -1,4 +0,0 @@ -# DiseaseIdColumnMapper - -::: pyphetools.creation.DiseaseIdColumnMapper - diff --git a/docs/api/creation/option_column_mapper.md b/docs/api/creation/option_column_mapper.md deleted file mode 100644 index 6cbfd24d..00000000 --- a/docs/api/creation/option_column_mapper.md +++ /dev/null @@ -1,3 +0,0 @@ -# OptionColumnMapper - -::: pyphetools.creation.OptionColumnMapper diff --git a/docs/api/creation/sex_column_mapper.md b/docs/api/creation/sex_column_mapper.md deleted file mode 100644 index a1384c29..00000000 --- a/docs/api/creation/sex_column_mapper.md +++ /dev/null @@ -1,3 +0,0 @@ -# SexColumnMapper - -::: pyphetools.creation.SexColumnMapper 
diff --git a/docs/api/creation/simple_column_mapper.md b/docs/api/creation/simple_column_mapper.md deleted file mode 100644 index 3e56076b..00000000 --- a/docs/api/creation/simple_column_mapper.md +++ /dev/null @@ -1,3 +0,0 @@ -# SimpleColumnMapper - -::: pyphetools.creation.SimpleColumnMapper diff --git a/docs/api/creation/simple_column_mapper_generator.md b/docs/api/creation/simple_column_mapper_generator.md deleted file mode 100644 index 6ec49c62..00000000 --- a/docs/api/creation/simple_column_mapper_generator.md +++ /dev/null @@ -1,3 +0,0 @@ -# SimpleColumnMapperGenerator - -::: pyphetools.creation.SimpleColumnMapperGenerator \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index ea3f4d15..fe0009a4 100644 --- a/docs/index.md +++ b/docs/index.md @@ -10,7 +10,6 @@ from tabular data such as databases or supplemental files found in the medical l This documentation contains information about - How to use the [Excel template](user-guide/excel.md) to code clinical data -- How to use [pyphetools classes](tabular/jupyter.md) to convert tabular data (e.g., supplemental tables) to phenopackets - Information for [developers](developers/developers.md) - A description of the pyphetools [API](api/overview.md) diff --git a/docs/tabular/choosing_column_mapper.md b/docs/tabular/choosing_column_mapper.md deleted file mode 100644 index 7f7f72ec..00000000 --- a/docs/tabular/choosing_column_mapper.md +++ /dev/null @@ -1,18 +0,0 @@ -# Choosing a column mapper - -pyphetools defines several ColumnMapper's that map the contents of a specific column to HPO annotations. -In addition, there are AgeMapper, SexMapper, and VariantMapper classes that help to ingest this type of data but are not covered here. - -To choose a ColumnMapper for a specific column, see the following table. Detailed documentation about each mapper is linked. 
- - -| Column Mapper | Scope | -|:------------------------|:--------------------------------------------------------------| -| [Simple column mapper](simple_column_mapper.md) | A column describes a single abnormality (HPO term) and contains symbols that represent `observed`, `excluded`, and `not measured/not available` | -| [Constant column mapper](constant_column_mapper.md) | A column for which all individuals (i.e., all rows) have a given HPO term (or the exclusion of the term) | -| [Option column mapper](option_column_mapper.md) | A column that contains several phentypic abnormalities, usually all in one organ system. | -| [Threshold column mapper](threshold_column_mapper.md) | A column that contains a number that implies an abnormality (HPO term) if the number is above (or below) a given threshold. | - - - -It is advisable to review several notebooks in the [Phenopacket-Store](https://github.com/monarch-initiative/phenopacket-store){:target="\_blank"} project to get a feeling for how and when to use the various mappers. \ No newline at end of file diff --git a/docs/tabular/cohort_encoder.md b/docs/tabular/cohort_encoder.md deleted file mode 100644 index 29c24428..00000000 --- a/docs/tabular/cohort_encoder.md +++ /dev/null @@ -1,38 +0,0 @@ -# CohortEncoder - -This class coordinates the extract-transform-load (ETL) operations for a cohort, usually taken from -a table in a publication or supplemental file. It is intended to be used with the ColumnMappers to map -each relevant column of the table. 
- - - - -```python title="CohortEncoder constructor" -pmid = "PMID:30612693" -encoder = CohortEncoder(df=dft, hpo_cr=hpo_cr, column_mapper_d=column_mapper_d, - individual_column_name="patient_id", - agemapper=ageMapper, - sexmapper=sexMapper, - metadata=metadata, - variant_mapper=varMapper, - pmid=pmid) -disease_id = "OMIM:618443" -disease_label = "Neurodevelopmental disorder with or without variable brain abnormalities" -encoder.set_disease(Disease(disease_id=disease_id, disease_label=disease_label)) -``` - -### Display a phenopacket - -Optionally, it can be useful to displayand manually assess a phenopackets - -```python title="Display a phenopacket" -individuals = encoder.get_individuals() -i1 = individuals[0] -phenopacket1 = i1.to_ga4gh_phenopacket(metadata=metadata.to_ga4gh()) -json_string = MessageToJson(phenopacket1) -print(json_string) -``` - - - - diff --git a/docs/tabular/constant_column_mapper.md b/docs/tabular/constant_column_mapper.md deleted file mode 100644 index 652fa7e1..00000000 --- a/docs/tabular/constant_column_mapper.md +++ /dev/null @@ -1,30 +0,0 @@ -# ConstantColumnMapper - - -Column mapper for cases where all individuals in a cohort have a certain phenotypic abnormality. If the excluded argument is set to True, -then the abnormality was excluded in all individuals. This mapper is a shortcut that appears to be useful for some supplemental files. - - - -```python title="ConstantColumnMapper constructor" -hp_id = "HP:0031956" -hp_label = "Elevated circulating aspartate aminotransferase concentration" -mapper = ConstantColumnMapper(hpo_id=hp_id, hpo_label=hp_label) -hp_term_list = mapper.map_cell("name of column") -``` - - -Replace "name of column" with the actual column name in the input table. - -In this case, **hp_term_list** contains a single term corresponding to the indicated HP term (regardless of the contents of the cell). If this -mapper is used with the CohortMapper, then all individuals in the cohort will be annotated to the term. 
- -The following code is used if the phenotypic abnormality was explicitly excluded in all individuals in the cohort. - - -```python title="ConstantColumnMapper constructor" -hp_id = "HP:0031956" -hp_label = "Elevated circulating aspartate aminotransferase concentration" -mapper = ConstantColumnMapper(hpo_id=hp_id, hpo_label=hp_label, excluded=True) -hp_term_list = mapper.map_cell("name of column) -``` diff --git a/docs/tabular/jupyter.md b/docs/tabular/jupyter.md deleted file mode 100644 index 86c50aa8..00000000 --- a/docs/tabular/jupyter.md +++ /dev/null @@ -1,222 +0,0 @@ -# Using pyphetools in a Jupyter notebook - - -This page provides an overview of how to structure a Jupyter notebook to import tabular data. We recommend importing data for one disease at a time. - -### Importing necessary packages - - -Most notebooks will want to first import all necessary packages. It is helpful to print out the version -used (see the last two lines) in case of errors or feature requests. Sometimes, additional packages need -to be imported to support special cases. - - -```python title="Imports" -import pandas as pd -pd.set_option('display.max_colwidth', None) # show entire column contents, important! -from IPython.display import HTML, display -from pyphetools.creation import * -from pyphetools.validation import * -from pyphetools.visualization import * -import pyphetools -print(f"pyphetools version {pyphetools.__version__}") -``` - -### Import the Human Phenotype Ontology (HPO) file - -It is useful to import the HPO file and create the `MetaData` object -(which records your `ORCID `_ id and the version of the HPO used) in one step. - -The following code first creates a `Citation` object with data about the PubMed identifier and title of the paper -we are curating. -It then imports the HPO (which in this example has been previously downloaded and stored at the file location `../hp.json`). 
Note -that the hp.json file can be downloaded from many places including the [HPO Homepage](https://hpo.jax.org/app/){:target="_blank"}. -We recommend always using the latest version. The code then initializes the HPO concept recognizer (i.e., text-mining) object, the ontology object, -and the MetaData object (see the -[GA4GH phenopackets documentation](https://phenopacket-schema.readthedocs.io/en/latest/){:target="\_blank"} for more details on MetaData). - -```python title="Configure MetaData" -PMID = "PMID:36189931" -title = "Comprehensive genetic screening for vascular Ehlers-Danlos syndrome through an amplification-based next-generation sequencing system" -cite = Citation(pmid=PMID, title=title) -parser = HpoParser(hpo_json_file="../hp.json") -hpo_cr = parser.get_hpo_concept_recognizer() -hpo_version = parser.get_version() -hpo_ontology = parser.get_ontology() -metadata = MetaData(created_by="ORCID:0000-0002-5648-2155", citation=cite) -metadata.default_versions_with_hpo(version=hpo_version) -print(f"HPO version {hpo_version}") -``` - -Note that if you leave the argument of HpoParser empty, the class will download the latest version of HPO automatically. Depending on the settings -of your system, this may lead to an SSL certificate error, which can be addressed by adding the following two lines to the top of the cell -``` -import ssl -ssl._create_default_https_context = ssl._create_unverified_context -``` - - - -### Importing the data - - -In general, we have taken the input data from Excel files or from CSV (command-separated value) files or TSV (tab-separated value files). Excel files can be imported using the following pandas command. - -```python title="Reading an Excel input file" -df = pd.read_excel('some/path/my_supplement.xlsx') -``` - -Standard pandas functions are available to read CSV and TSV files. We refer to the [pandas documentation](https://pandas.pydata.org/) for more details. 
- - -### Inspecting the input data - - -Users will need to carefully inspect the input table (e.g. a Supplemental file) and determine which columns or rows contain the individual id, age, and sex, the variants, and clinical information that can be encoded using HPO terms. -We recommand inspecting the first several rows using - -```python title="Inspecting the data" -df.head() -``` - -It is also useful to look at the column names. - -```python title="Inspecting the column names" -df.columns -``` - - - -## Clinical columns -pyphetools expects to get a dictionary whose keys correspond to the column names used by the pandas DataFrame, -and the values are the corresponding ColumnMapper objects. pyphetools offers different types of ColumnMapper objects, whose goal is to -encode the id, age, sex, variants, and clinical information encoded by HPO terms. We first create a dictionary whose keys should be the -names (strings) of the columns of the table and whose values are the corresponding ColumnMapper objects that we need to create for each column we -want to map. Note that it is not necessary to map each column of a table. - - -Data with clinical columns in typical supplemental files often have one of several formats. - - -1. Simple. The column header is a string such as 'ID' that corresponds to an HPO term, for instance [Intellectual disability HP:0001249](https://hpo.jax.org/app/browse/term/HP:0001249){:target="\_blank"}, whereby each cell has a symbol such as -'Y', 'y', '+', 'yes', ''n', '-', etc. to indicate whether the feature was present in the individual specified by the row. See :ref:`simple_column_mapper` for more information about how to work with this kind of column. -2. Options. Some columns contain several strings, each of which corresponds to a specific HPO term. For instance, a columns such as 'severity of ID' with entries such as `mild`, `moderate`, `severe` would correspond to HPO terms for -`Intellectual disability, mild HP:0001256 `_, etc. 
See :ref:`option_column_mapper` for more information about how to work with this kind of column. -3. Constant. This mapper can be used if all individuals diusplay the same feature. See :ref:`simple_column_mapper`. -4. Threshold. This can be used for columns that have numerical data whereby being above or below a certain threshold is abnormal. See :ref:`threshold_column_mapper`. - - -## Row-based vs column-based - -pyphetools expects the rows to represent individuals. In some cases, input files represent individuals in columns. In this case, it is necessary to transpose the table before working with pyphetools. - - -### Converting to row-based format - -To use pyphetools, we need to have the individuals represented as rows (one row per individual) and have the items of interest be encoded as column names. -The required transformations for doing this may be different for different input data, but often we will want to transpose the table (using the pandas transpose function) -and set the column names of the new table to the zero-th row. After this, we drop the zero-th row (otherwise, it will be interpreted as an individual by the pyphetools code). - - - -Here is an example. Other examples can be found in several of the notebooks in phenopacket-store. - -```python title="Transforming from column-based to row-based format" -dft = df.transpose() -dft.columns = dft.iloc[0] -dft.drop(dft.index[0], inplace=True) -dft.head() -``` - - -Another thing to look out for is whether the individuals (usually the first column) are regarded as the index of the table or as the first normal column. -If this is the case, it is easiest to create a new column with the contents of the index -- this will work with the pyphetools software. -An example follows -- we can now use 'patient_id' as the column name. It is easier to work with this than with the index column. 
- - - -```python title="creating column with patient identifiers" -dft.index # first check the index -dft['patient_id'] = dft.index # Set the new column 'patient_id' to be identical to the contents of the index -dft.head() # check the transposed table -``` - -After this step is completed, the remaining steps to create phenopackets are the same as in the row-based notebook. - - -## Mapping the data - -Unfortunately, tabular data as it is currently available is so hetgerogeneous, that it is difficult to provide a simple step-for-step recipe for -how to use pyphetools to encode it. The basic steps are to choose a column mapper type for each of the phenotype columns in the table, and to use -age, sex, and variant column mappers for these types. - -- HPO (phenotype) [column mapper types](choosing_column_mapper.md) -- Variants: [variant column mapper](variant_column_mapper.md) -- Age of onset and age at last examination: TODO -- Sex column mapper: TODO - -We recommend studying available notebooks in the [phenopacket-store](https://github.com/monarch-initiative/phenopacket-store){:target:"_blank"} to get an idea of how to combine the column mappers for several examples. - -## Cohort encoder - -The [CohortEncoder](../api/creation/cohort_encoder.md) class was designed to work with the above column mappers. It can be setup as follows. - -```python title="Setting up the cohort mapper" -encoder = CohortEncoder(df=df, - hpo_cr=hpo_cr, - column_mapper_list=column_mapper_list, - individual_column_name="individual_id", - age_of_onset_mapper=onsetMapper, - age_at_last_encounter_mapper=lastEncounterMapper, - sexmapper=sexMapper, - variant_mapper=varMapper, - metadata=metadata) -``` -Note that the ``age_of_onset_mapper`` and ``age_at_last_encounter_mapper`` arguments can be omitted if this information is not available. - -## Specify the disease - -pyphetools requires a disease identifier and label, as follows. 
- -```python title="Specify the disease" -vEDS = Disease(disease_id='OMIM:130050', disease_label='Ehlers-Danlos syndrome, vascular type') -encoder.set_disease(vEDS) -``` - -## Validation - -Now we can retrieve the Individual objects and do Q/C - - -```python title="pyphetools validation" -individuals = encoder.get_individuals() -cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1, allelic_requirement=AllelicRequirement.MONO_ALLELIC) -qc = QcVisualizer(cohort_validator=cvalidator) -display(HTML(qc.to_summary_html())) -``` - -This step will show warnings that can generally be ignored (e.g., the redundant terms were removed). If there are serious errors, a message will be shown, and the user will need to fix the errors before going on. - -## Summaries of the phenopackets - -The following commands display a table with a summary of each phenopacket created. -```python title="List of phenopackets created in the current notebook" -individuals = cvalidator.get_error_free_individual_list() -table = PhenopacketTable(individual_list=individuals, metadata=metadata) -display(HTML(table.to_html())) -``` -## Saving phenopackets to file -The following command writes each phenopacket as a JSON file to the directory ``phenopackets``(other directory names can be chosen). - -```python title="output" -Individual.output_individuals_as_phenopackets(individual_list=individuals, - metadata=metadata) -``` - - - -## HPOA files -To create the HPOA files used to create the phenotype.hpoa by the HPO team, adapt the following code. Note that this code is -slightly different to the code used with the Excel template to build HPOA files. - -Please see [HPOA files](../developers/hpoa_editing.md). 
\ No newline at end of file diff --git a/docs/tabular/option_column_mapper.md b/docs/tabular/option_column_mapper.md deleted file mode 100644 index 2d1329f4..00000000 --- a/docs/tabular/option_column_mapper.md +++ /dev/null @@ -1,93 +0,0 @@ -# OptionColumnMapper - - -Mapper to be used if the column has a set of defined items. These items are defined with a map that relates the text used in the -table to the HPO label. If the original HPO label is used in the table, it does not need to be specified in the map. - -```python title="OptionColumnMapper constructor" -other_d = { - "HP": "High palate", - "D": "Dolichocephaly", - "En": "Deeply set eye", # i.e., Enophthalmus - "DE": "Dural ectasia", - "St": "Striae distensae" -} -otherMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=other_d) -``` - -This mapper will recognize ``HP`` as well as ``HP,D,St``. The mapper interprets -``,``, ``;``, ``|``, and ``/`` as delimiters. Note that the value of the dictionary -can be either a two-element array as shown above or a simple string that must be the -label of the HPO term. - -If text-mining is required, use the :ref:`custom_column_mapper`. - - -### Excluded items - -If some of the items should be mapped to an excluded HPO term, then the `exluded_d` aregument is used analogously. For instance, the following all refer to normal findings. - -```python title="OptionColumnMapper - excluded findings" - urine_not_xa_d = {'0.04mmol/L': "Xanthinuria", - "1.6umol/mmolCr": "Xanthinuria", - "0.0214XA/Cr": "Xanthinuria", - "normal": "Xanthinuria"} -``` - -### Assume excluded - -By default, the OptionColumnMapper will assume that items that are not mentioned in a table cell were not measured. -In some cases, we know that the items have been excluded if they are not listed in the cell (because the article says so or because of contextual knowledge). -In this case, we can set the argeument `assumeExcluded` to True. 
- - -```python title="OptionColumnMapper constructor" -other_d = { - "HP": "High palate", - "D": "Dolichocephaly", - "En": "Deeply set eye", # i.e., Enophthalmus - "DE": "Dural ectasia", - "St": "Striae distensae" -} -otherMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=other_d, assumeExcluded=True) -otherMapper.map_cell("HP") -``` - -In this example, the mapper would map "HP" to High palate, but would also map it to excluded Dolicocephaly (all four terms except for High palate would be excluded). - - -## Shortcut to creating option mapper objects - - -It is possible to create the dictionaries used by the OptionColumnMapper by hand. -However, the following command will generate a code-template from which users -can copy and adapt code for relevant columns. - -```python title="OptionColumnMapper shortcut" -dft = ... # Pandas DataFrame with columns representing clinical data -output = OptionColumnMapper.autoformat(df=dft, concept_recognizer=hpo_cr) -print(output) - -post_fossa_d = {'Mega cisterna magna': 'Enlarged cisterna magna', - 'Normal': 'PLACEHOLDER', - 'Mega cistema magna': 'PLACEHOLDER'} -post_fossaMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=post_fossa_d) -post_fossaMapper.preview_column(df['Post fossa'])) -column_mapper_d['Post fossa'] = post_fossaMapper - -pituitary_d = {'Normal': 'PLACEHOLDER'} -pituitaryMapper = OptionColumnMapper(concept_recognizer=hpo_cr, option_d=pituitary_d) -pituitaryMapper.preview_column(df['Pituitary'])) -column_mapper_d['Pituitary'] = pituitaryMapper -(...) -``` - -For instance, in the above example, there is a column called `Post fossa` in the DataFrame dft. The cell contents -for the rows of the column contained several strings that we might want to map. `Enlarged cisterna magna` was -recognized as the label of the HPO term -[Enlarged cisterna magna (HP:0002280)](https://hpo.jax.org/app/browse/term/HP:0002280). 
-We would remove the label 'Normal' (and possible code it as excluded using other commands). The -string `Mega cistema magna` is clearly a spelling error in the original data, and so we can map it -to the string `Enlarged cisterna magna` (replace the PLACEHOLDER) so that the string will also be mapped to the HPO term. -The next column, `Pituitary`, just shows normal, and this would not be appropriate for the OptionColumnMapper, but users -might want to use the :ref:`simple_column_mapper` to encoded that Abnormalities of the pituitary were excluded. \ No newline at end of file diff --git a/docs/tabular/overview.md b/docs/tabular/overview.md deleted file mode 100644 index 62721bd6..00000000 --- a/docs/tabular/overview.md +++ /dev/null @@ -1,34 +0,0 @@ -# Encoding tabular data with pyphetools scripts - -This option is intended for people who are comfortable using Python scripting, and is designed to import tabular data such as is commonly found in the supplemental files of medical publications about cohorts of individuals diagnosed with a certain disease. See also the instructions for using an [Excel template](template.md) for entering data with a minimum of scripting. - - -The best way to get a feeling for how to work with pyphetools is to examine the various notenooks in the -[phenopacket-store](https://github.com/monarch-initiative/phenopacket-store){:target="\_blank"} repository. - -This tutorial provides some general tips for how to use the library. The library is intended to be used in a Jupyter notebook environment so that users can check intermediate results. -There are many ways of setting this up, but here is one that we often use. - - -The typical use case for using pyphetools in this way is to ingest complicated tables that would be too difficult or unweildly to import using the Excel template. - - -## Setting up the Jupyter environment - -We recommend developing scripts using a Jupyter notebook so that parsing results can be checked. 
- -There are many ways of setting up Jupyter, all of which should work with pyphetools. We use the following approach. - -```bash title="installing jupyter and running pyphetools in a notebook" -python3 -m venv your_env -source your_env/bin/activiate -pip install --upgrade pip -pip install pyphetools -pip install jupyter ipykernel -python3 -m ipykernel install --name your_env --user -jupyter-notebook -``` - -The virtual environment (here *your_env*) can be named as desired. The last line opens a Jupyter Notebook page; -create a new Notebook and choose the kernel called *your_env* (or whatever you called it). - diff --git a/docs/tabular/simple_column_mapper.md b/docs/tabular/simple_column_mapper.md deleted file mode 100644 index 37f72aec..00000000 --- a/docs/tabular/simple_column_mapper.md +++ /dev/null @@ -1,57 +0,0 @@ -# SimpleColumnMapper - - -ColumnMapper for columns that contain information about a single phenotypic abnormality only. -This kind of ColumnMapper should be used for columns that can be representing by one HPO term -and which can contain symbols such as "+", "Y", or "yes" indicating that the abnormality was -observed, symbols such as "-", "N", or "no" indicating that the abnormality was explicity excluded, -and (optionally) symbols indicating that the abnormality was not measured or assessed. - -For instance, the following mapper would generate a phenotypic feature for -[Global developmental delay](https://hpo.jax.org/app/browse/term/HP:0001263){:target="\_blank"} if the -column contains "yes" (observed) and would call it as excluded if the column contains "no". -For any other text or for an empty cell, the feature would be called as not measured (and would -not be included in the phenopacket). 
- - -```python title="SimpleColumnMapper constructor" -from pyphetools.creation import SimpleColumnMapper - -ddMapper = SimpleColumnMapper(hpo_id='HP:0001263', - hpo_label='Global developmental delay', - observed='yes', - excluded='no') -``` - - - -It can be convenient to add multiple SimpleColumnMappers at the same time. The following function enables this. Note that the `HpoParser` object from the pyphetools.creation package is used to create a concept recognizer object. The `column_d` dictionary is used to store the individual mappers, and will be passed later on to a `CohortMapper` object. - -```python title="Creating multiple SimpleColumnMapper objects at once" -import hpotk - -onto_store = hpotk.configure_ontology_store() -hpo = onto_store.load_hpo() - -from pyphetools.creation import HpoExactConceptRecognizer - -hpo_cr = HpoExactConceptRecognizer.from_hpo(hpo) - -## -column_mapper_d = {} -items = { - 'regression': ["Developmental regression", "HP:0002376"], - 'autism': ['Autism', 'HP:0000717'], - 'hypotonia': ['Hypotonia', 'HP:0001252'], - 'movement disorder': ['Abnormality of movement', 'HP:0100022'], - 'CVI': ['Cerebral visual impairment', 'HP:0100704'], - 'seizures': ['Seizure', 'HP:0001250'], - 'DD': ['Global developmental delay', 'HP:0001263'] -} -item_column_mapper_d = hpo_cr.initialize_simple_column_maps(column_name_to_hpo_label_map=items, - observed='yes', - excluded='no') -# Transfer to column_mapper_d -for k, v in item_column_mapper_d.items(): - column_mapper_d[k] = v -``` diff --git a/docs/tabular/threshold_column_mapper.md b/docs/tabular/threshold_column_mapper.md deleted file mode 100644 index 2eb4a74e..00000000 --- a/docs/tabular/threshold_column_mapper.md +++ /dev/null @@ -1,42 +0,0 @@ -# ThresholdColumnMapper - - - -This mapper can be used for columns that have numerical data whereby being above or below a certain threshold is abnormal and -can be represented by an HPO term. 
For instance, if "Age at head control (months)" is over 4 months, we would call Persistent head lag HP:0032988. - - - -```python title="ThresholdedColumnMapper constructor" -headLagMapper = ThresholdedColumnMapper(hpo_id="HP:0032988", - hpo_label="Persistent head lag", - threshold=4, - call_if_above=True) -headLagMapper.preview_column(dft["Age at head control (months)"]) -``` - -This might lead to a result such as the following, in which only the individual in the fourth row required more than 4 months to display head control. - - - -| term | status | -| :---------------------------------|----------| -| Persistent head lag (HP:0032988) | excluded | -| Persistent head lag (HP:0032988) | excluded | -| Persistent head lag (HP:0032988) | excluded | -| Persistent head lag (HP:0032988) | observed | -| Persistent head lag (HP:0032988) | excluded | - - - -### ThresholdedColumnMapper - special cases - -In some cases, phrases such as 'not attained' are used to denote that a child has not attained a certain milestone at the time of last examination and this this constitutes an abnormal finding. In this case, the optional argument ''observed_code'' should be used. - -```python title="ThresholdedColumnMapper special cases" -delayedSittingMapper = ThresholdedColumnMapper(hpo_id="HP:0025336", - hpo_label="Delayed ability to sit", - threshold=9, - call_if_above=True, - observed_code='Not acquired') -``` diff --git a/docs/tabular/validation.md b/docs/tabular/validation.md deleted file mode 100644 index f2cf4fdb..00000000 --- a/docs/tabular/validation.md +++ /dev/null @@ -1,75 +0,0 @@ -# Validation - - -There are many types of errors that can occur in phenopackets. The Java application [phenopacket-tools](https://github.com/phenopackets/phenopacket-tools) is a general purpose app for validating and working with phenopackets. 
pyphetools provides a limited number of commands to check validity of the generated phenopackets that can be conveniently used as a part of notebooks that create phenopackets. - -Commonly encountered errors include redundancy and inheritance conflicts. - -### Mistaken HPO identifiers or labels - -Sometimes a phenopacket may contain an obsolete HPO id or a spelling error in the label. - -### Redundant terms - -If an individual is found to have [Nuclear cataract (HP:0100018)](https://hpo.jax.org/app/browse/term/HP:0100018){:target="\_blank"}, which means an opacity that develops in the nucleus of the lens of the eye, then the individual can always be said to have a [Cataract (HP:0000518)](https://hpo.jax.org/app/browse/term/HP:0000518){:target="\_blank"}, which refers to an opacity anywhere in the lens of the eye. This is because of the so-called true-path rule of ontologies, according to which if an HPO term is used to annotate an individual, then the parent of that term and all of the ancestors of that term must also apply. In this case, Cataract is a grand-parent of Nuclear cataract. - -Because of this, if we have annotated with [Nuclear cataract (HP:0100018)](https://hpo.jax.org/app/browse/term/HP:0100018){:target="\_blank"}, it is not necessary to annotate with [Cataract (HP:0000518)](https://hpo.jax.org/app/browse/term/HP:0000518){:target="\_blank"}, because it is implicitly true. - -We therefore recommend that only the most specific HPO term be used for a time point. - -### Conflicting terms - -In some datasets we have seen, a patient is annotated with a specific term in an organ, but the data also indicate that abnormalities have been excluded at a higher level. For instance, we might see [Ventricular septal hypertrophy (HP:0005144)](https://hpo.jax.org/app/browse/term/HP:0005144) but also excluded [Abnormal heart morphology (HP:0001627)](https://hpo.jax.org/app/browse/term/HP:0001627). - - -## QC with pyphetools.
-We recommend checking all generated phenopackets with the following steps. First obtain the list of [Individual](../api/creation/individual.md){:target="_blank"} objects. -Pass this list together with a reference to the HPO to a [CohortValidator](../api/validation/content_validator.md){:target="_blank"} object. -To display the results of validation, use a [QcVisualizer](../api/visualization/qc_visualizer.md){:target="_blank"}. - -The QcVisualizer can show either a list of all issues with the *to_html* method or a summary of issues with the *to_summary_html* method. - -```python title="Generating GA4GH phenopackets from a pyphetools individual list" -individuals = encoder.get_individuals() -cvalidator = CohortValidator(cohort=individuals, ontology=hpo_ontology, min_hpo=1) -qc = QcVisualizer(ontology=hpo_ontology, cohort_validator=cvalidator) -display(HTML(qc.to_html())) -# alternatively: display(HTML(qc.to_summary_html())) -``` - -This will either display a message that no errors were found, or will show a table as in the following figure. - -
-![Validation results](../img/validation_results.png){ width="1000" } -
Validation Results. -
-
- -There are some kinds of error that need to be corrected in the notebook, such as malformed HPO ids or labels. Others can be corrected automatically, such as -redundant terms. - -| Category | Explanation | Autocorrect? | -|:---------|:------------|:-------------| -| REDUNDANT| HPO term and ancestor term both reported as observed | Yes (redundant ancestor term removed) | -|CONFLICT| HPO term observed and ancestor term excluded |Yes (conflicting ancestor term removed) | -|INSUFFICIENT_HPOS | Individual does not have at least threshold number of HPOs | Yes, individual removed from cohort | -| INCORRECT_ALLELE_COUNT | number of alleles unexpected given mode of inheritance | No | -| INCORRECT_VARIANT_COUNT | number of variants unexpected given mode of inheritance | No | -| MALFORMED_ID | HPO id (e.g., HP:0001234) obsolete or incorrect | No | -| MALFORMED_LABEL | Label obsolete or incorrect | No | -| NOT_MEASURED | HPO term reported as not measured | Yes, not measured term removed | -| OBSERVED_AND_EXCLUDED | Same HPO term reported as observed and excluded | No | - - -The issues that are shown as Autocorrect = Yes will be fixed automatically with the following command. - - -```python title="Getting individual objects with no syntax or ontology errors" -individuals = cvalidator.get_error_free_individual_list() -``` - -Issues that cannot be fixed with Autocorrect will lead to the individual being removed from the cohort and usually should be fixed in the notebook before proceeding. - - - -If desired, it is possible to double-check that these individuals have no errors by doing another round of checks with the CohortValidator.
\ No newline at end of file diff --git a/docs/tabular/variant_column_mapper.md b/docs/tabular/variant_column_mapper.md deleted file mode 100644 index 6c5d861e..00000000 --- a/docs/tabular/variant_column_mapper.md +++ /dev/null @@ -1,91 +0,0 @@ -# VariantColumnMapper - -TODO -- update - -ColumnMapper for columns that contain information about the variant(s) found in an individual. -There are two main ways of using this class. If there is a column that contains an HGVS expression -for the variant (without the transcript), then the transcript is indicated by the default_transcript argument. - - -# HGVS vs Structural - -pyphetools uses two different classes to ingest small variants (which must be encoded using valid -[HGVS notation](https://varnomen.hgvs.org/){:target="\_blank"}) or structural variants (which are not validated). - - -### HGVS - -The following code can be adapted to read HGVS-encoded variants. It assumes that there is -a column in the input table called *Variant*. It first extracts a list of unique variants, -and uses [variant validator](https://variantvalidator.org/) to check them and to retrieve additional -data such as the chromosomal position. The resulting *Variant* object is placed in a dictionary together -with the original string used in the table (if necessary, one can correct minor errors such as the failure to -use the "c." below).
- -```python title="Calling Variant Validator for HGVS-encoded variants" -hg38 = 'hg38' -default_genotype = 'heterozygous' -WFS1_transcript='NM_006005.3' -vvalidator = VariantValidator(genome_build=hg38, transcript=WFS1_transcript) -variant_list = df['Variant'].unique() -print(variant_list) -variant_d = {} -for v in variant_list: - if v == "1380del9": - hgvs = "c.1385_1393del" - else: - hgvs = f"c.{v}" - var = vvalidator.encode_hgvs(hgvs) - variant_d[v] = var -``` - -For structural variants, pyphetools encodes the variant using corresponding classes from Sequence Ontology with -the *StructuralVariant* class, which has the static functions - -- chromosomal_deletion -- chromosomal_duplication -- chromosomal_inversion - -as well as a constructor that can take any relevant sequence ontology class. - - -```python title="chromosomal deletion" -sv = StructuralVariant.chromosomal_deletion(cell_contents="46,XY.ish del(7)(p14.1)(RP11-816F16-)", - gene_id="HGNC:4319", - gene_symbol="GLI3") -``` -The cell contents argument contains the variant name as used in the original table. - -It is possible to mix HGVS and structural variants in Python code. An example follows. -```python title="HGVS and structural variants" -struct_variants = { "rsa7p14.1(kit P179)x1", - "46,XY.ish del(7)(p14.1)(RP11-816F16-)", - "46,XX.ish del(7)(p14.1p14.1)(GLI3-)" } -gli3_symbol = "GLI3" -gli3_id = "HGNC:4319" -gli3_variants = df1['cDNA alteration'].unique() -gli3_variant_d = {} -for gli3v in gli3_variants: - if gli3v in struct_variants: - sv = StructuralVariant.chromosomal_deletion(cell_contents=gli3v, gene_id=gli3_id, gene_symbol=gli3_symbol) - print(gli3v) - gli3_variant_d[gli3v] = sv - else: - v = hgvsMapper.encode_hgvs(gli3v) - gli3_variant_d[gli3v] = v -``` - -## VariantColumnMapper - -Once the dictionary of variants has been constructed as above, we create the variant mapper as follows.
- -```python title="VariantColumnMapper constructor" -variantMapper = VariantColumnMapper(variant_d=gli3_variant_d, - variant_column_name="cDNA alteration", - default_genotype='heterozygous' - ) -``` -The VariantColumnMapper is now ready to go to map HGVS expressions in the column "cDNA alteration". - - - diff --git a/docs/tabular/visualization.md b/docs/tabular/visualization.md deleted file mode 100644 index a121c655..00000000 --- a/docs/tabular/visualization.md +++ /dev/null @@ -1,5 +0,0 @@ -# Visualization - - - -todo describe Table Converter \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index dea8a0ff..6d2bb7b2 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,19 +39,7 @@ nav: - 'user-guide/tips_for_curation.md' - 'user-guide/variant_notation.md' - 'user-guide/discombobulator.md' - - Coding tabular data with Python scripts: - - Overview: 'tabular/overview.md' - - Jupyter notebooks: 'tabular/jupyter.md' - - Column mappers: - - Choosing a column mapper: 'tabular/choosing_column_mapper.md' - - Simple column mapper: 'tabular/simple_column_mapper.md' - - Constant column mapper: 'tabular/constant_column_mapper.md' - - Option column mapper: 'tabular/option_column_mapper.md' - - Threshold column mapper: 'tabular/threshold_column_mapper.md' - - Variant column mapper: 'tabular/variant_column_mapper.md' - - Cohort encoder: 'tabular/cohort_encoder.md' - - Validation: 'tabular/validation.md' - - Visualization: 'tabular/visualization.md' + - Variant column mapper: 'user-guide/variant_column_mapper.md' - Visualization: - Overview: 'visualization/index.md' - Kaplan Meier: 'visualization/kaplan_meier_visualizer.md' @@ -64,15 +52,11 @@ nav: - Overview: 'api/overview.md' - creation: - overview: 'api/creation.md' - - AgeColumnMapper: "api/creation/age_column_mapper.md" - CaseTemplateEncoder: "api/creation/case_template_encoder.md" - Citation: "api/creation/citation.md" - CohortEncoder: "api/creation/cohort_encoder.md" - - ColumnMapper: 
"api/creation/column_mapper.md" - - ConstantColumnMapper: "api/creation/constant_column_mapper.md" - - Disease: "api/creation/discombobulator.md" + - Discombobulator: "api/creation/discombobulator.md" - Disease: "api/creation/disease.md" - - DiseaseIdColumnMapper: "api/creation/disease_id_column_mapper.md" - HgvsVariant: "api/creation/hgvs_variant.md" - HpoConceptRecognizer: "api/creation/hpo_cr.md" - HpoExactConceptRecognizer: "api/creation/hpo_exact_cr.md" @@ -80,10 +64,6 @@ nav: - HpTerm: "api/creation/hp_term.md" - Individual: "api/creation/individual.md" - MetaData: "api/creation/metadata.md" - - OptionColumnMapper: "api/creation/option_column_mapper.md" - - SexColumnMapper: "api/creation/sex_column_mapper.md" - - SimpleColumnMapper: "api/creation/simple_column_mapper.md" - - SimpleColumnMapperGenerator: "api/creation/simple_column_mapper_generator.md" - StructuralVariant: "api/creation/structural_variant.md" - TemplateCreator: "api/creation/create_template.md" - Thresholder: "api/creation/thresholder.md"