
Commit

refactoring citations
pnrobinson committed Nov 27, 2023
1 parent 786ddac commit 3e2f439
Showing 17 changed files with 155 additions and 128 deletions.
3 changes: 3 additions & 0 deletions docs/api/creation/citation.md
@@ -0,0 +1,3 @@
# Citation

::: src.pyphetools.creation.Citation
63 changes: 32 additions & 31 deletions docs/user-guide/tutorial.md
@@ -1,15 +1,15 @@
# Tutorial


The goal of the pyphetools library is to provide a software framework for transforming tabular
data about cohorts with rare or common disease into a collection
of [GA4GH phenopackets](https://phenopacket-schema.readthedocs.io/en/latest/){:target="\_blank"}. We have applied the library
to extract phenopackets from tables provided as
supplemental files of publications describing cohorts of individuals with a rare disease
(Excel, tab or comma-separated value files, and even files copied into a
spreadsheet application from the original PDF file of the article).

The best way to get a feeling for how to work with pyphetools is to examine the various notebooks in the
[phenopacket-store](https://github.com/monarch-initiative/phenopacket-store){:target="\_blank"} repository.

This tutorial provides some general tips for how to use the library. The library is intended to be used in a Jupyter notebook environment so that users can check intermediate results.
@@ -27,7 +27,7 @@ python3 -m ipykernel install --name your_env --user
jupyter-notebook
```

The virtual environment (here *your_env*) can be named as desired. The last line opens a Jupyter Notebook page;
create a new Notebook and choose the kernel called *your_env* (or whatever you called it).


@@ -36,22 +36,19 @@


Most notebooks will want to first import all necessary packages. It is helpful to print out the version
used (see the last two lines) in case of errors or feature requests. Sometimes, additional packages need
to be imported to support special cases.


```python title="imports"
import phenopackets as php
from google.protobuf.json_format import MessageToDict, MessageToJson
from google.protobuf.json_format import Parse, ParseDict
import pandas as pd
import math
import re
from csv import DictReader
from collections import defaultdict
from IPython.display import display, HTML
pd.set_option('display.max_colwidth', None)  # show entire column contents, important!
from pyphetools.creation import *
from pyphetools.visualization import *
from pyphetools.validation import *
import pyphetools
print(f"Using pyphetools version {pyphetools.__version__}")
```


@@ -66,11 +63,15 @@ It is useful to import the HPO file and create the MetaData object (which record
parser = HpoParser()
hpo_cr = parser.get_hpo_concept_recognizer()
hpo_version = parser.get_version()
metadata = MetaData(created_by="ORCID:0000-0002-0736-9199")
hpo_ontology = parser.get_ontology()
PMID = "PMID:16783569"
title = "A novel X-linked recessive mental retardation syndrome comprising macrocephaly and ciliary dysfunction is allelic to oral-facial-digital type I syndrome"
metadata = MetaData(created_by="ORCID:0000-0002-5648-2155", pmid=PMID, pubmed_title=title)
metadata.default_versions_with_hpo(version=hpo_version)
print(f"HPO version {hpo_version}")
```
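
Note that this commit also changes `MetaData` to accept a `Citation` object (see `metadata.py` further down), so a Citation-based form of the block above would look roughly like the following sketch; it reuses `hpo_version` from the block above and is not taken verbatim from the updated tutorial.

```python title="MetaData via Citation (sketch)"
# Sketch only: uses the Citation class and the MetaData(created_by, citation=...) signature
# introduced in this commit; PMID, title, and ORCID are copied from the block above.
citation = Citation(pmid="PMID:16783569",
                    title="A novel X-linked recessive mental retardation syndrome comprising "
                          "macrocephaly and ciliary dysfunction is allelic to oral-facial-digital "
                          "type I syndrome")
metadata = MetaData(created_by="ORCID:0000-0002-5648-2155", citation=citation)
metadata.default_versions_with_hpo(version=hpo_version)
```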

It is often useful to display the title and PubMed identifier of the publication from which the data come.
Simply replace the above code with the following.

```python title="HPO and MetaData (with title and PMID)"
@@ -118,21 +119,21 @@ df.columns



## Clinical columns
pyphetools expects to get a dictionary whose keys correspond to the column names used by the pandas DataFrame,
and the values are the corresponding ColumnMapper objects. pyphetools offers different types of ColumnMapper objects, whose goal is to
encode the id, age, sex, variants, and clinical information encoded by HPO terms. We first create a dictionary whose keys should be the
names (strings) of the columns of the table and whose values are the corresponding ColumnMapper objects that we need to create for each column we
want to map (a minimal sketch of such a dictionary follows the list below). Note that it is not necessary to map each column of a table.


Data with clinical columns in typical supplemental files often have one of the following forms.


1. Simple. The column header is a string such as 'ID' that corresponds to an HPO term
([Intellectual disability HP:0001249](https://hpo.jax.org/app/browse/term/HP:0001249){:target="\_blank"}) and each cell has a symbol such as
'Y', 'y', '+', 'yes', 'n', '-', etc. to indicate whether the feature was present in the individual specified by the row. See :ref:`simple_column_mapper` for more information about how to work with this kind of column.
2. Options. Some columns contain several strings, each of which corresponds to a specific HPO term. For instance, a column such as 'severity of ID' with entries such as `mild`, `moderate`, `severe` would correspond to HPO terms for
`Intellectual disability, mild HP:0001256 <https://hpo.jax.org/app/browse/term/HP:0001256>`_, etc. See :ref:`option_column_mapper` for more information about how to work with this kind of column.
3. Custom. This mapper is used for columns whose cells contain longer strings. We use a combination of text mining and specification of strings that were not matched by mining to extract HPO terms. See :ref:`custom_column_mapper_rst` for more information.
4. Constant. This mapper can be used if all individuals display the same feature. See :ref:`simple_column_mapper`.
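
A minimal sketch of such a dictionary follows; the constructor arguments for `SimpleColumnMapper` shown here are illustrative assumptions, not verified against the pyphetools API (see the ColumnMapper documentation pages for the actual signatures).

```python
# Sketch only: SimpleColumnMapper arguments are assumed for illustration.
from pyphetools.creation import SimpleColumnMapper  # also available via the wildcard import above

column_mapper_d = {
    # key = column name in the pandas DataFrame, value = the ColumnMapper that interprets it
    "ID": SimpleColumnMapper(hpo_id="HP:0001249",
                             hpo_label="Intellectual disability",
                             observed="Y",
                             excluded="N"),
}
# Columns that are not listed in the dictionary are simply not mapped.
```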
@@ -146,8 +147,8 @@ pyphetools expects the rows to represent individuals. In some cases, input files

### Converting to row-based format

To use pyphetools, we need to have the individuals represented as rows (one row per individual) and have the items of interest be encoded as column names.
The required transformations for doing this may be different for different input data, but often we will want to transpose the table (using the pandas transpose function)
and set the column names of the new table to the zero-th row. After this, we drop the zero-th row (otherwise, it will be interpreted as an individual by the pyphetools code).
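
For example, a minimal pandas sketch of this transformation (assuming the original table was read into `df` in an earlier step):

```python
dft = df.transpose()                  # individuals become rows
dft.columns = dft.iloc[0]             # use the zero-th row as the new column names
dft.drop(dft.index[0], inplace=True)  # drop that row so it is not treated as an individual
dft.head()
```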


@@ -163,7 +164,7 @@ dft.head()


Another thing to look out for is whether the individuals (usually the first column) are regarded as the index of the table or as the first normal column.
If this is the case, it is easiest to create a new column with the contents of the index -- this will work with the pyphetools software.
An example follows -- we can now use 'patient_id' as the column name. It is easier to work with this than with the index column.
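
The collapsed example presumably resembles the following sketch (the `patient_id` name comes from the sentence above; the exact original code is not shown in this view).

```python
dft['patient_id'] = dft.index  # copy the index into a regular column
dft.head()
```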


1 change: 1 addition & 0 deletions mkdocs.yml
@@ -51,6 +51,7 @@ nav:
- "overview": "api/creation.md"
- "AgeColumnMapper": "api/creation/age_column_mapper.md"
- "CaseEncoder": "api/creation/case_encoder.md"
- Citation: "api/creation/citation.md"
- "CohortEncoder": "api/creation/cohort_encoder.md"
- "ColumnMapper": "api/creation/column_mapper.md"
- "ConstantColumnMapper": "api/creation/constant_column_mapper.md"
2 changes: 1 addition & 1 deletion src/pyphetools/__init__.py
@@ -3,7 +3,7 @@
from . import visualization
from . import validation

__version__ = "0.8.27"
__version__ = "0.8.31"

__all__ = [
"creation",
4 changes: 3 additions & 1 deletion src/pyphetools/creation/__init__.py
@@ -2,6 +2,7 @@
from .age_isoformater import AgeIsoFormater
from .allelic_requirement import AllelicRequirement
from .case_encoder import CaseEncoder
from .citation import Citation
from .cohort_encoder import CohortEncoder
from .column_mapper import ColumnMapper
from .constant_column_mapper import ConstantColumnMapper
@@ -29,7 +30,8 @@
"AgeColumnMapper",
"AgeIsoFormater",
"AllelicRequirement",
"CaseEncoder" ,
"CaseEncoder",
"Citation",
"CohortEncoder",
"ColumnMapper",
"ConstantColumnMapper",
12 changes: 7 additions & 5 deletions src/pyphetools/creation/case_encoder.py
@@ -5,6 +5,7 @@
import re
from typing import Set
from collections import defaultdict
from .citation import Citation
from .column_mapper import ColumnMapper
from .constants import Constants
from .disease import Disease
@@ -25,8 +26,8 @@ class CaseEncoder:
:param hpo_cr: HpoConceptRecognizer for text mining
:type hpo_cr: pyphetools.creation.HpoConceptRecognizer
:param pmid: PubMed identifier of this case report
:type pmid: str
:param citation: PubMed identifier and title of this case report
:type citation: Citation
:param individual_id: Application specific individual identifier
:type individual_id: str
:param metadata: GA4GH MetaData object
@@ -41,7 +42,7 @@ class CaseEncoder:

def __init__(self,
hpo_cr: HpoConceptRecognizer,
pmid: str,
citation: Citation,
individual_id:str,
metadata:PPKt.MetaData,
age_at_last_exam:str=None,
@@ -51,6 +52,7 @@ def __init__(self,
raise ValueError(
f"concept_recognizer argument must be HpoConceptRecognizer but was {type(hpo_cr)}")
self._hpo_concept_recognizer = hpo_cr
pmid = citation.pmid
if not pmid.startswith("PMID:"):
raise ValueError(f"Malformed pmid argument ({pmid}). Must start with PMID:")
if age_at_last_exam is not None:
@@ -84,8 +86,8 @@ def __init__(self,
self._individual.set_age(age_at_last_exam)
if disease is not None:
self._individual.set_disease(disease=disease)
if pmid is not None:
self._individual.set_pmid(pmid=pmid)
if citation is not None:
self._individual.set_citation(citation=citation)


def add_vignette(self, vignette, custom_d=None, custom_age=None, false_positive=None, excluded_terms:Set[str]=None) -> pd.DataFrame:
24 changes: 24 additions & 0 deletions src/pyphetools/creation/citation.py
@@ -0,0 +1,24 @@



class Citation:
"""encapsulate information about a citation that we add to the metadata for display
:param pmid: PubMed identifier for the publication in which this individual was described (e.g. PMID:321..).
:type pmid: str
:param title: Title of the publication in which this individual was described.
:type title: str
"""

def __init__(self, pmid, title) -> None:
self._pmid = pmid
self._title = title


@property
def pmid(self):
return self._pmid

@property
def title(self):
return self._title
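
A minimal sketch of how the new class is threaded through `CaseEncoder` (per the updated constructor in `case_encoder.py` above); `hpo_cr` and `metadata` are assumed to have been created earlier (an `HpoConceptRecognizer` and a GA4GH `MetaData` message, respectively), and the individual id is illustrative.

```python
from pyphetools.creation import Citation, CaseEncoder

citation = Citation(pmid="PMID:16783569",
                    title="A novel X-linked recessive mental retardation syndrome ...")  # title abbreviated for this sketch

# CaseEncoder now receives a Citation instead of a bare pmid string.
encoder = CaseEncoder(hpo_cr=hpo_cr,
                      citation=citation,
                      individual_id="individual A",
                      metadata=metadata)
```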
29 changes: 14 additions & 15 deletions src/pyphetools/creation/individual.py
@@ -3,6 +3,7 @@
import os
from typing import List, Union
from google.protobuf.json_format import MessageToJson
from .citation import Citation
from .constants import Constants
from .disease import Disease
from .hp_term import HpTerm
@@ -31,8 +32,7 @@ class Individual:
def __init__(self,
individual_id:str,
hpo_terms:List[HpTerm]=None,
pmid:str=None,
title:str=None,
citation:Citation=None,
sex:str=Constants.NOT_PROVIDED,
age:str=Constants.NOT_PROVIDED,
interpretation_list:List[PPKt.VariantInterpretation]=None,
@@ -59,8 +59,7 @@ def __init__(self,
else:
self._interpretation_list = interpretation_list
self._disease = disease
self._pmid = pmid
self._title = title
self._citation = citation

@property
def id(self):
@@ -155,14 +154,14 @@ def set_hpo_terms(self, cleansed_hpo_terms:List[HpTerm]):

@property
def pmid(self):
return self._pmid
return self._citation.pmid

def set_pmid(self, pmid:str):
def set_citation(self, citation:Citation):
"""
:param pmid: The PubMed identifier for the publication in which this individual was described (e.g. PMID:321..)
:type pmid: str
:param citation: Object with the title and PubMed identifier for the publication in which this individual was described (e.g. PMID:321..)
:type citation: Citation
"""
self._pmid = pmid
self._citation = citation

def get_phenopacket_id(self, phenopacket_id=None) -> str:
"""
@@ -171,8 +170,8 @@ def get_phenopacket_id(self, phenopacket_id=None) -> str:
"""
if phenopacket_id is None:
indi_id = self._individual_id.replace(" ", "_")
if self._pmid is not None:
pmid = self._pmid.replace(":", "_")
if self._citation is not None:
pmid = self._citation.pmid.replace(":", "_")
ppkt_id = f"{pmid}_{indi_id}"
else:
ppkt_id = indi_id
@@ -238,14 +237,14 @@ def to_ga4gh_phenopacket(self, metadata, phenopacket_id=None):
genomic_interpretation.variant_interpretation.CopyFrom(var)
interpretation.diagnosis.genomic_interpretations.append(genomic_interpretation)
php.interpretations.append(interpretation)
if self._pmid is not None and self._title is not None:
if self._citation is not None:
# overrides the "general" setting of the external reference for the entire cohort
metadata.external_references.clear()
extref = PPKt.ExternalReference()
extref.id = self._pmid
pm = self._pmid.replace("PMID:", "")
extref.id = self._citation.pmid
pm = self._citation.pmid.replace("PMID:", "")
extref.reference = f"https://pubmed.ncbi.nlm.nih.gov/{pm}"
extref.description = self._title
extref.description = self._citation.title
metadata.external_references.append(extref)
php.meta_data.CopyFrom(metadata)
return php
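
Based on the visible logic above, the phenopacket id now incorporates the Citation's PMID, and the `pmid` property delegates to the Citation; a small sketch (the expected id is inferred from the string handling shown and could differ if the collapsed code does further sanitizing):

```python
from pyphetools.creation import Citation, Individual

citation = Citation(pmid="PMID:16783569",
                    title="A novel X-linked recessive mental retardation syndrome ...")  # title abbreviated for this sketch
individual = Individual(individual_id="individual A", citation=citation)

print(individual.pmid)                  # "PMID:16783569", taken from the Citation
print(individual.get_phenopacket_id())  # e.g. "PMID_16783569_individual_A"
```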
16 changes: 8 additions & 8 deletions src/pyphetools/creation/metadata.py
@@ -4,6 +4,8 @@
import phenopackets as PPKt
from google import protobuf

from .citation import Citation


class Resource:
def __init__(self, resource_id, name, namespace_prefix, iriprefix, url, version) -> None:
@@ -64,24 +66,22 @@ class MetaData:
:param created_by: identifier (such as ORCID id) of the person who created this Phenopacket
:type created_by: str
:param pmid: PubMed identifier of the article from which the data for the phenopacket was taken, optional
:type pmid: str
:param pubmed_title: title of the article (if any), for use in the Resource section
:type pubmed_title: str
:param citation: Citation with the PubMed identifier and title of the article from which the data for the phenopacket were taken, optional
:type citation: Citation
"""

def __init__(self, created_by, pmid=None, pubmed_title=None) -> None:
def __init__(self, created_by, citation=None) -> None:
"""
Constructor
"""
self._created_by = created_by
self._schema_version = "2.0"
self._extref = None
if pmid is not None and pubmed_title is not None:
self.set_external_reference(pmid=pmid, pubmed_title=pubmed_title)
if citation is not None:
self.set_external_reference(pmid=citation.pmid, pubmed_title=citation.title)
self._resource_d = defaultdict(Resource)

def default_versions_with_hpo(self, version, pmid=None, pubmed_title=None):
def default_versions_with_hpo(self, version):
"""
Add resources for HPO (with specified version), GENO, HGNC, and OMIM (with default versions)
The HPO version can be easily obtained from the HpoParser using the get_version() function
