Merge pull request #40 from airr-community/development

In preparation for release
airr-community · Oct 27, 2017 · 790a8ea · 790a8ea
2 parents 48027d3 + f275db3
commit 790a8ea
Show file tree

Hide file tree

Showing 29 changed files with 823 additions and 112 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+# emacs backup
+*~
diff --git a/.travis.yml b/.travis.yml
@@ -2,6 +2,6 @@ language: python
 python:
   - 3.6
 install:
-  - pip install pyyaml
+  - pip install pyyaml pandas xlrd deepdiff
 script:
-  - scripts/ensure-consistency.py
+  - scripts/check-consistency.py
diff --git a/AIRR_Minimal_Standard_Data_Elements.tsv b/AIRR_Minimal_Standard_Data_Elements.tsv
diff --git a/MiAIRR-Elements_NCBI_mapping.xls b/MiAIRR-Elements_NCBI_mapping.xls
diff --git a/NCBI_implementation/Filled_NCBI_Templates/BioSample_AIRR_Filled_Example.xls b/NCBI_implementation/Filled_NCBI_Templates/BioSample_AIRR_Filled_Example.xls
diff --git a/NCBI_implementation/Filled_NCBI_Templates/SRA_AIRR_Filled_Example.xlsx b/NCBI_implementation/Filled_NCBI_Templates/SRA_AIRR_Filled_Example.xlsx
diff --git a/NCBI_implementation/NCBI Templates/AIRR_BioSample_v1.0.xls b/NCBI_implementation/NCBI Templates/AIRR_BioSample_v1.0.xls
diff --git a/NCBI_implementation/README.md b/NCBI_implementation/README.md
@@ -1,4 +1,4 @@
-![Image](https://github.com/airr-community/airr-standards/raw/master/Images/miairr_logo.png)
+![Image](https://github.com/airr-community/airr-standards/raw/master/images/miairr_logo.png)
 
 _Minimum information about an Adaptive Immune Receptor Repertoire Sequencing Experiment_
 
@@ -20,13 +20,13 @@ elements within these sets are defined
 [here](https://github.com/airr-community/airr-standards/blob/master/AIRR_Minimal_Standard_Data_Elements.tsv). The
 association between these AIRR sets, the associated data elements, and each of the NCBI repositories is shown below:
 
-![Image](https://github.com/airr-community/airr-standards/raw/master/Images/MiAIRR_data_elements_NCBI_targets.png)
+![Image](https://github.com/airr-community/airr-standards/raw/master/images/MiAIRR_data_elements_NCBI_targets.png)
 
 Submission of AIRR sequencing data and metadata to NCBI's public data repositories consists of five sequential steps:
 
 1. Submit study information to [NCBI BioProject](https://submit.ncbi.nlm.nih.gov/subs/bioproject/) using the NCBI web interface.
-2. Submit sample-level information to the [NCBI BioSample repository](https://submit.ncbi.nlm.nih.gov/subs/biosample/) using the [AIRR-BioSample templates](https://github.com/airr-community/airr-standards/raw/master/NCBI_implementation/NCBI%20Templates/AIRR_BioSample_v1.0.xls).
-3. Submit raw sequencing data to [NCBI SRA](https://submit.ncbi.nlm.nih.gov/subs/sra/) using the [AIRR-SRA data templates](https://github.com/airr-community/airr-standards/raw/master/NCBI_implementation/NCBI%20Templates/AIRR_SRA_v1.0.xls).
+2. Submit sample-level information to the [NCBI BioSample repository](https://submit.ncbi.nlm.nih.gov/subs/biosample/) using the [AIRR-BioSample templates](https://github.com/airr-community/airr-standards/raw/master/NCBI_implementation/templates_XLS/AIRR_BioSample_v1.0.xls).
+3. Submit raw sequencing data to [NCBI SRA](https://submit.ncbi.nlm.nih.gov/subs/sra/) using the [AIRR-SRA data templates](https://github.com/airr-community/airr-standards/raw/master/NCBI_implementation/templates_XLS/AIRR_SRA_v1.0.xls).
 4. Generate a DOI for the protocol describing how raw sequencing data were processed using [Zenodo](https://zenodo.org) or an equivalent DOI-granting service.
 5. Submit processed sequencing data with sequence-level annotations to [GenBank](https://www.ncbi.nlm.nih.gov/genbank/tbl2asn2/) using AIRR feature tags.
 

diff --git a/NCBI_implementation/mapping_MiAIRR_BioSample.tsv b/NCBI_implementation/mapping_MiAIRR_BioSample.tsv
@@ -0,0 +1,40 @@
+AIRR Formats WG field name	NCBI BioSample attribute	Keyword relation	Mandatory BioSample attribute	Note
+study_id	bioproject_accession	MAPPED	FALSE	Reference to the associated BioProject record
+subject_id	isolate	MAPPED	TRUE	
+synthetic	synthetic	AIRR_CUSTOM	FALSE	
+organism	organism	IDENTICAL	TRUE	
+sex	sex	IDENTICAL	TRUE	
+age	age	IDENTICAL	TRUE	To be IDENTICAL, `age` MUST be age of subject at sampling time point. In contrast, MiAIRR also allows other reference time points for `age`
+age_event	age_event	AIRR_CUSTOM	FALSE	Value for this field MUST be `sampling` to be consistent with BioSample's `age` definition. See `age`
+ancestry_population	population	MAPPED	FALSE	BioSample attributes `(super_)population_*` were not used as they encode keywords from the Coriell Institute, whose suitability for MiAIRR has not yet been fully evaluated
+ethnicity	ethnicity	IDENTICAL	FALSE	
+race	race	IDENTICAL	FALSE	
+strain_name	strain	MAPPED	FALSE	BioSample has separate attributes for `strain` and `breed`. MiAIRR has only one keyword (`strain_name`) for this information
+linked_subjects	linked_subjects	AIRR_CUSTOM	FALSE	BioSample attributes `family_*` were not used as they suggest a restriction to genetic relationship
+link_type	link_type	AIRR_CUSTOM	FALSE	BioSample attributes `family_*` were not used as they suggest a restriction to genetic relationship
+study_group_description	study_group_description	AIRR_CUSTOM	FALSE	
+disease_diagnosis	disease	MAPPED	FALSE	
+disease_length	disease_length	AIRR_CUSTOM	FALSE	
+disease_stage	disease_stage	IDENTICAL	FALSE	
+prior_therapies	prior_therapies	AIRR_CUSTOM	FALSE	
+immunogen	immunogen	AIRR_CUSTOM	FALSE	
+intervention	treatment	MAPPED	FALSE	
+medical_history	medical_history	AIRR_CUSTOM	FALSE	
+sample_id	sample_name	MAPPED	TRUE	BioSample attribute `bio_material` has an overlapping meaning, however it is not required for submission
+sample_type	sample_type	IDENTICAL	FALSE	
+tissue	tissue	IDENTICAL	TRUE	
+anatomic_site	anatomic_site	AIRR_CUSTOM	FALSE	
+disease_state_sample	health_state	MAPPED	FALSE	
+collection_time_point_relative	collection_time_point_relative	AIRR_CUSTOM	FALSE	BioSample attribute `collection_date` was not used as it defines an absolute date
+collection_time_point_reference	collection_time_point_reference	AIRR_CUSTOM	FALSE	
+biomaterial_provider	biomaterial_provider	IDENTICAL	TRUE	
+tissue_processing	tissue_processing	AIRR_CUSTOM	FALSE	
+cell_subset	cell_type	MAPPED	FALSE	
+cell_phenotype	cell_phenotype	AIRR_CUSTOM	FALSE	
+single_cell	single_cell	AIRR_CUSTOM	FALSE	
+cell_number	cell_number	AIRR_CUSTOM	FALSE	
+cells_per_reaction	cells_per_reaction	AIRR_CUSTOM	FALSE	
+cell_storage	cell_storage	AIRR_CUSTOM	FALSE	
+cell_quality	cell_quality	AIRR_CUSTOM	FALSE	
+cell_isolation	cell_isolation	AIRR_CUSTOM	FALSE	
+cell_processing_protocol	cell_processing_protocol	AIRR_CUSTOM	FALSE	
diff --git a/NCBI_implementation/mapping_MiARR_SRA.tsv b/NCBI_implementation/mapping_MiARR_SRA.tsv
@@ -0,0 +1,34 @@
+AIRR Formats WG field name	NCBI SRA attribute	Keyword relation	Mandatory SRA attribute	Note
+study_id	bioproject_accession	MAPPING	TRUE	
+sample_id	sample_name	MAPPING	TRUE	
+nucleic_acid_processing_id	library_ID	MAPPING	TRUE	
+NULL	title	DATABASE_SPECIFIC	TRUE	
+NULL	library_strategy	DATABASE_SPECIFIC	TRUE	
+NULL	library_source	DATABASE_SPECIFIC	TRUE	
+NULL	library_selection	DATABASE_SPECIFIC	TRUE	
+NULL	library_layout	DATABASE_SPECIFIC	TRUE	
+NULL	platform	DATABASE_SPECIFIC	TRUE	
+sequencing_platform	instrument_model	MAPPING	TRUE	SRA splits this information into `platform` and `instrument_model`, however the controlled vocabulary of the latter one also often contains the `platform` information. Therefore preference was given to a 1:1 mapping using `instrument_model`
+library_generation_protocol	design_description	MAPPING	TRUE	
+NULL	filetype	DATABASE_SPECIFIC	TRUE	
+NULL	filename	DATABASE_SPECIFIC	TRUE	
+NULL	filename2	DATABASE_SPECIFIC	FALSE	
+NULL	filename3	DATABASE_SPECIFIC	FALSE	
+NULL	filename4	DATABASE_SPECIFIC	FALSE	
+NULL	assembly	DATABASE_SPECIFIC	FALSE	
+template_class	template_class	AIRR_CUSTOM	FALSE	SRA keyword `library_source` is related to this field, but makes a number of distinctions (bulk vs. single-cell) that are incompatible with the current definition of `template_class`
+template_quality	template_quality	AIRR_CUSTOM	FALSE	
+template_amount	template_amount	AIRR_CUSTOM	FALSE	
+library_generation_method	library_generation_method	AIRR_CUSTOM	FALSE	SRA keyword `library_strategy` is related to this field, but uses a controlled vocabulary that is not fine-grained enough to provide the required information of MiAIRR `library_generation_method` (e.g. mode of cDNA generation, UMI, etc.)
+library_generation_kit_version	library_generation_kit_version	AIRR_CUSTOM	FALSE	
+pcr_target_locus	pcr_target_locus	AIRR_CUSTOM	FALSE	
+forward_pcr_primer_target_location	forward_pcr_primer_target_location	AIRR_CUSTOM	FALSE	
+reverse_pcr_primer_target_location	reverse_pcr_primer_target_location	AIRR_CUSTOM	FALSE	
+complete_sequences	complete_sequences	AIRR_CUSTOM	FALSE	
+physical_linkage	physical_linkage	AIRR_CUSTOM	FALSE	
+total_reads_passing_qc_filter	total_reads_passing_qc_filter	AIRR_CUSTOM	FALSE	
+read_length	read_length	AIRR_CUSTOM	FALSE	
+sequencing_facility	sequencing_facility	AIRR_CUSTOM	FALSE	
+sequencing_run_id	sequencing_run_id	AIRR_CUSTOM	FALSE	
+sequencing_run_date	sequencing_run_date	AIRR_CUSTOM	FALSE	
+sequencing_kit	sequencing_kit	AIRR_CUSTOM	FALSE	
diff --git a/...ation/Filled_NCBI_Templates/F_AIRR_BS.tsv → ...tion/templates_TSV_examples/F_AIRR_BS.tsv b/...ation/Filled_NCBI_Templates/F_AIRR_BS.tsv → ...tion/templates_TSV_examples/F_AIRR_BS.tsv
diff --git a/...tion/Filled_NCBI_Templates/F_AIRR_SRA.tsv → ...ion/templates_TSV_examples/F_AIRR_SRA.tsv b/...tion/Filled_NCBI_Templates/F_AIRR_SRA.tsv → ...ion/templates_TSV_examples/F_AIRR_SRA.tsv
diff --git a/NCBI_implementation/templates_XLS/AIRR_BioSample_v1.0.xls b/NCBI_implementation/templates_XLS/AIRR_BioSample_v1.0.xls
diff --git a/...entation/NCBI Templates/AIRR_SRA_v1.0.xls → ...mentation/templates_XLS/AIRR_SRA_v1.0.xls b/...entation/NCBI Templates/AIRR_SRA_v1.0.xls → ...mentation/templates_XLS/AIRR_SRA_v1.0.xls
diff --git a/...n/NCBI-XML Templates/AIRR_BS_SRA_v1.0.xml → ...tation/templates_XML/AIRR_BS_SRA_v1.0.xml b/...n/NCBI-XML Templates/AIRR_BS_SRA_v1.0.xml → ...tation/templates_XML/AIRR_BS_SRA_v1.0.xml
diff --git a/...emplates/AIRR_SRA_ONTOLOGY_HARMONIZED.xml → ...ates_XML/AIRR_SRA_ONTOLOGY_HARMONIZED.xml b/...emplates/AIRR_SRA_ONTOLOGY_HARMONIZED.xml → ...ates_XML/AIRR_SRA_ONTOLOGY_HARMONIZED.xml
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-![Image](https://github.com/airr-community/airr-standards/raw/master/Images/miairr_logo.png)
+![Image](https://github.com/airr-community/airr-standards/raw/master/images/miairr_logo.png)
 
 _Minimum information about an Adaptive Immune Receptor Repertoire Sequencing Experiment_
 

diff --git a/Images/MiAIRR-to-NCBI_mapping_overview.png → images/MiAIRR-to-NCBI_mapping_overview.png b/Images/MiAIRR-to-NCBI_mapping_overview.png → images/MiAIRR-to-NCBI_mapping_overview.png
diff --git a/Images/MiAIRR-to-NCBI_mapping_overview.svg → images/MiAIRR-to-NCBI_mapping_overview.svg b/Images/MiAIRR-to-NCBI_mapping_overview.svg → images/MiAIRR-to-NCBI_mapping_overview.svg
diff --git a/Images/MiAIRR_data_elements.svg → images/MiAIRR_data_elements.svg b/Images/MiAIRR_data_elements.svg → images/MiAIRR_data_elements.svg
diff --git a/Images/MiAIRR_data_elements_NCBI_targets.png → images/MiAIRR_data_elements_NCBI_targets.png b/Images/MiAIRR_data_elements_NCBI_targets.png → images/MiAIRR_data_elements_NCBI_targets.png
diff --git a/Images/MiAIRR_data_elements_plain.png → images/MiAIRR_data_elements_plain.png b/Images/MiAIRR_data_elements_plain.png → images/MiAIRR_data_elements_plain.png
diff --git a/Images/manual1.png → images/manual1.png b/Images/manual1.png → images/manual1.png
diff --git a/Images/manual2.png → images/manual2.png b/Images/manual2.png → images/manual2.png
diff --git a/Images/manual3.png → images/manual3.png b/Images/manual3.png → images/manual3.png
diff --git a/Images/miairr.png → images/miairr.png b/Images/miairr.png → images/miairr.png
diff --git a/Images/miairr_logo.png → images/miairr_logo.png b/Images/miairr_logo.png → images/miairr_logo.png
diff --git a/scripts/check-consistency.py b/scripts/check-consistency.py
@@ -0,0 +1,118 @@
+#! /usr/bin/env python
+
+import sys
+from collections import Counter
+
+import yaml
+import csv
+from deepdiff import DeepDiff
+
+object_map = { '1 / study': 'MiAIRR_Study',
+               '1 / subject': 'MiAIRR_Subject',
+               '1 / diag. & intervent.': 'MiAIRR_Diagnosis',
+               '2 / sample': 'MiAIRR_Sample',
+               '3 / process (cell)': 'MiAIRR_CellProcessing',
+               '3 / process (nucl. acid)': 'MiAIRR_NucleicAcidProcessing',
+               '5 / process (comput.)': 'MiAIRR_SoftwareProcessing',
+               '6 / data (proc. seq.)': 'MiAIRR_Rearrangement' }
+
+with open('AIRR_Minimal_Standard_Data_Elements.tsv', 'r') as ip:
+    dictReader = csv.DictReader(ip, dialect='excel-tab')
+    miairr_elements = [line for line in dictReader]
+
+with open('AIRR_Minimal_Standard_Data_Elements.tsv', 'r') as ip:
+    # header line present
+    assert next(ip).split()[0] == 'MiAIRR'
+
+    table = [line.split('\t')[6].strip() for line in ip]
+    # handle the exceptional 4 / data line
+    assert table.count('') == 1
+    _ = table.pop(table.index(''))
+
+with open('specs/definitions.yaml', 'r') as ip:
+    definitions = yaml.load(ip)
+    properties = [property
+                  for obj in definitions.values()
+                  for property in obj['properties']
+                  if obj.get('discriminator') == 'MiAIRR']
+
+failed = False
+
+# check for uniqueness of fields in AIRR_Minimal_Standard_Data_Elements.tsv
+if len(table) != len(set(table)):
+    print('Duplicate entries found in AIRR_Minimal_Standard_Data_Elements.tsv', file=sys.stderr)
+    for k, v in Counter(table).items():
+        if v > 1:
+            print(f'{k:30} found {v} times in tsv when it should be unique\n', file=sys.stderr)
+    failed = True
+
+# check for differences in fields between specs/definitions.yaml and
+# AIRR_Minimal_Standard_Data_Elements.tsv
+for key in object_map.keys():
+    elements = [element['AIRR Formats WG field name'] for element in miairr_elements
+                if element['MiAIRR data set / subset'] == key]
+    definition = definitions.get(object_map[key])
+    if not definition:
+        print(f'{object_map[key]} not found in definitions.yaml.\n', file=sys.stderr)
+        failed = True
+        continue
+
+    properties = [property for property in definition['properties']]
+    if set(elements) != set(properties):
+        print(f'{object_map[key]} does not match TSV', file=sys.stderr)
+        for field in set(properties) - set(elements):
+            print(f'{field:30} is found in yaml but not tsv for {object_map[key]}', file=sys.stderr)
+        for field in set(elements) - set(properties):
+            print(f'{field:30} is found in tsv but not yaml for {object_map[key]}', file=sys.stderr)
+        failed = True
+
+# check that MiAIRR object definitions contained
+# within AIRR definition
+for definition in definitions.keys():
+    if definitions[definition].get('discriminator') == 'MiAIRR':
+        name = definition.split('_')[1]
+        if not definitions.get(name):
+            print(f'{name} corresponding to {definition} not found in definitions.yaml', file=sys.stderr)
+            failed = True
+            continue
+
+        for prop in definitions[definition]['properties']:
+            if not definitions[name]['properties'].get(prop):
+                print(f'{prop} in {definition} object is not in {name} object.', file=sys.stderr)
+                failed = True
+                continue
+            ddiff = DeepDiff(definitions[definition]['properties'][prop], definitions[name]['properties'][prop], ignore_order=True)
+            if ddiff:
+                print(f'{prop} in {definition} object is not the same object in {name}.', file=sys.stderr)
+                print(ddiff, file=sys.stderr)
+                failed = True
+
+# check consistency with NCBI XML definitions, per @BusseChristian's pseudocode
+# in https://github.com/airr-community/airr-standards/issues/20
+import pandas as pd
+
+miairr_table = pd.read_csv('AIRR_Minimal_Standard_Data_Elements.tsv', sep='\t', header=0, index_col=None)
+miairr_biosample_rows = miairr_table.iloc[:, 0].isin(["1 / subject", "1 / diag. & intervent.", "2 / sample", "3 / process (cell)"])
+miairr_identifiers = set(miairr_table[miairr_biosample_rows].iloc[:, 6])
+miairr_identifiers.add('study_id') # manually add
+miairr_mapping = {}
+with open('NCBI_implementation/mapping_MiAIRR_BioSample.tsv', 'r') as ip:
+    dictReader = csv.DictReader(ip, dialect='excel-tab')
+    for line in dictReader:
+        miairr_mapping[line['AIRR Formats WG field name']] = line['NCBI BioSample attribute']
+mapped_identifiers = set([miairr_mapping.get(name, name) for name in miairr_identifiers])
+
+ncbi_biosample = pd.read_excel('NCBI_implementation/templates_XLS/AIRR_BioSample_v1.0.xls', skiprows=13)
+ncbi_identifiers = set([x.lstrip('*') for x in ncbi_biosample.columns])
+
+if mapped_identifiers != ncbi_identifiers:
+    print('AIRR_Minimal_Standard_Data_Elements.tsv does not match AIRR_BioSample_v1.0.xls', file=sys.stderr)
+    for field in set(mapped_identifiers) - set(ncbi_identifiers):
+        print(f'{field:30} is found in MiAIRR table tsv but not in NCBI Biosample template xls', file=sys.stderr)
+    for field in set(ncbi_identifiers) - set(mapped_identifiers):
+        print(f'{field:30} is found in NCBI Biosample template xls but not in MiAIRR table tsv', file=sys.stderr)
+    failed = True
+
+if failed:
+    print('consistency checks failed', file=sys.stderr)
+    sys.exit(1)