-
Notifications
You must be signed in to change notification settings - Fork 23
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #40 from airr-community/development
In preparation for release
- Loading branch information
Showing
29 changed files
with
823 additions
and
112 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
# emacs backup | ||
*~ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file added
BIN
+61 KB
NCBI_implementation/Filled_NCBI_Templates/BioSample_AIRR_Filled_Example.xls
Binary file not shown.
Binary file added
BIN
+15.7 KB
NCBI_implementation/Filled_NCBI_Templates/SRA_AIRR_Filled_Example.xlsx
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
AIRR Formats WG field name NCBI BioSample attribute Keyword relation Mandatory BioSample attribute Note | ||
study_id bioproject_accession MAPPED FALSE Reference to the associated BioProject record | ||
subject_id isolate MAPPED TRUE | ||
synthetic synthetic AIRR_CUSTOM FALSE | ||
organism organism IDENTICAL TRUE | ||
sex sex IDENTICAL TRUE | ||
age age IDENTICAL TRUE To be IDENTICAL, `age` MUST be age of subject at sampling time point. In contrast, MiAIRR also allows other reference time points for `age` | ||
age_event age_event AIRR_CUSTOM FALSE Value for this field MUST be `sampling` to be consistent with BioSample's `age` definition. See `age` | ||
ancestry_population population MAPPED FALSE BioSample attributes `(super_)population_*` were not used as they encode keywords from the Coriell Institute, whose suitability for MiAIRR has not yet been fully evaluated | ||
ethnicity ethnicity IDENTICAL FALSE | ||
race race IDENTICAL FALSE | ||
strain_name strain MAPPED FALSE BioSample has separate attributes for `strain` and `breed`. MiAIRR has only one keyword (`strain_name`) for this information | ||
linked_subjects linked_subjects AIRR_CUSTOM FALSE BioSample attributes `family_*` were not used as they suggest a restriction to genetic relationship | ||
link_type link_type AIRR_CUSTOM FALSE BioSample attributes `family_*` were not used as they suggest a restriction to genetic relationship | ||
study_group_description study_group_description AIRR_CUSTOM FALSE | ||
disease_diagnosis disease MAPPED FALSE | ||
disease_length disease_length AIRR_CUSTOM FALSE | ||
disease_stage disease_stage IDENTICAL FALSE | ||
prior_therapies prior_therapies AIRR_CUSTOM FALSE | ||
immunogen immunogen AIRR_CUSTOM FALSE | ||
intervention treatment MAPPED FALSE | ||
medical_history medical_history AIRR_CUSTOM FALSE | ||
sample_id sample_name MAPPED TRUE BioSample attribute `bio_material` has an overlapping meaning, however it is not required for submission | ||
sample_type sample_type IDENTICAL FALSE | ||
tissue tissue IDENTICAL TRUE | ||
anatomic_site anatomic_site AIRR_CUSTOM FALSE | ||
disease_state_sample health_state MAPPED FALSE | ||
collection_time_point_relative collection_time_point_relative AIRR_CUSTOM FALSE BioSample attribute `collection_date` was not used as it defines an absolute date | ||
collection_time_point_reference collection_time_point_reference AIRR_CUSTOM FALSE | ||
biomaterial_provider biomaterial_provider IDENTICAL TRUE | ||
tissue_processing tissue_processing AIRR_CUSTOM FALSE | ||
cell_subset cell_type MAPPED FALSE | ||
cell_phenotype cell_phenotype AIRR_CUSTOM FALSE | ||
single_cell single_cell AIRR_CUSTOM FALSE | ||
cell_number cell_number AIRR_CUSTOM FALSE | ||
cells_per_reaction cells_per_reaction AIRR_CUSTOM FALSE | ||
cell_storage cell_storage AIRR_CUSTOM FALSE | ||
cell_quality cell_quality AIRR_CUSTOM FALSE | ||
cell_isolation cell_isolation AIRR_CUSTOM FALSE | ||
cell_processing_protocol cell_processing_protocol AIRR_CUSTOM FALSE |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
AIRR Formats WG field name NCBI SRA attribute Keyword relation Mandatory SRA attribute Note | ||
study_id bioproject_accession MAPPING TRUE | ||
sample_id sample_name MAPPING TRUE | ||
nucleic_acid_processing_id library_ID MAPPING TRUE | ||
NULL title DATABASE_SPECIFIC TRUE | ||
NULL library_strategy DATABASE_SPECIFIC TRUE | ||
NULL library_source DATABASE_SPECIFIC TRUE | ||
NULL library_selection DATABASE_SPECIFIC TRUE | ||
NULL library_layout DATABASE_SPECIFIC TRUE | ||
NULL platform DATABASE_SPECIFIC TRUE | ||
sequencing_platform instrument_model MAPPING TRUE SRA splits this information into `platform` and `instrument_model`, however the controlled vocabulary of the latter one also often contains the `platform` information. Therefore preference was given to a 1:1 mapping using `instrument_model` | ||
library_generation_protocol design_description MAPPING TRUE | ||
NULL filetype DATABASE_SPECIFIC TRUE | ||
NULL filename DATABASE_SPECIFIC TRUE | ||
NULL filename2 DATABASE_SPECIFIC FALSE | ||
NULL filename3 DATABASE_SPECIFIC FALSE | ||
NULL filename4 DATABASE_SPECIFIC FALSE | ||
NULL assembly DATABASE_SPECIFIC FALSE | ||
template_class template_class AIRR_CUSTOM FALSE SRA keyword `library_source` is related to this field, but makes a number of distinctions (bulk vs. single-cell) that are incompatible with the current definition of `template_class` | ||
template_quality template_quality AIRR_CUSTOM FALSE | ||
template_amount template_amount AIRR_CUSTOM FALSE | ||
library_generation_method library_generation_method AIRR_CUSTOM FALSE SRA keyword `library_strategy` is related to this field, but uses a controlled vocabulary that is not fine-grained enough to provide the required information of MiAIRR `library_generation_method` (e.g. mode of cDNA generation, UMI, etc.) | ||
library_generation_kit_version library_generation_kit_version AIRR_CUSTOM FALSE | ||
pcr_target_locus pcr_target_locus AIRR_CUSTOM FALSE | ||
forward_pcr_primer_target_location forward_pcr_primer_target_location AIRR_CUSTOM FALSE | ||
reverse_pcr_primer_target_location reverse_pcr_primer_target_location AIRR_CUSTOM FALSE | ||
complete_sequences complete_sequences AIRR_CUSTOM FALSE | ||
physical_linkage physical_linkage AIRR_CUSTOM FALSE | ||
total_reads_passing_qc_filter total_reads_passing_qc_filter AIRR_CUSTOM FALSE | ||
read_length read_length AIRR_CUSTOM FALSE | ||
sequencing_facility sequencing_facility AIRR_CUSTOM FALSE | ||
sequencing_run_id sequencing_run_id AIRR_CUSTOM FALSE | ||
sequencing_run_date sequencing_run_date AIRR_CUSTOM FALSE | ||
sequencing_kit sequencing_kit AIRR_CUSTOM FALSE |
File renamed without changes.
File renamed without changes.
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
#! /usr/bin/env python | ||
|
||
import sys | ||
from collections import Counter | ||
|
||
import yaml | ||
import csv | ||
from deepdiff import DeepDiff | ||
|
||
# Maps each 'MiAIRR data set / subset' label from the TSV to the name of the
# object in specs/definitions.yaml that it is compared against.
object_map = {
    '1 / study': 'MiAIRR_Study',
    '1 / subject': 'MiAIRR_Subject',
    '1 / diag. & intervent.': 'MiAIRR_Diagnosis',
    '2 / sample': 'MiAIRR_Sample',
    '3 / process (cell)': 'MiAIRR_CellProcessing',
    '3 / process (nucl. acid)': 'MiAIRR_NucleicAcidProcessing',
    '5 / process (comput.)': 'MiAIRR_SoftwareProcessing',
    '6 / data (proc. seq.)': 'MiAIRR_Rearrangement',
}
|
||
# Load every data row of the MiAIRR element table as a dict keyed by the
# TSV header columns.
with open('AIRR_Minimal_Standard_Data_Elements.tsv', 'r') as ip:
    dictReader = csv.DictReader(ip, dialect='excel-tab')
    miairr_elements = list(dictReader)
|
||
# Collect the stripped seventh tab-separated column of every data row
# (presumably the field name column -- confirm against the TSV header).
with open('AIRR_Minimal_Standard_Data_Elements.tsv', 'r') as ip:
    # header line present
    header_line = next(ip)
    assert header_line.split()[0] == 'MiAIRR'

    split_rows = (row.split('\t') for row in ip)
    table = [cells[6].strip() for cells in split_rows]

# handle the exceptional 4 / data line: exactly one row has an empty value
# in that column, and it is dropped before the checks run
assert table.count('') == 1
table.remove('')
|
||
# Parse the schema definitions. yaml.safe_load is used because calling
# yaml.load() without an explicit Loader is deprecated since PyYAML 5.1 and
# raises TypeError in PyYAML 6; the safe loader also refuses to construct
# arbitrary Python objects from the document.
with open('specs/definitions.yaml', 'r') as ip:
    definitions = yaml.safe_load(ip)

# Property names of every object whose discriminator is 'MiAIRR'.
# The discriminator is tested before 'properties' is read so that objects
# without a 'properties' key cannot raise KeyError.
properties = [prop
              for obj in definitions.values()
              if obj.get('discriminator') == 'MiAIRR'
              for prop in obj['properties']]
|
||
# Set to True by any check below that finds an inconsistency.
failed = False

# check for uniqueness of fields in AIRR_Minimal_Standard_Data_Elements.tsv
field_counts = Counter(table)
if len(field_counts) != len(table):
    print('Duplicate entries found in AIRR_Minimal_Standard_Data_Elements.tsv', file=sys.stderr)
    repeated = ((entry, seen) for entry, seen in field_counts.items() if seen > 1)
    for entry, seen in repeated:
        print(f'{entry:30} found {seen} times in tsv when it should be unique\n', file=sys.stderr)
    failed = True
|
||
# check for differences in fields between specs/definitions.yaml and
# AIRR_Minimal_Standard_Data_Elements.tsv
for subset_label, object_name in object_map.items():
    # field names the TSV lists for this data set / subset
    tsv_fields = {row['AIRR Formats WG field name']
                  for row in miairr_elements
                  if row['MiAIRR data set / subset'] == subset_label}

    spec = definitions.get(object_name)
    if not spec:
        print(f'{object_name} not found in definitions.yaml.\n', file=sys.stderr)
        failed = True
        continue

    yaml_fields = set(spec['properties'])
    if tsv_fields == yaml_fields:
        continue

    # report the mismatch in both directions
    print(f'{object_name} does not match TSV', file=sys.stderr)
    for field in yaml_fields - tsv_fields:
        print(f'{field:30} is found in yaml but not tsv for {object_name}', file=sys.stderr)
    for field in tsv_fields - yaml_fields:
        print(f'{field:30} is found in tsv but not yaml for {object_name}', file=sys.stderr)
    failed = True
|
||
# check that MiAIRR object definitions contained
# within AIRR definition
for miairr_name, miairr_spec in definitions.items():
    if miairr_spec.get('discriminator') != 'MiAIRR':
        continue

    # the counterpart object is named by the part after the first underscore
    airr_name = miairr_name.split('_')[1]
    if not definitions.get(airr_name):
        print(f'{airr_name} corresponding to {miairr_name} not found in definitions.yaml', file=sys.stderr)
        failed = True
        continue

    for prop, miairr_prop in miairr_spec['properties'].items():
        airr_props = definitions[airr_name]['properties']
        if not airr_props.get(prop):
            print(f'{prop} in {miairr_name} object is not in {airr_name} object.', file=sys.stderr)
            failed = True
            continue

        # both objects must define the property identically
        ddiff = DeepDiff(miairr_prop, airr_props[prop], ignore_order=True)
        if ddiff:
            print(f'{prop} in {miairr_name} object is not the same object in {airr_name}.', file=sys.stderr)
            print(ddiff, file=sys.stderr)
            failed = True
|
||
# check consistency with NCBI XML definitions, per @BusseChristian's pseudocode
# in https://github.com/airr-community/airr-standards/issues/20
import pandas as pd

# MiAIRR identifiers (column index 6) of the rows whose first column is one
# of the subsets compared against the BioSample template
miairr_table = pd.read_csv('AIRR_Minimal_Standard_Data_Elements.tsv', sep='\t', header=0, index_col=None)
biosample_subsets = ["1 / subject", "1 / diag. & intervent.", "2 / sample", "3 / process (cell)"]
miairr_biosample_rows = miairr_table.iloc[:, 0].isin(biosample_subsets)
miairr_identifiers = set(miairr_table[miairr_biosample_rows].iloc[:, 6])
miairr_identifiers.add('study_id')  # manually add

# translate MiAIRR names to BioSample attribute names; names without a
# mapping entry are kept unchanged
with open('NCBI_implementation/mapping_MiAIRR_BioSample.tsv', 'r') as ip:
    miairr_mapping = {row['AIRR Formats WG field name']: row['NCBI BioSample attribute']
                      for row in csv.DictReader(ip, dialect='excel-tab')}
mapped_identifiers = {miairr_mapping.get(name, name) for name in miairr_identifiers}

# template column headers, with any leading '*' characters stripped
ncbi_biosample = pd.read_excel('NCBI_implementation/templates_XLS/AIRR_BioSample_v1.0.xls', skiprows=13)
ncbi_identifiers = {column.lstrip('*') for column in ncbi_biosample.columns}

if mapped_identifiers != ncbi_identifiers:
    print('AIRR_Minimal_Standard_Data_Elements.tsv does not match AIRR_BioSample_v1.0.xls', file=sys.stderr)
    for field in mapped_identifiers - ncbi_identifiers:
        print(f'{field:30} is found in MiAIRR table tsv but not in NCBI Biosample template xls', file=sys.stderr)
    for field in ncbi_identifiers - mapped_identifiers:
        print(f'{field:30} is found in NCBI Biosample template xls but not in MiAIRR table tsv', file=sys.stderr)
    failed = True
|
||
# Exit with status 1 if any of the checks above flagged an inconsistency.
if failed:
    sys.stderr.write('consistency checks failed\n')
    raise SystemExit(1)
Oops, something went wrong.