diff --git a/lang/python/tests/data/good_combined_airr.yaml b/lang/python/tests/data/good_combined_airr.yaml index 7b9fb4390..49a6ecae8 100644 --- a/lang/python/tests/data/good_combined_airr.yaml +++ b/lang/python/tests/data/good_combined_airr.yaml @@ -2,41 +2,27 @@ Repertoire: - repertoire_id: 1841923116114776551-242ac11c-0001-012 study: study_id: PRJNA300878 - study_title: Homo sapiens B and T cell repertoire - MZ twins + study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_type: - id: - label: - study_description: The adaptive immune system's capability to protect the body - requires a highly diverse lymphocyte antigen receptor repertoire. However, the - influence of individual genetic and epigenetic differences on these repertoires - is not typically measured. By leveraging the unique characteristics of B, CD4+ - T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified - the impact of heritable factors on both the V(D)J recombination process and - thymic selection in the case of T cell receptors, and show that the repertoires - of both naive and antigen experienced cells are subject to biases resulting - from differences in recombination. We show that biases in V(D)J usage, as well - as biased N/P additions, contribute to significant variation in the CDR3 region. - Moreover, we show that the relative usage of V and J gene segments is chromosomally - biased, with approximately 1.5 times as many rearrangements originating from - a single chromosome. These data refine our understanding of the heritable mechanisms - affecting the repertoire, and show that biases are evident on a chromosome-wide - level. - inclusion_exclusion_criteria: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + inclusion_exclusion_criteria: null contributors: - contributor_id: "1" name: "Florian Rubelt" orcid_id: - id: - label: + id: null + label: null affiliation: id: "ROR:00f54p054" label: "Stanford University" affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "investigation" - degree: + degree: null - role: "data curation" - degree: + degree: null - contributor_id: "2" name: "Mark M. Davis" orcid_id: @@ -48,106 +34,73 @@ Repertoire: affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "supervision" - degree: + degree: null pub_ids: ["PMID:27005435"] - grants: + grants: null keywords_study: - - contains_ig - - contains_tr + - "contains_ig" + - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: id: UO_0000036 label: year - age_event: + age_event: null ancestry_population: - id: - label: + id: null + label: null location_birth: - id: - label: - ethnicity: - race: - strain_name: + id: null + label: null + ethnicity: null + race: null + strain_name: null linked_subjects: TW01B link_type: twin diagnosis: - - study_group_description: + - study_group_description: null disease_diagnosis: - id: - label: - disease_length: - disease_stage: - prior_therapies: - immunogen: - intervention: - medical_history: - # genotype: - # receptor_genotype_set: - # receptor_genotype_set_id: '1' - # genotype_class_list: - # - receptor_genotype_id: '1' - # locus: IGH - # documented_alleles: - # - label: IGHV1-69*01 - # germline_set_ref: IMGT:Homo sapiens:2022.1.31 - # phasing: 1 - # - label: IGHV1-69*02 - # germline_set_ref: IMGT:Homo sapiens:2022.1.31 - # phasing: 2 - # undocumented_alleles: - # - allele_name: IGHD3-1*01_S1234 - # sequence: agtagtagtagt - # phasing: 1 - # deleted_genes: - # - label: IGHV3-30-3 - # germline_set_ref: IMGT:Homo sapiens:2022.1.31 - # phasing: 1 - # inference_process: repertoire_sequencing - # mhc_genotype_set: - # mhc_genotype_set_id: "this is a unique identifier" - # mhc_genotype_list: - # - mhc_genotype_id: unique - # mhc_class: MHC-I - # mhc_genotyping_method: pcr_low_resolution - # mhc_alleles: - # - allele_designation: "01:01" - # gene: - # id: "MRO-0000046" - # label: "HLA-A" - # reference_set_ref: blah + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + sample: - sample_id: TW01A_B_naive - sample_processing_id: - sample_type: peripheral venous puncture + sample_processing_id: null + sample_type: "peripheral venous puncture" tissue: - id: UBERON_0000178 - label: blood - tissue_processing: Ficoll gradient + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" cell_subset: - id: CL_0000788 - label: naive B cell - cell_phenotype: expression of CD20 and the absence of CD27 + id: "CL_0000788" + label: "naive B cell" + cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" single_cell: false cell_isolation: FACS template_class: RNA pcr_target: - pcr_target_locus: IGH - forward_pcr_primer_target_location: - reverse_pcr_primer_target_location: - sequencing_platform: Illumina MiSeq + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905656 + sequencing_data_id: SRA:SRR2905656 file_type: fastq filename: SRR2905656_R1.fastq.gz read_direction: forward @@ -155,87 +108,76 @@ Repertoire: paired_filename: SRR2905656_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 - anatomic_site: - disease_state_sample: - collection_time_point_relative: + index_filename: SRR2905656_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null collection_time_point_relative_unit: - id: - label: - collection_time_point_reference: + id: null + label: null + collection_time_point_reference: null collection_location: - id: - label: - biomaterial_provider: - cell_number: - cells_per_reaction: + id: null + label: null + biomaterial_provider: null + cell_number: null + cells_per_reaction: null cell_storage: false - cell_quality: - cell_processing_protocol: - template_quality: - template_amount: + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null template_amount_unit: - id: - label: - library_generation_method: RT(oligo-dT)+PCR - library_generation_protocol: - library_generation_kit_version: - complete_sequences: partial - physical_linkage: none - sequencing_run_id: - total_reads_passing_qc_filter: - sequencing_facility: - sequencing_run_date: - sequencing_kit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null data_processing: - data_processing_id: 3059369183532618216-242ac11b-0001-007 primary_annotation: true - software_versions: - paired_reads_assembly: - quality_thresholds: - primer_match_cutoffs: - collapsing_method: - data_processing_protocols: - data_processing_files: - germline_database: + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + - repertoire_id: 1602908186092376551-242ac11c-0001-012 study: study_id: PRJNA300878 - study_title: Homo sapiens B and T cell repertoire - MZ twins + study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_type: - id: - label: - study_description: The adaptive immune system's capability to protect the body - requires a highly diverse lymphocyte antigen receptor repertoire. However, the - influence of individual genetic and epigenetic differences on these repertoires - is not typically measured. By leveraging the unique characteristics of B, CD4+ - T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified - the impact of heritable factors on both the V(D)J recombination process and - thymic selection in the case of T cell receptors, and show that the repertoires - of both naive and antigen experienced cells are subject to biases resulting - from differences in recombination. We show that biases in V(D)J usage, as well - as biased N/P additions, contribute to significant variation in the CDR3 region. - Moreover, we show that the relative usage of V and J gene segments is chromosomally - biased, with approximately 1.5 times as many rearrangements originating from - a single chromosome. These data refine our understanding of the heritable mechanisms - affecting the repertoire, and show that biases are evident on a chromosome-wide - level. - inclusion_exclusion_criteria: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + inclusion_exclusion_criteria: null contributors: - contributor_id: "1" name: "Florian Rubelt" orcid_id: - id: - label: + id: null + label: null affiliation: id: "ROR:00f54p054" label: "Stanford University" affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "investigation" - degree: + degree: null - role: "data curation" - degree: + degree: null - contributor_id: "2" name: "Mark M. Davis" orcid_id: @@ -247,72 +189,73 @@ Repertoire: affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "supervision" - degree: + degree: null pub_ids: ["PMID:27005435"] - grants: + grants: null keywords_study: - - contains_ig - - contains_tr + - "contains_ig" + - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: id: UO_0000036 label: year - age_event: + age_event: null ancestry_population: - id: - label: + id: null + label: null location_birth: - id: - label: - ethnicity: - race: - strain_name: + id: null + label: null + ethnicity: null + race: null + strain_name: null linked_subjects: TW01B link_type: twin diagnosis: - - study_group_description: + - study_group_description: null disease_diagnosis: - id: - label: - disease_length: - disease_stage: - prior_therapies: - immunogen: - intervention: - medical_history: + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + sample: - sample_id: TW01A_B_memory - sample_processing_id: - sample_type: peripheral venous puncture + sample_processing_id: null + sample_type: "peripheral venous puncture" tissue: - id: UBERON_0000178 - label: blood - tissue_processing: Ficoll gradient + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" cell_subset: - id: CL_0000787 - label: memory B cell - cell_phenotype: expression of CD20 and CD27 + id: "CL_0000787" + label: "memory B cell" + cell_phenotype: "expression of CD20 and CD27" cell_species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" single_cell: false cell_isolation: FACS template_class: RNA pcr_target: - pcr_target_locus: IGH - forward_pcr_primer_target_location: - reverse_pcr_primer_target_location: - sequencing_platform: Illumina MiSeq + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905655 + sequencing_data_id: SRA:SRR2905655 file_type: fastq filename: SRR2905655_R1.fastq.gz read_direction: forward @@ -320,87 +263,76 @@ Repertoire: paired_filename: SRR2905655_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 - anatomic_site: - disease_state_sample: - collection_time_point_relative: + index_filename: SRR2905655_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null collection_time_point_relative_unit: - id: - label: - collection_time_point_reference: + id: null + label: null + collection_time_point_reference: null collection_location: - id: - label: - biomaterial_provider: - cell_number: - cells_per_reaction: + id: null + label: null + biomaterial_provider: null + cell_number: null + cells_per_reaction: null cell_storage: false - cell_quality: - cell_processing_protocol: - template_quality: - template_amount: + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null template_amount_unit: - id: - label: - library_generation_method: RT(oligo-dT)+PCR - library_generation_protocol: - library_generation_kit_version: - complete_sequences: partial - physical_linkage: none - sequencing_run_id: - total_reads_passing_qc_filter: - sequencing_facility: - sequencing_run_date: - sequencing_kit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null data_processing: - data_processing_id: 3059369183532618216-242ac11b-0001-007 primary_annotation: true - software_versions: - paired_reads_assembly: - quality_thresholds: - primer_match_cutoffs: - collapsing_method: - data_processing_protocols: - data_processing_files: - germline_database: + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + - repertoire_id: 2366080924918616551-242ac11c-0001-012 study: study_id: PRJNA300878 - study_title: Homo sapiens B and T cell repertoire - MZ twins + study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_type: - id: - label: - study_description: The adaptive immune system's capability to protect the body - requires a highly diverse lymphocyte antigen receptor repertoire. However, the - influence of individual genetic and epigenetic differences on these repertoires - is not typically measured. By leveraging the unique characteristics of B, CD4+ - T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified - the impact of heritable factors on both the V(D)J recombination process and - thymic selection in the case of T cell receptors, and show that the repertoires - of both naive and antigen experienced cells are subject to biases resulting - from differences in recombination. We show that biases in V(D)J usage, as well - as biased N/P additions, contribute to significant variation in the CDR3 region. - Moreover, we show that the relative usage of V and J gene segments is chromosomally - biased, with approximately 1.5 times as many rearrangements originating from - a single chromosome. These data refine our understanding of the heritable mechanisms - affecting the repertoire, and show that biases are evident on a chromosome-wide - level. - inclusion_exclusion_criteria: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + inclusion_exclusion_criteria: null contributors: - contributor_id: "1" name: "Florian Rubelt" orcid_id: - id: - label: + id: null + label: null affiliation: id: "ROR:00f54p054" label: "Stanford University" affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "investigation" - degree: + degree: null - role: "data curation" - degree: + degree: null - contributor_id: "2" name: "Mark M. Davis" orcid_id: @@ -412,72 +344,73 @@ Repertoire: affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" contributions: - role: "supervision" - degree: + degree: null pub_ids: ["PMID:27005435"] - grants: + grants: null keywords_study: - - contains_ig - - contains_tr + - "contains_ig" + - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: id: UO_0000036 label: year - age_event: + age_event: null ancestry_population: - id: - label: + id: null + label: null location_birth: - id: - label: - ethnicity: - race: - strain_name: + id: null + label: null + ethnicity: null + race: null + strain_name: null linked_subjects: TW01B link_type: twin diagnosis: - - study_group_description: + - study_group_description: null disease_diagnosis: - id: - label: - disease_length: - disease_stage: - prior_therapies: - immunogen: - intervention: - medical_history: + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + sample: - sample_id: TW01A_T_naive_CD4 - sample_processing_id: - sample_type: peripheral venous puncture + sample_processing_id: null + sample_type: "peripheral venous puncture" tissue: - id: UBERON_0000178 - label: blood - tissue_processing: Ficoll gradient + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" cell_subset: - id: CL_0000895 - label: naive thymus-derived CD4-positive, alpha-beta T cell - cell_phenotype: expression of CD8 and absence of CD4 and CD45RO + id: "CL_0000895" + label: "naive thymus-derived CD4-positive, alpha-beta T cell" + cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: NCBITaxon_9606 - label: Homo sapiens + id: "NCBITaxon_9606" + label: "Homo sapiens" single_cell: false cell_isolation: FACS template_class: RNA pcr_target: - pcr_target_locus: TRB - forward_pcr_primer_target_location: - reverse_pcr_primer_target_location: - sequencing_platform: Illumina MiSeq + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905659 + sequencing_data_id: SRA:SRR2905659 file_type: fastq filename: SRR2905659_R1.fastq.gz read_direction: forward @@ -485,48 +418,50 @@ Repertoire: paired_filename: SRR2905659_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 - anatomic_site: - disease_state_sample: - collection_time_point_relative: + index_filename: SRR2905659_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null collection_time_point_relative_unit: - id: - label: - collection_time_point_reference: + id: null + label: null + collection_time_point_reference: null collection_location: - id: - label: - biomaterial_provider: - cell_number: - cells_per_reaction: + id: null + label: null + biomaterial_provider: null + cell_number: null + cells_per_reaction: null cell_storage: false - cell_quality: - cell_processing_protocol: - template_quality: - template_amount: + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null template_amount_unit: - id: - label: - library_generation_method: RT(oligo-dT)+PCR - library_generation_protocol: - library_generation_kit_version: - complete_sequences: partial - physical_linkage: none - sequencing_run_id: - total_reads_passing_qc_filter: - sequencing_facility: - sequencing_run_date: - sequencing_kit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null data_processing: - data_processing_id: 651223970338378216-242ac11b-0001-007 primary_annotation: true - software_versions: - paired_reads_assembly: - quality_thresholds: - primer_match_cutoffs: - collapsing_method: - data_processing_protocols: - data_processing_files: - germline_database: + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 # GermlineSet: