From 5012dd22ab08a7ea0c258d9e0b144eb843f97cd8 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Wed, 21 Feb 2024 17:35:34 -0600 Subject: [PATCH 01/15] good ole make, great for simple tasks --- Makefile | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Makefile diff --git a/Makefile b/Makefile new file mode 100644 index 000000000..151bf2dd4 --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +# helper commands for keeping the language directories in sync + +# note: "help" MUST be the first target in the file, +# when the user types "make" they should get help info +help: + @echo "" + @echo "Helper commands for AIRR Standards repository" + @echo "" + @echo "make gen-v2 -- Generate OpenAPI V2 spec from the V3 spec" + @echo "make docs -- Build documentation" + @echo "make lang-copy -- Copy spec files to language directories" + @echo "make data-copy -- Copy test data files to language directories" + @echo "make checks -- Run consistency checks on spec files" + @echo "make tests -- Run all language test suites" + @echo "make python-tests -- Run Python test suite" + @echo "make r-tests -- Run R test suite" + @echo "make js-tests -- Run Javascript test suite" + @echo "" + +gen-v2: + @echo "Not implemented" + +lang-copy: + @echo "Copying specs to language directories" + cp specs/airr-schema.yaml lang/python/airr/specs + cp specs/airr-schema-openapi3.yaml lang/python/airr/specs + cp specs/airr-schema.yaml lang/R/inst/extdata + cp specs/airr-schema-openapi3.yaml lang/R/inst/extdata +# cp specs/airr-schema.yaml lang/js/ +# cp specs/airr-schema-openapi3.yaml lang/js/ + +data-copy: + @echo "Not implemented" + +checks: + @echo "Running consistency checks on spec files" + python3 tests/check-consistency-formats.py + +tests: python-tests r-tests js-tests + +python-tests: + @echo "Running Python test suite" + cd lang/python; python3 -m unittest discover + +r-tests: + @echo "Running R test suite" + cd lang/R; R -e 
"library(devtools); test()" + +js-tests: + @echo "Running Javascript test suite" From 8dd05cb6cb954028131bc73998d65eb2e221be9f Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Wed, 21 Feb 2024 18:02:24 -0600 Subject: [PATCH 02/15] openapi v3 spec in the lang directories, add consistency checks --- lang/R/inst/extdata/airr-schema-openapi3.yaml | 5091 +++++++++++++++++ .../airr/specs/airr-schema-openapi3.yaml | 5091 +++++++++++++++++ tests/check-consistency-formats.py | 41 +- 3 files changed, 10221 insertions(+), 2 deletions(-) create mode 100644 lang/R/inst/extdata/airr-schema-openapi3.yaml create mode 100644 lang/python/airr/specs/airr-schema-openapi3.yaml diff --git a/lang/R/inst/extdata/airr-schema-openapi3.yaml b/lang/R/inst/extdata/airr-schema-openapi3.yaml new file mode 100644 index 000000000..bba3a45d8 --- /dev/null +++ b/lang/R/inst/extdata/airr-schema-openapi3.yaml @@ -0,0 +1,5091 @@ +# +# Schema definitions for AIRR standards objects +# +Info: + title: AIRR Schema + description: Schema definitions for AIRR standards objects + version: 1.4 + contact: + name: AIRR Community + url: https://github.com/airr-community + license: + name: Creative Commons Attribution 4.0 International + url: https://creativecommons.org/licenses/by/4.0/ + + +# Properties that are based upon an ontology use this +# standard schema definition +Ontology: + type: object + properties: + id: + type: string + nullable: true + description: CURIE of the concept, encoding the ontology and the local ID + label: + type: string + nullable: true + description: Label of the concept in the respective ontology + +# Map to expand CURIE prefixes to full IRIs +CURIEMap: + ABREG: + type: identifier + default: + map: ABREG + map: + ABREG: + iri_prefix: "http://antibodyregistry.org/AB_" + CHEBI: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/CHEBI_" + CL: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + 
iri_prefix: "http://purl.obolibrary.org/obo/CL_" + DOI: + type: identifier + default: + map: DOI + map: + DOI: + iri_prefix: "https://doi.org/" + DOID: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/DOID_" + ENA: + type: identifier + default: + map: ENA + map: + ENA: + iri_prefix: "https://www.ebi.ac.uk/ena/browser/view/" + ENSG: + type: identifier + default: + map: ENSG + map: + ENSG: + iri_prefix: "https://www.ensembl.org/Multi/Search/Results?q=" + GAZ: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/GAZ_" + IEDB_RECEPTOR: + type: identifier + default: + map: IEDB + provider: IEDB + map: + IEDB: + iri_prefix: "https://www.iedb.org/receptor/" + MRO: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/MRO_" + NCBITAXON: + type: taxonomy + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/NCBITaxon_" + BioPortal: + iri_prefix: "http://purl.bioontology.org/ontology/NCBITAXON/" + NCIT: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/NCIT_" + ORCID: + type: catalog + default: + map: ORCID + provider: ORCID + map: + ORCID: + iri_prefix: "https://orcid.org/" + ROR: + type: catalog + default: + map: ROR + provider: ROR + map: + ROR: + iri_prefix: "https://ror.org/" + SRA: + type: identifier + default: + map: SRA + map: + SRA: + iri_prefix: "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=" + UBERON: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/UBERON_" + UNIPROT: + type: identifier + default: + map: UNIPROT + map: + UniProt: + iri_prefix: "http://purl.uniprot.org/uniprot/" + UO: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/UO_" + 
+InformationProvider: + provider: + ENA: + request: + url: "{iri}" + response: text/html + IEDB: + request: + url: "https://query-api.iedb.org/tcr_search?receptor_group_id=eq.{local_id}" + response: application/json + OLS: + request: + url: "https://www.ebi.ac.uk/ols/api/ontologies/{ontology_id}/terms?iri={iri}" + response: application/json + Ontobee: + request: + url: "http://www.ontobee.org/ontology/rdf/{ontology_id}?iri={iri}" + response: application/rdf+xml + ORCID: + request: + url: "https://pub.orcid.org/v2.1/{local_id}" + header: + Accept: application/json + response: application/json + ROR: + request: + url: "https://api.ror.org/organizations/{iri}" + response: application/json + SRA: + request: + url: "{iri}" + response: text/html + parameter: + CHEBI: + Ontobee: + ontology_id: CHEBI + OLS: + ontology_id: chebi + CL: + Ontobee: + ontology_id: CL + OLS: + ontology_id: cl + DOID: + Ontobee: + ontology_id: DOID + OLS: + ontology_id: doid + GAZ: + Ontobee: + ontology_id: GAZ + OLS: + ontology_id: gaz + MRO: + Ontobee: + ontology_id: MRO + OLS: + ontology_id: mro + NCBITAXON: + Ontobee: + ontology_id: NCBITaxon + OLS: + ontology_id: ncbitaxon + BioPortal: + ontology_id: NCBITAXON + NCIT: + Ontobee: + ontology_id: NCIT + OLS: + ontology_id: ncit + UBERON: + Ontobee: + ontology_id: UBERON + OLS: + ontology_id: uberon + UO: + Ontobee: + ontology_id: UO + OLS: + ontology_id: uo + +# AIRR specification extensions +# +# The schema definitions for AIRR standards objects is extended to +# provide a number of AIRR specific attributes. This schema definition +# specifies the structure, property names and data types. These +# attributes are attached to an AIRR field with the x-airr property. + +Attributes: + type: object + properties: + miairr: + type: string + description: MiAIRR requirement level. 
+ enum: + - essential + - important + - defined + default: defined + identifier: + type: boolean + description: > + True if the field is an identifier required to link metadata and/or individual + sequence records across objects in the complete AIRR Data Model and ADC API. + default: false + adc-query-support: + type: boolean + description: > + True if an ADC API implementation must support queries on the field. + If false, query support for the field in ADC API implementations is optional. + default: false + adc-api-optional: + type: boolean + description: > + If false, repositories must implement these fields both for queries and query repsonse. + Only applies to fields in the ADC API spec that are extensions to the AIRR Standard, + targeted at "convenience query fields" that make queries against repositories more + efficient than if queries were limited to AIRR fields only. + If true, repositories can choose to support the field or not. + default: false + deprecated: + type: boolean + description: True if the field has been deprecated from the schema. + default: false + deprecated-description: + type: string + description: Information regarding the deprecation of the field. + deprecated-replaced-by: + type: array + items: + type: string + description: The deprecated field is replaced by this list of fields. + set: + type: integer + description: MiAIRR set + subset: + type: string + description: MiAIRR subset + name: + type: string + description: MiAIRR name + format: + type: string + description: Field format. If null then assume the full range of the field data type + enum: + - ontology + - controlled_vocabulary + - physical_quantity + - CURIE + ontology: + type: object + description: Ontology definition for field + properties: + draft: + type: boolean + description: Indicates if ontology definition is a draft + top_node: + type: object + description: > + Concept to use as top node for ontology. 
Note that this must have the same CURIE namespace + as the actually annotated concept. + properties: + id: + type: string + description: CURIE for the top node term + label: + type: string + description: Ontology name for the top node term + +# AIRR Data File +# +# A JSON data file that holds Repertoire metadata, data processing +# analysis objects, or any object in the AIRR Data Model. +# +# It is presumed that the objects gathered together in an AIRR Data File are related +# or relevant to each other, e.g. part of the same study; thus, the ID fields can be +# internally resolved unless the ID contains an external PID. This implies that AIRR +# Data Files cannot be merged simply by concatenating arrays; any merge program +# would need to manage duplicate or conflicting ID values. +# +# While the properties in an AIRR Data File are not required, if one is provided then +# the value should not be null. + +DataFile: + type: object + properties: + Info: + nullable: false + $ref: '#/InfoObject' + Repertoire: + type: array + nullable: false + description: List of repertoires + items: + $ref: '#/Repertoire' + RepertoireGroup: + type: array + nullable: false + description: List of repertoire collections + items: + $ref: '#/RepertoireGroup' + Rearrangement: + type: array + nullable: false + description: List of rearrangement records + items: + $ref: '#/Rearrangement' + Cell: + type: array + nullable: false + description: List of cells + items: + $ref: '#/Cell' + Clone: + type: array + nullable: false + description: List of clones + items: + $ref: '#/Clone' + GermlineSet: + type: array + nullable: false + description: List of germline sets + items: + $ref: '#/GermlineSet' + GenotypeSet: + type: array + nullable: false + description: List of genotype sets + items: + $ref: '#/GenotypeSet' + +# AIRR Info object, should be similar to openapi +# should we point to an openapi schema? +InfoObject: + type: object + description: Provides information about data and API responses. 
+ required: + - title + - version + properties: + title: + type: string + nullable: false + version: + type: string + nullable: false + description: + type: string + nullable: true + contact: + type: object + nullable: true + properties: + name: + type: string + nullable: true + url: + type: string + nullable: true + email: + type: string + nullable: true + license: + type: object + nullable: true + required: + - name + properties: + name: + type: string + nullable: false + url: + type: string + nullable: true + +# A time point +TimePoint: + description: Time point at which an observation or other action was performed. + type: object + properties: + label: + type: string + nullable: true + description: Informative label for the time point + example: Pre-operative sampling of cancer tissue + x-airr: + adc-query-support: true + value: + type: number + nullable: true + description: Value of the time point + example: -5.0 + x-airr: + adc-query-support: true + unit: + $ref: '#/Ontology' + nullable: true + description: Unit of the time point + title: Unit of immunization schedule + example: + id: UO:0000033 + label: day + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + +# +# General objects +# + +# An individual +Acknowledgement: + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + description: unique identifier of this Acknowledgement within the file + x-airr: + identifier: true + miairr: important + nullable: true + name: + type: string + nullable: true + description: Full name of individual + institution_name: + type: string + nullable: true + description: Individual's department and institution name + orcid_id: + type: string + nullable: true + description: Individual's ORCID identifier + +# +# Germline gene schema +# + +# 
Rearranged and genomic germline sequences +RearrangedSequence: + type: object + description: > + Details of a directly observed rearranged sequence or an inference from rearranged sequences + contributing support for a gene or allele. + required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end + properties: + sequence_id: + type: string + nullable: true + description: > + Unique identifier of this RearrangedSequence within the file, typically generated by the repository + hosting the schema, for example from the underlying ID of the database record. + x-airr: + identifier: true + miairr: important + sequence: + type: string + nullable: false + x-airr: + miairr: essential + description: nucleotide sequence + derivation: + type: string + nullable: true + enum: + - DNA + - RNA + - null + description: The class of nucleic acid that was used as primary starting material + x-airr: + miairr: important + observation_type: + type: string + nullable: false + enum: + - direct_sequencing + - inference_from_repertoire + description: > + The type of observation from which this sequence was drawn, such as direct sequencing or + inference from repertoire sequencing data. 
+ x-airr: + miairr: essential + curation: + type: string + nullable: true + description: Curational notes on the sequence + repository_name: + type: string + nullable: true + x-airr: + miairr: defined + description: Name of the repository in which the sequence has been deposited + repository_ref: + type: string + nullable: true + x-airr: + miairr: defined + description: Queryable id or accession number of the sequence published by the repository + deposited_version: + type: string + nullable: true + x-airr: + miairr: defined + description: Version number of the sequence within the repository + sequence_start: + type: integer + nullable: false + x-airr: + miairr: essential + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + sequence_end: + type: integer + nullable: false + x-airr: + miairr: essential + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + +UnrearrangedSequence: + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: unique identifier of this UnrearrangedSequence within the file + sequence: + type: string + nullable: false + description: > + Sequence of interest described in this record. Typically, this will include gene and promoter region. 
+ x-airr: + miairr: essential + curation: + type: string + nullable: true + description: Curational notes on the sequence + repository_name: + type: string + nullable: true + x-airr: + miairr: defined + description: Name of the repository in which the assembly or contig is deposited + repository_ref: + type: string + nullable: true + x-airr: + miairr: defined + description: Queryable id or accession number of the sequence published by the repository + patch_no: + type: string + nullable: true + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + nullable: true + description: > + Sequence (from the assembly) of a window including the gene and preferably also the promoter region. + gff_start: + type: integer + nullable: true + description: > + Genomic co-ordinates of the start of the sequence of interest described in this record in + Ensemble GFF version 3. + gff_end: + type: integer + nullable: true + description: > + Genomic co-ordinates of the end of the sequence of interest described in this record in + Ensemble GFF version 3. + strand: + type: string + nullable: true + enum: + - + + - "-" + - null + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + nullable: true + description: > + Unique identifier of this SequenceDelineationV within the file. Typically, generated by the + repository hosting the record. 
+ x-airr: + identifier: true + miairr: important + + delineation_scheme: + type: string + nullable: true + x-airr: + miairr: important + description: Name of the delineation scheme + example: Chothia + unaligned_sequence: + type: string + nullable: true + x-airr: + miairr: important + description: entire V-sequence covered by this delineation + aligned_sequence: + type: string + nullable: true + description: > + Aligned sequence if this delineation provides an alignment. An aligned sequence should always be + provided for IMGT delineations. + fwr1_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR1 start co-ordinate in the 'unaligned sequence' field + fwr1_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR1 end co-ordinate in the 'unaligned sequence' field + cdr1_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR1 start co-ordinate in the 'unaligned sequence' field + cdr1_end: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR1 end co-ordinate in the 'unaligned sequence' field + fwr2_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR2 start co-ordinate in the 'unaligned sequence' field + fwr2_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR2 end co-ordinate in the 'unaligned sequence' field + cdr2_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR2 start co-ordinate in the 'unaligned sequence' field + cdr2_end: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR2 end co-ordinate in the 'unaligned sequence' field + fwr3_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR3 start co-ordinate in the 'unaligned sequence' field + fwr3_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR3 end co-ordinate in the 
'unaligned sequence' field + cdr3_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR3 start co-ordinate in the 'unaligned sequence' field + alignment_labels: + type: array + nullable: true + items: + type: string + description: > + One string for each codon in the aligned_sequence indicating the label of that codon according to + the numbering of the delineation scheme if it provides one. + +# Description of a putative or confirmed Ig receptor gene/allele +AlleleDescription: + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + Unique identifier of this AlleleDescription within the file. Typically, generated by the + repository hosting the record. 
+ allele_description_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Unique reference to the allele description, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:IGHV1-69*01.001 + maintainer: + type: string + nullable: true + x-airr: + miairr: defined + description: Maintainer of this sequence record + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + nullable: true + x-airr: + miairr: defined + description: Institution and full address of corresponding author + release_version: + type: integer + nullable: true + x-airr: + miairr: important + description: Version number of this record, updated whenever a revised version is published or released + release_date: + type: string + nullable: true + format: date-time + x-airr: + miairr: important + description: Date of this release + title: Release Date + example: "2021-02-02" + release_description: + type: string + nullable: true + x-airr: + miairr: important + description: Brief descriptive notes of the reason for this release and the changes embodied + label: + type: string + nullable: true + x-airr: + miairr: important + description: > + The accepted name for this gene or allele following the relevant nomenclature. + The value in this field should correspond to values in acceptable name fields of other schemas, + such as v_call, d_call, and j_call fields. + example: IGHV1-69*01 + sequence: + type: string + nullable: false + x-airr: + miairr: essential + description: > + Nucleotide sequence of the gene. This should cover the full length that is available, + including where possible RSS, and 5' UTR and lead-in for V-gene sequences. 
+ coding_sequence: + type: string + nullable: true + x-airr: + miairr: important + description: > + Nucleotide sequence of the core coding region, such as the coding region of a D-, J- or C- gene + or the coding region of a V-gene excluding the leader. + aliases: + type: array + nullable: true + items: + type: string + description: Alternative names for this sequence + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + miairr: essential + chromosome: + type: integer + nullable: true + description: chromosome on which the gene is located + sequence_type: + type: string + nullable: false + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + x-airr: + miairr: essential + functional: + type: boolean + nullable: true + x-airr: + miairr: important + description: True if the gene is functional, false if it is a pseudogene + inference_type: + type: string + nullable: true + enum: + - genomic_and_rearranged + - genomic_only + - rearranged_only + - null + description: Type of inference(s) from which this gene sequence was inferred + x-airr: + miairr: important + species: + $ref: '#/Ontology' + nullable: false + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: essential + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/c + species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + - null + status: + type: string + nullable: true + enum: + - active + - draft + - retired + - withdrawn + - null + description: Status of record, assumed active if the field is not present + subgroup_designation: + type: string + nullable: true + description: Identifier of the gene subgroup or clade, as (and if) defined 
+ gene_designation: + type: string + nullable: true + description: Gene number or other identifier, as (and if) defined + allele_designation: + type: string + nullable: true + description: Allele number or other identifier, as (and if) defined + allele_similarity_cluster_designation: + type: string + nullable: true + description: ID of the similarity cluster used in this germline set, if designated + allele_similarity_cluster_member_id: + type: string + nullable: true + description: Membership ID of the allele within the similarity cluster, if a cluster is designated + j_codon_frame: + type: integer + nullable: true + enum: + - 1 + - 2 + - 3 + - null + description: > + Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. + Not used for V or D genes. '1' means the sequence is in-frame, '2' means that the first bp is + missing from the first codon, and '3' means that the first 2 bp are missing. + gene_start: + type: integer + nullable: true + description: > + Co-ordinate in the sequence field of the first nucleotide in the coding_sequence field. + x-airr: + miairr: important + gene_end: + type: integer + nullable: true + description: > + Co-ordinate in the sequence field of the last gene-coding nucleotide in the coding_sequence field. + x-airr: + miairr: important + utr_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 5 prime UTR (V-genes only). + utr_5_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the 5 prime UTR (V-genes only). + leader_1_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of L-PART1 (V-genes only). + leader_1_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of L-PART1 (V-genes only). 
+ leader_2_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of L-PART2 (V-genes only). + leader_2_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of L-PART2 (V-genes only). + v_rs_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the V recombination site (V-genes only). + v_rs_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the V recombination site (V-genes only). + d_rs_3_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only). + d_rs_3_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only). + d_rs_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only). + d_rs_5_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of 5 the prime D recombination site (D-genes only). + j_cdr3_end: + type: integer + nullable: true + description: > + In the case of a J-gene, the co-ordinate in the sequence field of the first nucelotide of the + conserved PHE or TRP (IMGT codon position 118). + j_rs_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of J recombination site (J-genes only). + j_rs_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of J recombination site (J-genes only). + j_donor_splice: + type: integer + nullable: true + description: Co-ordinate in the sequence field of the final 3' nucleotide of the J-REGION (J-genes only). 
+ v_gene_delineations: + type: array + nullable: true + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + nullable: true + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + nullable: true + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + nullable: true + items: + type: string + description: Gene symbols of any paralogs + curation: + type: string + nullable: true + description: > + Curational notes on the AlleleDescription. This can be used to give more extensive notes on the + decisions taken than are provided in the release_description. + curational_tags: + type: array + nullable: true + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + type: object + description: > + A germline object set bringing together multiple AlleleDescriptions from the same strain or species. + All genes in a GermlineSet should be from a single locus. + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + nullable: true + description: > + Unique identifier of the GermlineSet within this file. Typically, generated by the + repository hosting the record. 
+ x-airr: + identifier: true + miairr: important + author: + type: string + nullable: true + x-airr: + miairr: important + description: Corresponding author + lab_name: + type: string + nullable: true + x-airr: + miairr: important + description: Department of corresponding author + lab_address: + type: string + nullable: true + x-airr: + miairr: important + description: Institutional address of corresponding author + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + nullable: true + x-airr: + miairr: important + description: Version number of this record, allocated automatically + release_description: + type: string + nullable: true + x-airr: + miairr: important + description: Brief descriptive notes of the reason for this release and the changes embodied + release_date: + type: string + nullable: true + format: date-time + x-airr: + miairr: important + description: Date of this release + title: Release Date + example: "2021-02-02" + germline_set_name: + type: string + nullable: true + x-airr: + miairr: important + description: descriptive name of this germline set + germline_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + pub_ids: + type: array + items: + type: string + nullable: true + description: Publications describing the germline set + example: ["PMID:35720344"] + species: + $ref: '#/Ontology' + nullable: false + x-airr: + miairr: essential + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/c + 
species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + - null + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + miairr: essential + allele_descriptions: + type: array + nullable: true + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + x-airr: + miairr: important + curation: + type: string + nullable: true + description: > + Curational notes on the GermlineSet. This can be used to give more extensive notes on the + decisions taken than are provided in the release_description. + +# +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + A unique identifier for this Receptor Genotype Set, typically generated by the repository + hosting the schema, for example from the underlying ID of the database record. + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + nullable: true + items: + $ref: '#/Genotype' + +# Genotype of adaptive immune receptors +# This enumerates the alleles and gene deletions inferred in a single subject. 
+# Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +Genotype: + type: object + required: + - receptor_genotype_id + - locus + properties: + receptor_genotype_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + A unique identifier within the file for this Receptor Genotype, typically generated by the + repository hosting the schema, for example from the underlying ID of the database record. + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + description: Gene locus + example: IGH + x-airr: + adc-query-support: true + format: controlled_vocabulary + miairr: essential + documented_alleles: + type: array + nullable: true + description: List of alleles documented in reference set(s) + items: + $ref: '#/DocumentedAllele' + x-airr: + miairr: important + undocumented_alleles: + type: array + nullable: true + description: List of alleles inferred to be present and not documented in an identified GermlineSet + items: + $ref: '#/UndocumentedAllele' + x-airr: + adc-query-support: true + deleted_genes: + type: array + nullable: true + description: Array of genes identified as being deleted in this genotype + items: + $ref: '#/DeletedGene' + x-airr: + adc-query-support: true + inference_process: + type: string + nullable: true + enum: + - genomic_sequencing + - repertoire_sequencing + - null + description: Information on how the genotype was acquired. Controlled vocabulary. 
+ title: Genotype acquisition process + example: repertoire_sequencing + x-airr: + adc-query-support: true + format: controlled_vocabulary + +# Documented Allele +# This describes a 'known' allele found in a genotype +# It is 'known' in the sense that it is documented in a reference set + +DocumentedAllele: + type: object + required: + - label + - germline_set_ref + properties: + label: + type: string + nullable: true + x-airr: + miairr: important + description: The accepted name for this allele, taken from the GermlineSet + germline_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome. + +# Undocumented Allele +# This describes an 'undocumented' allele found in a genotype +# It is 'undocumented' in the sense that it was not found in reference sets consulted for the analysis + +UndocumentedAllele: + required: + - allele_name + - sequence + type: object + properties: + allele_name: + type: string + nullable: true + description: Allele name as allocated by the inference pipeline + x-airr: + miairr: important + sequence: + type: string + nullable: false + description: nt sequence of the allele, as provided by the inference pipeline + x-airr: + miairr: essential + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome.
+ +# Deleted Gene +# It is regarded as 'deleted' in the sense that it was not identified during inference of the genotype + +DeletedGene: + required: + - label + - germline_set_ref + type: object + properties: + label: + type: string + nullable: false + description: The accepted name for this gene, taken from the GermlineSet + x-airr: + miairr: essential + germline_set_ref: + type: string + nullable: true + description: GermlineSet from which it was taken (issuer/name/version) + x-airr: + miairr: important + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome. + + +# List of MHCGenotypes describing a subject's genotype +MHCGenotypeSet: + type: object + required: + - mhc_genotype_set_id + - mhc_genotype_list + properties: + mhc_genotype_set_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: A unique identifier for this MHCGenotypeSet + mhc_genotype_list: + description: List of MHCGenotypes included in this set + type: array + nullable: true + x-airr: + miairr: important + items: + $ref: '#/MHCGenotype' + +# Genotype of major histocompatibility complex (MHC) class I, class II and non-classical loci +MHCGenotype: + type: object + required: + - mhc_genotype_id + - mhc_class + - mhc_alleles + properties: + mhc_genotype_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: A unique identifier for this MHCGenotype, assumed to be unique in the context of the study + mhc_class: + type: string + nullable: false + enum: + - MHC-I + - MHC-II + - MHC-nonclassical + description: Class of MHC alleles described by the MHCGenotype + example: MHC-I + x-airr: + miairr: essential + adc-query-support: true + format: controlled_vocabulary + mhc_alleles: + type: array + nullable: true + description: List of MHC alleles of the indicated mhc_class identified in an individual + 
items: + $ref: '#/MHCAllele' + x-airr: + miairr: important + adc-query-support: true + mhc_genotyping_method: + type: string + nullable: true + description: > + Information on how the genotype was determined. The content of this field should come from a list of + recommended terms provided in the AIRR Schema documentation. + title: MHC genotyping method + example: pcr_low_resolution + x-airr: + adc-query-support: true + miairr: important + + +# Allele of an MHC gene +MHCAllele: + type: object + properties: + allele_designation: + type: string + nullable: true + x-airr: + miairr: important + description: > + The accepted designation of an allele, usually its gene symbol plus allele/sub-allele/etc + identifiers, if provided by the mhc_typing method + gene: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the described allele belongs + title: MHC gene + example: + id: MRO:0000046 + label: HLA-A + x-airr: + adc-query-support: false + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + miairr: important + reference_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Repository and list from which it was taken (issuer/name/version) + + +SubjectGenotype: + type: object + properties: + receptor_genotype_set: + nullable: true + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: + nullable: true + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
+ +# +# Repertoire metadata schema +# + +# The overall study with a globally unique study_id +Study: + type: object + required: + - study_id + - study_title + - study_type + - inclusion_exclusion_criteria + - grants + - collected_by + - lab_name + - lab_address + - submitted_by + - pub_ids + - keywords_study + properties: + study_id: + type: string + nullable: true + description: > + Unique ID assigned by study registry such as one of the International Nucleotide Sequence Database + Collaboration (INSDC) repositories. + title: Study ID + example: PRJNA001 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study ID + study_title: + type: string + nullable: true + description: Descriptive study title + title: Study title + example: Effects of sun light exposure of the Treg repertoire + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study title + study_type: + $ref: '#/Ontology' + nullable: true + description: Type of study design + title: Study type + example: + id: NCIT:C15197 + label: Case-Control Study + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study type + format: ontology + ontology: + draft: false + top_node: + id: NCIT:C63536 + label: Study + study_description: + type: string + nullable: true + description: Generic study description + title: Study description + example: Longer description + x-airr: + name: Study description + adc-query-support: true + inclusion_exclusion_criteria: + type: string + nullable: true + description: List of criteria for inclusion/exclusion for the study + title: Study inclusion/exclusion criteria + example: "Include: Clinical P. 
falciparum infection; Exclude: Seropositive for HIV" + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study inclusion/exclusion criteria + grants: + type: string + nullable: true + description: Funding agencies and grant numbers + title: Grant funding agency + example: NIH, award number R01GM987654 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Grant funding agency + study_contact: + type: string + nullable: true + description: > + Full contact information of the contact persons for this study. This should include an e-mail address + and a persistent identifier such as an ORCID ID. + title: Contact information (study) + example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + adc-query-support: true + name: Contact information (study) + collected_by: + type: string + nullable: true + description: > + Full contact information of the data collector, i.e. the person who is legally responsible for data + collection and release. This should include an e-mail address and a persistent identifier such as an + ORCID ID. + title: Contact information (data collection) + example: Dr. P.
Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Contact information (data collection) + lab_name: + type: string + nullable: true + description: Department of data collector + title: Lab name + example: Department for Planar Immunology + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Lab name + lab_address: + type: string + nullable: true + description: Institution and institutional address of data collector + title: Lab address + example: School of Medicine, Unseen University, Ankh-Morpork, Disk World + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Lab address + submitted_by: + type: string + nullable: true + description: > + Full contact information of the data depositor, i.e., the person submitting the data to a repository. + This should include an e-mail address and a persistent identifier such as an ORCID ID. This is + supposed to be a short-lived and technical role until the submission is released. + title: Contact information (data deposition) + example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Contact information (data deposition) + pub_ids: + type: array + items: + type: string + nullable: true + description: > + Array of publications describing the rationale and/or outcome of the study as an array of CURIE objects such as + a DOI or Pubmed ID. Where more than one publication is given, if there is a primary publication for the study it + should come first.
+ title: Relevant publications + example: ["PMID:29144493", "DOI:10.1038/ni.3873"] + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Relevant publications + keywords_study: + type: array + items: + type: string + enum: + - contains_ig + - contains_tr + - contains_paired_chain + - contains_schema_rearrangement + - contains_schema_clone + - contains_schema_cell + - contains_schema_receptor + - contains_schema_cellexpression + - contains_schema_receptorreactivity + nullable: true + description: > + Keywords describing properties of one or more data sets in a study. "contains_schema" keywords indicate that + the study contains data objects from the AIRR Schema of that type (Rearrangement, Clone, Cell, Receptor) while + the other keywords indicate that the study design considers the type of data indicated (e.g. it is possible to have + a study that "contains_paired_chain" but does not "contains_schema_cell"). + title: Keywords for study + example: + - contains_ig + - contains_schema_rearrangement + - contains_schema_clone + - contains_schema_cell + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Keywords for study + format: controlled_vocabulary + adc_publish_date: + type: string + format: date-time + nullable: true + description: > + Date the study was first published in the AIRR Data Commons. + title: ADC Publish Date + example: "2021-02-02" + x-airr: + adc-query-support: true + name: ADC Publish Date + adc_update_date: + type: string + format: date-time + nullable: true + description: > + Date the study data was updated in the AIRR Data Commons. 
+ title: ADC Update Date + example: "2021-02-02" + x-airr: + adc-query-support: true + name: ADC Update Date + +# 1-to-n relationship between a study and its subjects +# subject_id is unique within a study +Subject: + type: object + required: + - subject_id + - synthetic + - species + - sex + - age_min + - age_max + - age_unit + - age_event + - ancestry_population + - ethnicity + - race + - strain_name + - linked_subjects + - link_type + properties: + subject_id: + type: string + nullable: true + description: > + Subject ID assigned by submitter, unique within study. If possible, a persistent subject ID linked to + an INSDC or similar repository study should be used. + title: Subject ID + example: SUB856413 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Subject ID + synthetic: + type: boolean + nullable: false + description: TRUE for libraries in which the diversity has been synthetically generated (e.g. phage display) + title: Synthetic library + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: subject + name: Synthetic library + species: + $ref: '#/Ontology' + nullable: false + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: subject + name: Species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + organism: + $ref: '#/Ontology' + nullable: true + description: Binomial designation of subject's species + x-airr: + deprecated: true + deprecated-description: Field was renamed to species for clarity. 
+ deprecated-replaced-by: + - species + sex: + type: string + enum: + - male + - female + - pooled + - hermaphrodite + - intersex + - null + nullable: true + description: Biological sex of subject + title: Sex + example: female + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Sex + format: controlled_vocabulary + age_min: + type: number + nullable: true + description: Specific age or lower boundary of age range. + title: Age minimum + example: 60 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age minimum + age_max: + type: number + nullable: true + description: > + Upper boundary of age range or equal to age_min for specific age. + This field should only be null if age_min is null. + title: Age maximum + example: 80 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age maximum + age_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of age range + title: Age unit + example: + id: UO:0000036 + label: year + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + age_event: + type: string + nullable: true + description: > + Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other + implementations submitters need to be aware that there is currently no mechanism to encode the potential + delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity. + title: Age event + example: enrollment + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age event + age: + type: string + nullable: true + x-airr: + deprecated: true + deprecated-description: Split into two fields to specify as an age range.
+ deprecated-replaced-by: + - age_min + - age_max + - age_unit + ancestry_population: + $ref: '#/Ontology' + nullable: true + description: Broad geographic origin of ancestry (continent) + title: Ancestry population + example: + id: GAZ:00000459 + label: South America + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Ancestry population + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + location_birth: + $ref: '#/Ontology' + nullable: true + description: Self-reported location of birth of the subject, preferred granularity is country-level + example: + id: GAZ:00002939 + label: Poland + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Location of birth + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + ethnicity: + type: string + nullable: true + description: Ethnic group of subject (defined as cultural/language-based membership) + title: Ethnicity + example: English, Kurds, Manchu, Yakuts (and other fields from Wikipedia) + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Ethnicity + race: + type: string + nullable: true + description: Racial group of subject (as defined by NIH) + title: Race + example: White, American Indian or Alaska Native, Black, Asian, Native Hawaiian or Other Pacific Islander, Other + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Race + strain_name: + type: string + nullable: true + description: Non-human designation of the strain or breed of animal used + title: Strain name + example: C57BL/6J + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Strain name + linked_subjects: + type: string + nullable: true + description: Subject ID to which `Relation type` refers + title: Relation to other subjects + example: SUB1355648 + x-airr: + 
miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Relation to other subjects + link_type: + type: string + nullable: true + description: Relation between subject and `linked_subjects`, can be genetic or environmental (e.g.exposure) + title: Relation type + example: father, daughter, household + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Relation type + diagnosis: + type: array + nullable: false + description: Diagnosis information for subject + items: + $ref: '#/Diagnosis' + x-airr: + adc-query-support: true + genotype: + nullable: true + $ref: '#/SubjectGenotype' + title: SubjectGenotype + +# 1-to-n relationship between a subject and its diagnoses +Diagnosis: + type: object + required: + - study_group_description + - disease_diagnosis + - disease_length + - disease_stage + - prior_therapies + - immunogen + - intervention + - medical_history + properties: + study_group_description: + type: string + nullable: true + description: Designation of study arm to which the subject is assigned to + title: Study group description + example: control + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Study group description + disease_diagnosis: + $ref: '#/Ontology' + nullable: true + description: Diagnosis of subject + title: Diagnosis + example: + id: DOID:9538 + label: multiple myeloma + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Diagnosis + format: ontology + ontology: + draft: false + top_node: + id: DOID:4 + label: disease + disease_length: + type: string + nullable: true + description: Time duration between initial diagnosis and current intervention + title: Length of disease + example: 23 months + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Length of disease + format: physical_quantity + disease_stage: + type: 
string + nullable: true + description: Stage of disease at current intervention + title: Disease stage + example: Stage II + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Disease stage + prior_therapies: + type: string + nullable: true + description: List of all relevant previous therapies applied to subject for treatment of `Diagnosis` + title: Prior therapies for primary disease under study + example: melphalan/prednisone + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Prior therapies for primary disease under study + immunogen: + type: string + nullable: true + description: Antigen, vaccine or drug applied to subject at this intervention + title: Immunogen/agent + example: bortezomib + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Immunogen/agent + intervention: + type: string + nullable: true + description: Description of intervention + title: Intervention definition + example: systemic chemotherapy, 6 cycles, 1.25 mg/m2 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Intervention definition + medical_history: + type: string + nullable: true + description: Medical history of subject that is relevant to assess the course of disease and/or treatment + title: Other relevant medical history + example: MGUS, first diagnosed 5 years prior + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Other relevant medical history + +# 1-to-n relationship between a subject and its samples +# sample_id is unique within a study +Sample: + type: object + required: + - sample_id + - sample_type + - tissue + - anatomic_site + - disease_state_sample + - collection_time_point_relative + - collection_time_point_relative_unit + - collection_time_point_reference + - biomaterial_provider 
+ properties: + sample_id: + type: string + nullable: true + description: > + Sample ID assigned by submitter, unique within study. If possible, a persistent sample ID linked to + INSDC or similar repository study should be used. + title: Biological sample ID + example: SUP52415 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Biological sample ID + sample_type: + type: string + nullable: true + description: The way the sample was obtained, e.g. fine-needle aspirate, organ harvest, peripheral venous puncture + title: Sample type + example: Biopsy + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample type + tissue: + $ref: '#/Ontology' + nullable: true + description: The actual tissue sampled, e.g. lymph node, liver, peripheral blood + title: Tissue + example: + id: UBERON:0002371 + label: bone marrow + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Tissue + format: ontology + ontology: + draft: false + top_node: + id: UBERON:0010000 + label: multicellular anatomical structure + anatomic_site: + type: string + nullable: true + description: The anatomic location of the tissue, e.g. 
Inguinal, femur + title: Anatomic site + example: Iliac crest + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Anatomic site + disease_state_sample: + type: string + nullable: true + description: Histopathologic evaluation of the sample + title: Disease state of sample + example: Tumor infiltration + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Disease state of sample + collection_time_point_relative: + type: number + nullable: true + description: Time point at which sample was taken, relative to `Collection time event` + title: Sample collection time + example: 14 + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample collection time + collection_time_point_relative_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of Sample collection time + title: Sample collection time unit + example: + id: UO:0000033 + label: day + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample collection time unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + collection_time_point_reference: + type: string + nullable: true + description: Event in the study schedule to which `Sample collection time` relates to + title: Collection time event + example: Primary vaccination + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Collection time event + collection_location: + $ref: '#/Ontology' + nullable: true + description: Location where the sample was taken, preferred granularity is country-level + title: Sample collection location + example: + id: GAZ:00002939 + label: Poland + x-airr: + miairr: important + set: 2 + subset: sample + name: Sample collection location + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + biomaterial_provider: + type: string + nullable: true + 
description: Name and address of the entity providing the sample + title: Biomaterial provider + example: Tissues-R-Us, Tampa, FL, USA + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Biomaterial provider + +# 1-to-n relationship between a sample and processing of its cells +CellProcessing: + type: object + required: + - tissue_processing + - cell_subset + - cell_phenotype + - single_cell + - cell_number + - cells_per_reaction + - cell_storage + - cell_quality + - cell_isolation + - cell_processing_protocol + properties: + tissue_processing: + type: string + nullable: true + description: Enzymatic digestion and/or physical methods used to isolate cells from sample + title: Tissue processing + example: Collagenase A/Dnase I digested, followed by Percoll gradient + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Tissue processing + cell_subset: + $ref: '#/Ontology' + nullable: true + description: Commonly-used designation of isolated cell population + title: Cell subset + example: + id: CL:0000972 + label: class switched memory B cell + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell subset + format: ontology + ontology: + draft: false + top_node: + id: CL:0000542 + label: lymphocyte + cell_phenotype: + type: string + nullable: true + description: List of cellular markers and their expression levels used to isolate the cell population + title: Cell subset phenotype + example: CD19+ CD38+ CD27+ IgM- IgD- + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell subset phenotype + cell_species: + $ref: '#/Ontology' + nullable: true + description: > + Binomial designation of the species from which the analyzed cells originate. Typically, this value + should be identical to `species`, in which case it SHOULD NOT be set explicitly. 
However, there are + valid experimental setups in which the two might differ, e.g., chimeric animal models. If set, this + key will overwrite the `species` information for all lower layers of the schema. + title: Cell species + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: defined + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + single_cell: + type: boolean + nullable: true + description: TRUE if single cells were isolated into separate compartments + title: Single-cell sort + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Single-cell sort + cell_number: + type: integer + nullable: true + description: Total number of cells that went into the experiment + title: Number of cells in experiment + example: 1000000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Number of cells in experiment + cells_per_reaction: + type: integer + nullable: true + description: Number of cells for each biological replicate + title: Number of cells per sequencing reaction + example: 50000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Number of cells per sequencing reaction + cell_storage: + type: boolean + nullable: true + description: TRUE if cells were cryo-preserved between isolation and further processing + title: Cell storage + example: TRUE + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell storage + cell_quality: + type: string + nullable: true + description: Relative amount of viable cells after preparation and (if applicable) thawing + title: Cell quality + example: 90% viability as determined by 7-AAD + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell quality + 
cell_isolation: + type: string + nullable: true + description: Description of the procedure used for marker-based isolation or enrichment of cells + title: Cell isolation / enrichment procedure + example: > + Cells were stained with fluorochrome labeled antibodies and then sorted on a FlowMerlin (CE) cytometer. + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell isolation / enrichment procedure + cell_processing_protocol: + type: string + nullable: true + description: > + Description of the methods applied to the sample including cell preparation/ isolation/enrichment and + nucleic acid extraction. This should closely mirror the Materials and methods section in the manuscript. + title: Processing protocol + example: Stimulated with anti-CD3/anti-CD28 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Processing protocol + +# object for PCR primer targets +PCRTarget: + type: object + required: + - pcr_target_locus + - forward_pcr_primer_target_location + - reverse_pcr_primer_target_location + properties: + pcr_target_locus: + type: string + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + - null + nullable: true + description: > + Designation of the target locus. Note that this field uses a controlled vocabulary that is meant to + provide a generic classification of the locus, not necessarily the correct designation according to + a specific nomenclature.
+ title: Target locus for PCR + example: IGK + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Target locus for PCR + format: controlled_vocabulary + forward_pcr_primer_target_location: + type: string + nullable: true + description: Position of the most distal nucleotide templated by the forward primer or primer mix + title: Forward PCR primer target location + example: IGHV, +23 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Forward PCR primer target location + reverse_pcr_primer_target_location: + type: string + nullable: true + description: Position of the most proximal nucleotide templated by the reverse primer or primer mix + title: Reverse PCR primer target location + example: IGHG, +57 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Reverse PCR primer target location + +# generally, a 1-to-1 relationship between a CellProcessing and processing of its nucleic acid +# but may be 1-to-n for technical replicates. 
+NucleicAcidProcessing: + type: object + required: + - template_class + - template_quality + - template_amount + - template_amount_unit + - library_generation_method + - library_generation_protocol + - library_generation_kit_version + - complete_sequences + - physical_linkage + properties: + template_class: + type: string + enum: + - DNA + - RNA + nullable: false + description: > + The class of nucleic acid that was used as primary starting material for the following procedures + title: Target substrate + example: RNA + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Target substrate + format: controlled_vocabulary + template_quality: + type: string + nullable: true + description: Description and results of the quality control performed on the template material + title: Target substrate quality + example: RIN 9.2 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Target substrate quality + template_amount: + type: number + nullable: true + description: Amount of template that went into the process + title: Template amount + example: 1000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Template amount + template_amount_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of template amount + title: Template amount time unit + example: + id: UO:0000024 + label: nanogram + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Template amount time unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000002 + label: physical quantity + library_generation_method: + type: string + enum: + - "PCR" + - "RT(RHP)+PCR" + - "RT(oligo-dT)+PCR" + - "RT(oligo-dT)+TS+PCR" + - "RT(oligo-dT)+TS(UMI)+PCR" + - "RT(specific)+PCR" + - "RT(specific)+TS+PCR" + - "RT(specific)+TS(UMI)+PCR" + - "RT(specific+UMI)+PCR" + - "RT(specific+UMI)+TS+PCR" 
+ - "RT(specific)+TS" + - "other" + nullable: false + description: Generic type of library generation + title: Library generation method + example: RT(oligo-dT)+TS(UMI)+PCR + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Library generation method + format: controlled_vocabulary + library_generation_protocol: + type: string + nullable: true + description: Description of processes applied to substrate to obtain a library that is ready for sequencing + title: Library generation protocol + example: cDNA was generated using + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Library generation protocol + library_generation_kit_version: + type: string + nullable: true + description: When using a library generation protocol from a commercial provider, provide the protocol version number + title: Protocol IDs + example: v2.1 (2016-09-15) + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Protocol IDs + pcr_target: + type: array + nullable: false + description: > + If a PCR step was performed that specifically targets the IG/TR loci, the target and primer locations + need to be provided here. This field holds an array of PCRTarget objects, so that multiplex PCR setups + amplifying multiple loci at the same time can be annotated using one record per locus. PCR setups not + targeting any specific locus must not annotate this field but select the appropriate + library_generation_method instead. + items: + $ref: '#/PCRTarget' + x-airr: + adc-query-support: true + complete_sequences: + type: string + enum: + - partial + - complete + - "complete+untemplated" + - mixed + nullable: false + description: > + To be considered `complete`, the procedure used for library construction MUST generate sequences that + 1) include the first V gene codon that encodes the mature polypeptide chain (i.e. 
after the + leader sequence) and 2) include the last complete codon of the J gene (i.e. 1 bp 5' of the J->C + splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered + `complete+untemplated`, the sections of the sequences defined in points 1) to 3) of the previous + sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation. + `mixed` should only be used if the procedure used for library construction will likely produce multiple + categories of sequences in the given experiment. It SHOULD NOT be used as a replacement of a NULL value. + title: Complete sequences + example: partial + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Complete sequences + format: controlled_vocabulary + physical_linkage: + type: string + enum: + - none + - "hetero_head-head" + - "hetero_tail-head" + - "hetero_prelinked" + nullable: false + description: > + In case an experimental setup is used that physically links nucleic acids derived from distinct + `Rearrangements` before library preparation, this field describes the mode of that linkage. All + `hetero_*` terms indicate that in case of paired-read sequencing, the two reads should be expected + to map to distinct IG/TR loci. `*_head-head` refers to techniques that link the 5' ends of transcripts + in a single-cell context. `*_tail-head` refers to techniques that link the 3' end of one transcript to + the 5' end of another one in a single-cell context. This term does not provide any information whether + a continuous reading-frame between the two is generated. `*_prelinked` refers to constructs in which + the linkage was already present on the DNA level (e.g. scFv). 
+ title: Physical linkage of different rearrangements + example: hetero_head-head + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Physical linkage of different rearrangements + format: controlled_vocabulary + +# 1-to-n relationship between a NucleicAcidProcessing and SequencingRun with resultant raw sequence file(s) +SequencingRun: + type: object + required: + - sequencing_run_id + - total_reads_passing_qc_filter + - sequencing_platform + - sequencing_facility + - sequencing_run_date + - sequencing_kit + properties: + sequencing_run_id: + type: string + nullable: true + description: ID of sequencing run assigned by the sequencing facility + title: Batch number + example: 160101_M01234 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Batch number + total_reads_passing_qc_filter: + type: integer + nullable: true + description: Number of usable reads for analysis + title: Total reads passing QC filter + example: 10365118 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Total reads passing QC filter + sequencing_platform: + type: string + nullable: true + description: Designation of sequencing instrument used + title: Sequencing platform + example: Alumina LoSeq 1000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing platform + sequencing_facility: + type: string + nullable: true + description: Name and address of sequencing facility + title: Sequencing facility + example: Seqs-R-Us, Vancouver, BC, Canada + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing facility + sequencing_run_date: + type: string + nullable: true + description: Date of sequencing run + title: Date of sequencing run + format: date + example: 2016-12-16 + x-airr: + miairr: important + 
adc-query-support: true + set: 3 + subset: process (sequencing) + name: Date of sequencing run + sequencing_kit: + type: string + nullable: true + description: Name, manufacturer, order and lot numbers of sequencing kit + title: Sequencing kit + example: "FullSeq 600, Alumina, #M123456C0, 789G1HK" + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing kit + sequencing_files: + $ref: '#/SequencingData' + nullable: false + description: Set of sequencing files produced by the sequencing run + x-airr: + adc-query-support: true + +# Resultant raw sequencing files from a SequencingRun +SequencingData: + type: object + required: + - sequencing_data_id + - file_type + - filename + - read_direction + - read_length + - paired_filename + - paired_read_direction + - paired_read_length + properties: + sequencing_data_id: + type: string + nullable: true + description: > + Persistent identifier of raw data stored in an archive (e.g. INSDC run ID). Data archive should + be identified in the CURIE prefix. + title: Raw sequencing data persistent identifier + example: "SRA:SRR11610494" + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + format: CURIE + file_type: + type: string + nullable: true + description: File format for the raw reads or sequences + title: Raw sequencing data file type + enum: + - fasta + - fastq + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Raw sequencing data file type + format: controlled_vocabulary + filename: + type: string + nullable: true + description: File name for the raw reads or sequences. The first file in paired-read sequencing. 
+ title: Raw sequencing data file name + example: MS10R-NMonson-C7JR9_S1_R1_001.fastq + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Raw sequencing data file name + read_direction: + type: string + nullable: true + description: Read direction for the raw reads or sequences. The first file in paired-read sequencing. + title: Read direction + example: forward + enum: + - forward + - reverse + - mixed + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Read direction + format: controlled_vocabulary + read_length: + type: integer + nullable: true + description: Read length in bases for the first file in paired-read sequencing + title: Forward read length + example: 300 + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Forward read length + paired_filename: + type: string + nullable: true + description: File name for the second file in paired-read sequencing + title: Paired raw sequencing data file name + example: MS10R-NMonson-C7JR9_S1_R2_001.fastq + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired raw sequencing data file name + paired_read_direction: + type: string + nullable: true + description: Read direction for the second file in paired-read sequencing + title: Paired read direction + example: reverse + enum: + - forward + - reverse + - mixed + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired read direction + format: controlled_vocabulary + paired_read_length: + type: integer + nullable: true + description: Read length in bases for the second file in paired-read sequencing + title: Paired read length + example: 300 + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired read length + index_filename: + type: string + nullable: true + 
description: File name for the index file + title: Sequencing index file name + example: MS10R-NMonson-C7JR9_S1_R3_001.fastq + x-airr: + adc-query-support: true + index_length: + type: integer + nullable: true + description: Read length in bases for the index file + title: Index read length + example: 8 + x-airr: + adc-query-support: true + +# 1-to-n relationship between a repertoire and data processing +# +# Set of annotated rearrangement sequences produced by +# data processing upon the raw sequence data for a repertoire. +DataProcessing: + type: object + required: + - software_versions + - paired_reads_assembly + - quality_thresholds + - primer_match_cutoffs + - collapsing_method + - data_processing_protocols + - germline_database + properties: + data_processing_id: + type: string + nullable: true + description: Identifier for the data processing object. + title: Data processing ID + x-airr: + name: Data processing ID + adc-query-support: true + identifier: true + primary_annotation: + type: boolean + default: false + nullable: false + description: > + If true, indicates this is the primary or default data processing for + the repertoire and its rearrangements. If false, indicates this is a secondary + or additional data processing. 
+ title: Primary annotation + x-airr: + adc-query-support: true + identifier: true + software_versions: + type: string + nullable: true + description: Version number and / or date, include company pipelines + title: Software tools and version numbers + example: IgBLAST 1.6 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Software tools and version numbers + paired_reads_assembly: + type: string + nullable: true + description: How paired end reads were assembled into a single receptor sequence + title: Paired read assembly + example: PandaSeq (minimal overlap 50, threshold 0.8) + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Paired read assembly + quality_thresholds: + type: string + nullable: true + description: How/if sequences were removed from (4) based on base quality scores + title: Quality thresholds + example: Average Phred score >=20 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Quality thresholds + primer_match_cutoffs: + type: string + nullable: true + description: How primers were identified in the sequences, were they removed/masked/etc? + title: Primer match cutoffs + example: Hamming distance <= 2 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Primer match cutoffs + collapsing_method: + type: string + nullable: true + description: The method used for combining multiple sequences from (4) into a single sequence in (5) + title: Collapsing method + example: MUSCLE 3.8.31 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Collapsing method + data_processing_protocols: + type: string + nullable: true + description: General description of how QC is performed + title: Data processing protocols + example: Data was processed using [...] 
+ x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Data processing protocols + data_processing_files: + type: array + items: + type: string + nullable: true + description: Array of file names for data produced by this data processing. + title: Processed data file names + example: + - 'ERR1278153_aa.txz' + - 'ERR1278153_ab.txz' + - 'ERR1278153_ac.txz' + x-airr: + adc-query-support: true + name: Processed data file names + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + title: V(D)J germline reference database + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: data (processed sequence) + name: V(D)J germline reference database + germline_set_ref: + type: string + nullable: true + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + adc-query-support: true + analysis_provenance_id: + type: string + nullable: true + description: Identifier for machine-readable PROV model of analysis provenance + title: Analysis provenance ID + x-airr: + adc-query-support: true + +SampleProcessing: + allOf: + - type: object + properties: + sample_processing_id: + type: string + nullable: true + description: > + Identifier for the sample processing object. This field should be unique within the repertoire. + This field can be used to uniquely identify the combination of sample, cell processing, + nucleic acid processing and sequencing run information for the repertoire. 
+ title: Sample processing ID + x-airr: + name: Sample processing ID + adc-query-support: true + identifier: true + - $ref: '#/Sample' + - $ref: '#/CellProcessing' + - $ref: '#/NucleicAcidProcessing' + - $ref: '#/SequencingRun' + + +# The composite schema for the repertoire object +# +# This represents a sample repertoire as defined by the study +# and experimentally observed by raw sequence data. A repertoire +# can only be for one subject but may include multiple samples. +Repertoire: + type: object + required: + - study + - subject + - sample + - data_processing + properties: + repertoire_id: + type: string + nullable: true + description: > + Identifier for the repertoire object. This identifier should be globally unique so that repertoires + from multiple studies can be combined together without conflict. The repertoire_id is used to link + other AIRR data to a Repertoire. Specifically, the Rearrangements Schema includes repertoire_id for + referencing the specific Repertoire for that Rearrangement. 
+ title: Repertoire ID + x-airr: + adc-query-support: true + identifier: true + repertoire_name: + type: string + nullable: true + description: Short generic display name for the repertoire + title: Repertoire name + x-airr: + name: Repertoire name + adc-query-support: true + repertoire_description: + type: string + nullable: true + description: Generic repertoire description + title: Repertoire description + x-airr: + name: Repertoire description + adc-query-support: true + study: + $ref: '#/Study' + nullable: false + description: Study object + x-airr: + adc-query-support: true + subject: + $ref: '#/Subject' + nullable: false + description: Subject object + x-airr: + adc-query-support: true + sample: + type: array + nullable: false + description: List of Sample Processing objects + items: + $ref: '#/SampleProcessing' + x-airr: + adc-query-support: true + data_processing: + type: array + nullable: false + description: List of Data Processing objects + items: + $ref: '#/DataProcessing' + x-airr: + adc-query-support: true + +# A collection of repertoires for analysis purposes, includes optional time course +RepertoireGroup: + type: object + required: + - repertoire_group_id + - repertoires + properties: + repertoire_group_id: + type: string + nullable: true + description: Identifier for this repertoire collection + x-airr: + identifier: true + repertoire_group_name: + type: string + nullable: true + description: Short display name for this repertoire collection + repertoire_group_description: + type: string + nullable: true + description: Repertoire collection description + repertoires: + type: array + nullable: true + description: > + List of repertoires in this collection with an associated description and time point designation + items: + type: object + properties: + repertoire_id: + type: string + nullable: false + description: Identifier to the repertoire + x-airr: + adc-query-support: true + repertoire_description: + type: string + nullable: true + 
description: Description of this repertoire within the group + x-airr: + adc-query-support: true + time_point: + $ref: '#/TimePoint' + nullable: true + description: Time point designation for this repertoire within the group + x-airr: + adc-query-support: true + +Alignment: + type: object + required: + - sequence_id + - segment + - call + - score + - cigar + properties: + sequence_id: + type: string + nullable: true + description: > + Unique query sequence identifier within the file. Most often this will be the input sequence + header or a substring thereof, but may also be a custom identifier defined by the tool in + cases where query sequences have been combined in some fashion prior to alignment. + x-airr: + identifier: true + segment: + type: string + nullable: true + description: > + The segment for this alignment. One of V, D, J or C. + rev_comp: + type: boolean + nullable: true + description: > + Alignment result is from the reverse complement of the query sequence. + call: + type: string + nullable: true + description: > + Gene assignment with allele. + score: + type: number + nullable: true + description: > + Alignment score. + identity: + type: number + nullable: true + description: > + Alignment fractional identity. + support: + type: number + nullable: true + description: > + Alignment E-value, p-value, likelihood, probability or other similar measure of + support for the gene assignment as defined by the alignment tool. + cigar: + type: string + nullable: true + description: > + Alignment CIGAR string. + sequence_start: + type: integer + nullable: true + description: > + Start position of the segment in the query sequence (1-based closed interval). + sequence_end: + type: integer + nullable: true + description: > + End position of the segment in the query sequence (1-based closed interval). + germline_start: + type: integer + nullable: true + description: > + Alignment start position in the reference sequence (1-based closed interval). 
+ germline_end: + type: integer + nullable: true + description: > + Alignment end position in the reference sequence (1-based closed interval). + rank: + type: integer + nullable: true + description: > + Alignment rank. + rearrangement_id: + type: string + nullable: true + description: > + Identifier for the Rearrangement object. May be identical to sequence_id, + but will usually be a universally unique record locator for database applications. + x-airr: + deprecated: true + deprecated-description: Field has been merged with sequence_id to avoid confusion. + deprecated-replaced-by: + - sequence_id + data_processing_id: + type: string + nullable: true + description: > + Identifier to the data processing object in the repertoire metadata + for this rearrangement. If this field is empty, then the primary data processing object is assumed. + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + deprecated: true + deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication. + deprecated-replaced-by: + - "DataProcessing:germline_database" + + +# The extended rearrangement object +Rearrangement: + type: object + required: + - sequence_id + - sequence + - rev_comp + - productive + - v_call + - d_call + - j_call + - sequence_alignment + - germline_alignment + - junction + - junction_aa + - v_cigar + - d_cigar + - j_cigar + properties: + sequence_id: + type: string + nullable: true + description: > + Unique query sequence identifier for the Rearrangement. Most often this will be the input sequence + header or a substring thereof, but may also be a custom identifier defined by the tool in + cases where query sequences have been combined in some fashion prior to alignment. 
When + downloaded from an AIRR Data Commons repository, this will usually be a universally unique + record locator for linking with other objects in the AIRR Data Model. + x-airr: + adc-query-support: true + identifier: true + sequence: + type: string + nullable: true + description: > + The query nucleotide sequence. Usually, this is the unmodified input sequence, which may be + reverse complemented if necessary. In some cases, this field may contain consensus sequences or + other types of collapsed input sequences if these steps are performed prior to alignment. + quality: + type: string + nullable: true + description: > + The Sanger/Phred quality scores for assessment of sequence quality. + Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (Used by Illumina from v1.8.) + sequence_aa: + type: string + nullable: true + description: > + Amino acid translation of the query nucleotide sequence. + rev_comp: + type: boolean + nullable: true + description: > + True if the alignment is on the opposite strand (reverse complemented) with respect to the + query sequence. If True then all output data, such as alignment coordinates and sequences, + are based on the reverse complement of 'sequence'. + productive: + type: boolean + nullable: true + description: > + True if the V(D)J sequence is predicted to be productive. + x-airr: + adc-query-support: true + vj_in_frame: + type: boolean + nullable: true + description: True if the V and J gene alignments are in-frame. + stop_codon: + type: boolean + nullable: true + description: True if the aligned sequence contains a stop codon. + complete_vdj: + type: boolean + nullable: true + description: > + True if the sequence alignment spans the entire V(D)J region. Meaning, + sequence_alignment includes both the first V gene codon that encodes the + mature polypeptide chain (i.e., after the leader sequence) and the last + complete codon of the J gene (i.e., before the J-C splice site). 
+ This does not require an absence of deletions within the internal + FWR and CDR regions of the alignment. + locus: + type: string + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + - null + nullable: true + description: > + Gene locus (chain type). Note that this field uses a controlled vocabulary that is meant to provide a + generic classification of the locus, not necessarily the correct designation according to a specific + nomenclature. + title: Gene locus + example: IGH + x-airr: + adc-query-support: true + name: Gene locus + format: controlled_vocabulary + locus_species: + $ref: '#/Ontology' + nullable: true + description: > + Binomial designation of the species from which the locus originates. Typically, this value should be + identical to `organism`, in which case it SHOULD NOT be set explicitly. However, there are valid + experimental setups in which the two might differ, e.g. transgenic animal models. If set, this key + will overwrite the `organism` information for all lower layers of the schema. + title: Locus species + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: defined + adc-query-support: true + name: Locus species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + v_call: + type: string + nullable: true + description: > + V gene with allele. If referring to a known reference sequence in a database + the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + title: V gene with allele + example: IGHV4-59*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: V gene with allele + d_call: + type: string + nullable: true + description: > + First or only D gene with allele. If referring to a known reference sequence in a database + the relevant gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB). 
+ title: D gene with allele + example: IGHD3-10*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: D gene with allele + d2_call: + type: string + nullable: true + description: > + Second D gene with allele. If referring to a known reference sequence in a database the relevant + gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB). + example: IGHD3-10*01 + j_call: + type: string + nullable: true + description: > + J gene with allele. If referring to a known reference sequence in a database the relevant + gene/allele nomenclature should be followed (e.g., IGHJ4*02 if using IMGT/GENE-DB). + title: J gene with allele + example: IGHJ4*02 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: J gene with allele + c_call: + type: string + nullable: true + description: > + Constant region gene with allele. If referring to a known reference sequence in a database the + relevant gene/allele nomenclature should be followed (e.g., IGHG1*01 if using IMGT/GENE-DB). + title: C region + example: IGHG1*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: C region + sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence, including any indel corrections or numbering spacers, + such as IMGT-gaps. Typically, this will include only the V(D)J region, but that is not + a requirement. + quality_alignment: + type: string + nullable: true + description: > + Sanger/Phred quality scores for assessment of sequence_alignment quality. + Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (Used by Illumina from v1.8.) + sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the aligned query sequence. 
+ germline_alignment: + type: string + nullable: true + description: > + Assembled, aligned, full-length inferred germline sequence spanning the same region + as the sequence_alignment field (typically the V(D)J region) and including the same set + of corrections and spacers (if any). + germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the assembled germline sequence. + junction: + type: string + nullable: true + description: > + Junction region nucleotide sequence, where the junction is defined as + the CDR3 plus the two flanking conserved codons. + title: IMGT-JUNCTION nucleotide sequence + example: TGTGCAAGAGCGGGAGTTTACGACGGATATACTATGGACTACTGG + x-airr: + miairr: important + set: 6 + subset: data (processed sequence) + name: IMGT-JUNCTION nucleotide sequence + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction. + title: IMGT-JUNCTION amino acid sequence + example: CARAGVYDGYTMDYW + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: IMGT-JUNCTION amino acid sequence + np1: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between the V gene and + first D gene alignment or between the V gene and J gene alignments. + np1_aa: + type: string + nullable: true + description: > + Amino acid translation of the np1 field. + np2: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between either the first D gene and J gene + alignments or the first D gene and second D gene alignments. + np2_aa: + type: string + nullable: true + description: > + Amino acid translation of the np2 field. + np3: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between the second D gene + and J gene alignments. 
+ np3_aa: + type: string + nullable: true + description: > + Amino acid translation of the np3 field. + cdr1: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR1 region. + cdr1_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr1 field. + cdr2: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR2 region. + cdr2_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr2 field. + cdr3: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR3 region. + cdr3_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr3 field. + fwr1: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR1 region. + fwr1_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr1 field. + fwr2: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR2 region. + fwr2_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr2 field. + fwr3: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR3 region. + fwr3_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr3 field. + fwr4: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR4 region. + fwr4_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr4 field. + v_score: + type: number + nullable: true + description: Alignment score for the V gene. + v_identity: + type: number + nullable: true + description: Fractional identity for the V gene alignment. + v_support: + type: number + nullable: true + description: > + V gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the V gene assignment as defined by the alignment tool. 
+ v_cigar: + type: string + nullable: true + description: CIGAR string for the V gene alignment. + d_score: + type: number + nullable: true + description: Alignment score for the first or only D gene alignment. + d_identity: + type: number + nullable: true + description: Fractional identity for the first or only D gene alignment. + d_support: + type: number + nullable: true + description: > + D gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the first or only D gene as defined by the alignment tool. + d_cigar: + type: string + nullable: true + description: CIGAR string for the first or only D gene alignment. + d2_score: + type: number + nullable: true + description: Alignment score for the second D gene alignment. + d2_identity: + type: number + nullable: true + description: Fractional identity for the second D gene alignment. + d2_support: + type: number + nullable: true + description: > + D gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the second D gene as defined by the alignment tool. + d2_cigar: + type: string + nullable: true + description: CIGAR string for the second D gene alignment. + j_score: + type: number + nullable: true + description: Alignment score for the J gene alignment. + j_identity: + type: number + nullable: true + description: Fractional identity for the J gene alignment. + j_support: + type: number + nullable: true + description: > + J gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the J gene assignment as defined by the alignment tool. + j_cigar: + type: string + nullable: true + description: CIGAR string for the J gene alignment. + c_score: + type: number + nullable: true + description: Alignment score for the C gene alignment. + c_identity: + type: number + nullable: true + description: Fractional identity for the C gene alignment. 
+ c_support: + type: number + nullable: true + description: > + C gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the C gene assignment as defined by the alignment tool. + c_cigar: + type: string + nullable: true + description: CIGAR string for the C gene alignment. + v_sequence_start: + type: integer + nullable: true + description: > + Start position of the V gene in the query sequence (1-based closed interval). + v_sequence_end: + type: integer + nullable: true + description: > + End position of the V gene in the query sequence (1-based closed interval). + v_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the V gene reference sequence (1-based closed interval). + v_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the V gene reference sequence (1-based closed interval). + v_alignment_start: + type: integer + nullable: true + description: > + Start position of the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + v_alignment_end: + type: integer + nullable: true + description: > + End position of the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_sequence_start: + type: integer + nullable: true + description: > + Start position of the first or only D gene in the query sequence. + (1-based closed interval). + d_sequence_end: + type: integer + nullable: true + description: > + End position of the first or only D gene in the query sequence. + (1-based closed interval). + d_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the D gene reference sequence for the first or only + D gene (1-based closed interval). 
+ d_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the D gene reference sequence for the first or only + D gene (1-based closed interval). + d_alignment_start: + type: integer + nullable: true + description: > + Start position of the first or only D gene in both the sequence_alignment + and germline_alignment fields (1-based closed interval). + d_alignment_end: + type: integer + nullable: true + description: > + End position of the first or only D gene in both the sequence_alignment + and germline_alignment fields (1-based closed interval). + d2_sequence_start: + type: integer + nullable: true + description: > + Start position of the second D gene in the query sequence (1-based closed interval). + d2_sequence_end: + type: integer + nullable: true + description: > + End position of the second D gene in the query sequence (1-based closed interval). + d2_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the second D gene reference sequence (1-based closed interval). + d2_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the second D gene reference sequence (1-based closed interval). + d2_alignment_start: + type: integer + nullable: true + description: > + Start position of the second D gene alignment in both the sequence_alignment and + germline_alignment fields (1-based closed interval). + d2_alignment_end: + type: integer + nullable: true + description: > + End position of the second D gene alignment in both the sequence_alignment and + germline_alignment fields (1-based closed interval). + j_sequence_start: + type: integer + nullable: true + description: > + Start position of the J gene in the query sequence (1-based closed interval). + j_sequence_end: + type: integer + nullable: true + description: > + End position of the J gene in the query sequence (1-based closed interval). 
+ j_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the J gene reference sequence (1-based closed interval). + j_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the J gene reference sequence (1-based closed interval). + j_alignment_start: + type: integer + nullable: true + description: > + Start position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_end: + type: integer + nullable: true + description: > + End position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + c_sequence_start: + type: integer + nullable: true + description: > + Start position of the C gene in the query sequence (1-based closed interval). + c_sequence_end: + type: integer + nullable: true + description: > + End position of the C gene in the query sequence (1-based closed interval). + c_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the C gene reference sequence (1-based closed interval). + c_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the C gene reference sequence (1-based closed interval). + c_alignment_start: + type: integer + nullable: true + description: > + Start position of the C gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + c_alignment_end: + type: integer + nullable: true + description: > + End position of the C gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + cdr1_start: + type: integer + nullable: true + description: CDR1 start position in the query sequence (1-based closed interval). + cdr1_end: + type: integer + nullable: true + description: CDR1 end position in the query sequence (1-based closed interval). 
+ cdr2_start: + type: integer + nullable: true + description: CDR2 start position in the query sequence (1-based closed interval). + cdr2_end: + type: integer + nullable: true + description: CDR2 end position in the query sequence (1-based closed interval). + cdr3_start: + type: integer + nullable: true + description: CDR3 start position in the query sequence (1-based closed interval). + cdr3_end: + type: integer + nullable: true + description: CDR3 end position in the query sequence (1-based closed interval). + fwr1_start: + type: integer + nullable: true + description: FWR1 start position in the query sequence (1-based closed interval). + fwr1_end: + type: integer + nullable: true + description: FWR1 end position in the query sequence (1-based closed interval). + fwr2_start: + type: integer + nullable: true + description: FWR2 start position in the query sequence (1-based closed interval). + fwr2_end: + type: integer + nullable: true + description: FWR2 end position in the query sequence (1-based closed interval). + fwr3_start: + type: integer + nullable: true + description: FWR3 start position in the query sequence (1-based closed interval). + fwr3_end: + type: integer + nullable: true + description: FWR3 end position in the query sequence (1-based closed interval). + fwr4_start: + type: integer + nullable: true + description: FWR4 start position in the query sequence (1-based closed interval). + fwr4_end: + type: integer + nullable: true + description: FWR4 end position in the query sequence (1-based closed interval). + v_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the V gene, including any + indel corrections or numbering spacers. + v_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the v_sequence_alignment field. 
+ d_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the first or only D gene, including any + indel corrections or numbering spacers. + d_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d_sequence_alignment field. + d2_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the second D gene, including any + indel corrections or numbering spacers. + d2_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d2_sequence_alignment field. + j_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the J gene, including any + indel corrections or numbering spacers. + j_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the j_sequence_alignment field. + c_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the constant region, including + any indel corrections or numbering spacers. + c_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the c_sequence_alignment field. + v_germline_alignment: + type: string + nullable: true + description: > + Aligned V gene germline sequence spanning the same region + as the v_sequence_alignment field and including the same set + of corrections and spacers (if any). + v_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the v_germline_alignment field. + d_germline_alignment: + type: string + nullable: true + description: > + Aligned D gene germline sequence spanning the same region + as the d_sequence_alignment field and including the same set + of corrections and spacers (if any). 
+ d_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d_germline_alignment field. + d2_germline_alignment: + type: string + nullable: true + description: > + Aligned D gene germline sequence spanning the same region + as the d2_sequence_alignment field and including the same set + of corrections and spacers (if any). + d2_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d2_germline_alignment field. + j_germline_alignment: + type: string + nullable: true + description: > + Aligned J gene germline sequence spanning the same region + as the j_sequence_alignment field and including the same set + of corrections and spacers (if any). + j_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the j_germline_alignment field. + c_germline_alignment: + type: string + nullable: true + description: > + Aligned constant region germline sequence spanning the same region + as the c_sequence_alignment field and including the same set + of corrections and spacers (if any). + c_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the c_germline_alignment field. + junction_length: + type: integer + nullable: true + description: Number of nucleotides in the junction sequence. + junction_aa_length: + type: integer + nullable: true + description: Number of amino acids in the junction sequence. + x-airr: + adc-query-support: true + np1_length: + type: integer + nullable: true + description: > + Number of nucleotides between the V gene and first D gene alignments or + between the V gene and J gene alignments. + np2_length: + type: integer + nullable: true + description: > + Number of nucleotides between either the first D gene and J gene alignments + or the first D gene and second D gene alignments.
+ np3_length: + type: integer + nullable: true + description: > + Number of nucleotides between the second D gene and J gene alignments. + n1_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 5' of the first or only D gene alignment. + n2_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 3' of the first or only D gene alignment. + n3_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 3' of the second D gene alignment. + p3v_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the V gene alignment. + p5d_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the first or only D gene alignment. + p3d_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the first or only D gene alignment. + p5d2_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the second D gene alignment. + p3d2_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the second D gene alignment. + p5j_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the J gene alignment. + v_frameshift: + type: boolean + nullable: true + description: > + True if the V gene in the query nucleotide sequence contains a translational + frameshift relative to the frame of the V gene reference sequence. + j_frameshift: + type: boolean + nullable: true + description: > + True if the J gene in the query nucleotide sequence contains a translational + frameshift relative to the frame of the J gene reference sequence. + d_frame: + type: integer + nullable: true + description: > + Numerical reading frame (1, 2, 3) of the first or only D gene in the query nucleotide sequence, + where frame 1 is relative to the first codon of D gene reference sequence. 
+ d2_frame: + type: integer + nullable: true + description: > + Numerical reading frame (1, 2, 3) of the second D gene in the query nucleotide sequence, + where frame 1 is relative to the first codon of D gene reference sequence. + consensus_count: + type: integer + nullable: true + description: > + Number of reads contributing to the UMI consensus or contig assembly for this sequence. + For example, the sum of the number of reads for all UMIs that contribute to + the query sequence. + duplicate_count: + type: integer + nullable: true + description: > + Copy number or number of duplicate observations for the query sequence. + For example, the number of identical reads observed for this sequence. + title: Read count + example: 123 + x-airr: + miairr: important + set: 6 + subset: data (processed sequence) + name: Read count + umi_count: + type: integer + nullable: true + description: > + Number of distinct UMIs represented by this sequence. + For example, the total number of UMIs that contribute to + the contig assembly for the query sequence. + cell_id: + type: string + nullable: true + description: > + Identifier defining the cell of origin for the query sequence. + title: Cell index + example: W06_046_091 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: Cell index + clone_id: + type: string + nullable: true + description: Clonal cluster assignment for the query sequence. + x-airr: + adc-query-support: true + identifier: true + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + x-airr: + adc-query-support: true + identifier: true + sample_processing_id: + type: string + nullable: true + description: > + Identifier to the sample processing object in the repertoire metadata + for this rearrangement. If the repertoire has a single sample then + this field may be empty or missing. 
If the repertoire has multiple samples then + this field may be empty or missing if the sample cannot be differentiated or + the relationship is not maintained by the data processing. + x-airr: + adc-query-support: true + identifier: true + data_processing_id: + type: string + nullable: true + description: > + Identifier to the data processing object in the repertoire metadata + for this rearrangement. If this field is empty then the primary data processing object is assumed. + x-airr: + adc-query-support: true + identifier: true + rearrangement_id: + type: string + nullable: true + description: > + Identifier for the Rearrangement object. May be identical to sequence_id, + but will usually be a universally unique record locator for database applications. + x-airr: + deprecated: true + deprecated-description: Field has been merged with sequence_id to avoid confusion. + deprecated-replaced-by: + - sequence_id + rearrangement_set_id: + type: string + nullable: true + description: > + Identifier for grouping Rearrangement objects. + x-airr: + deprecated: true + deprecated-description: Field has been replaced by other specialized identifiers. + deprecated-replaced-by: + - repertoire_id + - sample_processing_id + - data_processing_id + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + deprecated: true + deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication. + deprecated-replaced-by: + - "DataProcessing:germline_database" + +# A unique inferred clone object that has been constructed within a single data processing +# for a single repertoire and a subset of its sequences and/or rearrangements. +Clone: + type: object + required: + - clone_id + - germline_alignment + properties: + clone_id: + type: string + nullable: true + description: Identifier for the clone.
+ x-airr: + identifier: true + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + x-airr: + adc-query-support: true + data_processing_id: + type: string + nullable: true + description: Identifier of the data processing object in the repertoire metadata for this clone. + x-airr: + adc-query-support: true + sequences: + type: array + items: + type: string + nullable: true + description: > + List of sequence_id strings that act as keys to the Rearrangement records for members of the clone. + v_call: + type: string + nullable: true + description: > + V gene with allele of the inferred ancestor of the clone. For example, IGHV4-59*01. + example: IGHV4-59*01 + d_call: + type: string + nullable: true + description: > + D gene with allele of the inferred ancestor of the clone. For example, IGHD3-10*01. + example: IGHD3-10*01 + j_call: + type: string + nullable: true + description: > + J gene with allele of the inferred ancestor of the clone. For example, IGHJ4*02. + example: IGHJ4*02 + junction: + type: string + nullable: true + description: > + Nucleotide sequence for the junction region of the inferred ancestor of the clone, + where the junction is defined as the CDR3 plus the two flanking conserved codons. + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction. + junction_length: + type: integer + nullable: true + description: Number of nucleotides in the junction. + junction_aa_length: + type: integer + nullable: true + description: Number of amino acids in junction_aa. + germline_alignment: + type: string + nullable: true + description: > + Assembled, aligned, full-length inferred ancestor of the clone spanning the same region + as the sequence_alignment field of nodes (typically the V(D)J region) and including the + same set of corrections and spacers (if any).
+ germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of germline_alignment. + v_alignment_start: + type: integer + nullable: true + description: > + Start position in the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + v_alignment_end: + type: integer + nullable: true + description: > + End position in the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_alignment_start: + type: integer + nullable: true + description: > + Start position of the D gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_alignment_end: + type: integer + nullable: true + description: > + End position of the D gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_start: + type: integer + nullable: true + description: > + Start position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_end: + type: integer + nullable: true + description: > + End position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + junction_start: + type: integer + nullable: true + description: Junction region start position in the alignment (1-based closed interval). + junction_end: + type: integer + nullable: true + description: Junction region end position in the alignment (1-based closed interval). + umi_count: + type: integer + nullable: true + description: > + Number of distinct UMIs observed across all sequences (Rearrangement records) in this clone. + clone_count: + type: integer + nullable: true + description: > + Absolute count of the size (number of members) of this clone in the repertoire. 
+ This could simply be the number of sequences (Rearrangement records) observed in this clone, + the number of distinct cell barcodes (unique cell_id values), + or a more sophisticated calculation appropriate to the experimental protocol. + Absolute count is provided versus a frequency so that downstream analysis tools can perform their own normalization. + seed_id: + type: string + nullable: true + description: sequence_id of the seed sequence. Empty string (or null) if there is no seed sequence. + +# 1-to-n relationship for a clone to its trees. +Tree: + type: object + required: + - tree_id + - clone_id + - newick + properties: + tree_id: + type: string + nullable: true + description: Identifier for the tree. + x-airr: + identifier: true + clone_id: + type: string + nullable: true + description: Identifier for the clone. + newick: + type: string + nullable: true + description: Newick string of the tree edges. + nodes: + type: object + nullable: true + description: Dictionary of nodes in the tree, keyed by sequence_id string + additionalProperties: + $ref: '#/Node' + +# 1-to-n relationship between a tree and its nodes +Node: + type: object + required: + - sequence_id + properties: + sequence_id: + type: string + nullable: true + description: > + Identifier for this node that matches the identifier in the newick string and, where possible, + the sequence_id in the source repertoire. + x-airr: + identifier: true + sequence_alignment: + type: string + nullable: true + description: > + Nucleotide sequence of the node, aligned to the germline_alignment for this clone, including + any indel corrections or spacers. + junction: + type: string + nullable: true + description: > + Junction region nucleotide sequence for the node, where the junction is defined as + the CDR3 plus the two flanking conserved codons. + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction.
+ +# The cell object acts as a point of reference for all data that can be related +# to an individual cell, either by direct observation or inference. +Cell: + type: object + required: + - cell_id + - rearrangements + - repertoire_id + - virtual_pairing + properties: + cell_id: + type: string + nullable: false + description: > + Identifier defining the cell of origin for the query sequence. + title: Cell index + example: W06_046_091 + x-airr: + identifier: true + miairr: defined + adc-query-support: true + name: Cell index + rearrangements: + type: array + nullable: true + description: > + Array of sequence identifiers defined for the Rearrangement object + title: Cell-associated rearrangements + items: + type: string + example: [id1, id2] #empty vs NULL? + x-airr: + miairr: defined + adc-query-support: true + name: Cell-associated rearrangements + receptors: + type: array + nullable: true + description: > + Array of receptor identifiers defined for the Receptor object + title: Cell-associated receptors + items: + type: string + example: [id1, id2] #empty vs NULL? + x-airr: + miairr: defined + adc-query-support: true + name: Cell-associated receptors + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + title: Parental repertoire of cell + x-airr: + miairr: defined + adc-query-support: true + name: Parental repertoire of cell + data_processing_id: + type: string + nullable: true + description: Identifier of the data processing object in the repertoire metadata for this clone. + title: Data processing for cell + x-airr: + miairr: defined + adc-query-support: true + name: Data processing for cell + expression_study_method: + type: string + enum: + - flow_cytometry + - single-cell_transcriptome + - null + nullable: true + description: > + Keyword describing the methodology used to assess expression. The values for this field MUST + come from a controlled vocabulary.
+ x-airr: + miairr: defined + adc-query-support: true + expression_raw_doi: + type: string + nullable: true + description: > + DOI of raw data set containing the current event + x-airr: + miairr: defined + adc-query-support: true + expression_index: + type: string + nullable: true + description: > + Index addressing the current event within the raw data set. + x-airr: + miairr: defined + virtual_pairing: + type: boolean + nullable: true + description: > + boolean to indicate if pairing was inferred. + title: Virtual pairing + x-airr: + miairr: defined + adc-query-support: true + name: Virtual pairing + +# The CellExpression object acts as a container to hold a single expression level measurement from +# an experiment. Expression data is associated with a cell_id and the related repertoire_id and +# data_processing_id as cell_id is not guaranteed to be unique outside the data processing for +# a single repertoire. +CellExpression: + type: object + required: + - expression_id + - repertoire_id + - data_processing_id + - cell_id + - property + - property_type + - value + properties: + expression_id: + type: string + description: > + Identifier of this expression property measurement. + title: Expression property measurement identifier + nullable: false + x-airr: + identifier: true + miairr: defined + adc-query-support: true + name: Expression measurement identifier + cell_id: + type: string + description: > + Identifier of the cell to which this expression data is related. + title: Cell identifier + nullable: false + example: W06_046_091 + x-airr: + miairr: defined + adc-query-support: true + name: Cell identifier + repertoire_id: + type: string + description: Identifier for the associated repertoire in study metadata. 
+ title: Parental repertoire of cell + nullable: true + x-airr: + miairr: defined + adc-query-support: true + name: Parental repertoire of cell + data_processing_id: + type: string + description: Identifier of the data processing object in the repertoire metadata for this clone. + title: Data processing for cell + nullable: true + x-airr: + miairr: defined + adc-query-support: true + name: Data processing for cell + property_type: + type: string + description: > + Keyword describing the property type and detection method used to measure the property value. + The following keywords are recommended, but custom property types are also valid: + "mrna_expression_by_read_count", + "protein_expression_by_fluorescence_intensity", "antigen_bait_binding_by_fluorescence_intensity", + "protein_expression_by_dna_barcode_count" and "antigen_bait_binding_by_dna_barcode_count". + nullable: false + title: Property type and detection method + x-airr: + miairr: defined + adc-query-support: true + name: Property type and detection method + property: + $ref: '#/Ontology' + nullable: true + title: Property information + description: > + Name of the property observed, typically a gene or antibody identifier (and label) from a + canonical resource such as Ensembl (e.g. ENSG00000275747, IGHV3-79) or + Antibody Registry (ABREG:1236456, Purified anti-mouse/rat/human CD27 antibody). + example: + id: ENSG:ENSG00000275747 + label: IGHV3-79 + x-airr: + miairr: defined + adc-query-support: true + format: ontology + name: Property information + value: + type: number + description: Level at which the property was observed in the experiment (non-normalized). + title: Property value + nullable: true + example: 3 + x-airr: + miairr: defined + adc-query-support: true + name: Property value + + +# The Receptor object holds information about a receptor and its reactivity.
+# +Receptor: + type: object + required: + - receptor_id + - receptor_hash + - receptor_type + - receptor_variable_domain_1_aa + - receptor_variable_domain_1_locus + - receptor_variable_domain_2_aa + - receptor_variable_domain_2_locus + properties: + receptor_id: + type: string + nullable: false + description: ID of the current Receptor object, unique within the local repository. + title: Receptor ID + example: TCR-MM-012345 + x-airr: + identifier: true + adc-query-support: true + receptor_hash: + type: string + nullable: false + description: > + The SHA256 hash of the receptor amino acid sequence, calculated on the concatenated + ``receptor_variable_domain_*_aa`` sequences and represented as base16-encoded string. + title: Receptor hash ID + example: aa1c4b77a6f4927611ab39f5267415beaa0ba07a952c233d803b07e52261f026 + x-airr: + adc-query-support: true + receptor_type: + type: string + nullable: false + enum: + - Ig + - TCR + description: The top-level receptor type, either Immunoglobulin (Ig) or T Cell Receptor (TCR). + x-airr: + adc-query-support: true + receptor_variable_domain_1_aa: + type: string + nullable: false + description: > + Complete amino acid sequence of the mature variable domain of the Ig heavy, TCR beta or TCR delta chain. + The mature variable domain is defined as encompassing all AA from and including first AA after the + signal peptide to and including the last AA that is completely encoded by the J gene. 
+ example: > + QVQLQQPGAELVKPGASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGSSYFDYWGQGTTLTVSS + x-airr: + adc-query-support: true + receptor_variable_domain_1_locus: + type: string + nullable: false + enum: + - IGH + - TRB + - TRD + description: Locus from which the variable domain in receptor_variable_domain_1_aa originates + example: IGH + x-airr: + adc-query-support: true + receptor_variable_domain_2_aa: + type: string + nullable: false + description: > + Complete amino acid sequence of the mature variable domain of the Ig light, TCR alpha or TCR gamma chain. + The mature variable domain is defined as encompassing all AA from and including first AA after the + signal peptide to and including the last AA that is completely encoded by the J gene. + example: > + QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLFTGLIGGTNNRAPGVPARFSGSLIGDKAALTITGAQTEDEAIYFCALWYSNHWVFGGGTKLTVL + x-airr: + adc-query-support: true + receptor_variable_domain_2_locus: + type: string + nullable: false + enum: + - IGI + - IGK + - IGL + - TRA + - TRG + description: Locus from which the variable domain in receptor_variable_domain_2_aa originates + example: IGL + x-airr: + adc-query-support: true + receptor_ref: + type: array + nullable: true + description: Array of receptor identifiers defined for the Receptor object + title: Receptor cross-references + items: + type: string + example: ["IEDB_RECEPTOR:10"] + x-airr: + adc-query-support: true + reactivity_measurements: + type: array + nullable: true + description: Records of reactivity measurement + items: + $ref: '#/ReceptorReactivity' + + +ReceptorReactivity: + type: object + required: + - ligand_type + - antigen_type + - antigen + - reactivity_method + - reactivity_readout + - reactivity_value + - reactivity_unit + properties: + ligand_type: + type: string + nullable: false + enum: + - "MHC:peptide" + - "MHC:non-peptide" + - protein + - peptide + - non-peptidic + description: Classification of 
ligand binding to receptor + example: non-peptide + antigen_type: + type: string + nullable: false + enum: + - protein + - peptide + - non-peptidic + description: > + The type of antigen before processing by the immune system. + example: protein + antigen: + $ref: '#/Ontology' + nullable: false + description: > + The substance against which the receptor was tested. This can be any substance that + stimulates an adaptive immune response in the host, either through antibody production + or by T cell activation after presentation via an MHC molecule. + title: Antigen + example: + id: UNIPROT:P19597 + label: Circumsporozoite protein + x-airr: + adc-query-support: true + format: ontology + antigen_source_species: + $ref: '#/Ontology' + nullable: true + description: The species from which the antigen was isolated + title: Source species of antigen + example: + id: NCBITAXON:5843 + label: Plasmodium falciparum NF54 + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: NCBITAXON:1 + label: root + peptide_start: + type: integer + nullable: true + description: Start position of the peptide within the reference protein sequence + peptide_end: + type: integer + nullable: true + description: End position of the peptide within the reference protein sequence + mhc_class: + type: string + nullable: true + enum: + - MHC-I + - MHC-II + - MHC-nonclassical + - null + description: Class of MHC molecule, only present for MHC:x ligand types + example: MHC-II + mhc_gene_1: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the mhc_allele_1 belongs + title: MHC gene 1 + example: + id: MRO:0000055 + label: HLA-DRA + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + mhc_allele_1: + type: string + nullable: true + description: Allele designation of the MHC alpha chain + example: HLA-DRA + mhc_gene_2: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the mhc_allele_2 belongs 
+ title: MHC gene 2 + example: + id: MRO:0000057 + label: HLA-DRB1 + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + mhc_allele_2: + type: string + nullable: true + description: > + Allele designation of the MHC class II beta chain or the invariant beta2-microglobulin chain + example: HLA-DRB1*04:01 + reactivity_method: + type: string + nullable: false + enum: + - SPR + - ITC + - ELISA + - cytometry + - biological_activity + description: The methodology used to assess expression (assay implemented in experiment) + reactivity_readout: + type: string + nullable: false + enum: + - binding_strength + - cytokine_release + - dissociation_constant_kd + - on_rate + - off_rate + - pathogen_inhibition + description: Reactivity measurement read-out + example: cytokine release + reactivity_value: + type: number + nullable: false + description: The absolute (processed) value of the measurement + example: 162.26 + reactivity_unit: + type: string + nullable: false + description: The unit of the measurement + example: pg/ml diff --git a/lang/python/airr/specs/airr-schema-openapi3.yaml b/lang/python/airr/specs/airr-schema-openapi3.yaml new file mode 100644 index 000000000..bba3a45d8 --- /dev/null +++ b/lang/python/airr/specs/airr-schema-openapi3.yaml @@ -0,0 +1,5091 @@ +# +# Schema definitions for AIRR standards objects +# +Info: + title: AIRR Schema + description: Schema definitions for AIRR standards objects + version: 1.4 + contact: + name: AIRR Community + url: https://github.com/airr-community + license: + name: Creative Commons Attribution 4.0 International + url: https://creativecommons.org/licenses/by/4.0/ + + +# Properties that are based upon an ontology use this +# standard schema definition +Ontology: + type: object + properties: + id: + type: string + nullable: true + description: CURIE of the concept, encoding the ontology and the local ID + label: + type: string + nullable: true + description: Label of the concept 
in the respective ontology + +# Map to expand CURIE prefixes to full IRIs +CURIEMap: + ABREG: + type: identifier + default: + map: ABREG + map: + ABREG: + iri_prefix: "http://antibodyregistry.org/AB_" + CHEBI: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/CHEBI_" + CL: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/CL_" + DOI: + type: identifier + default: + map: DOI + map: + DOI: + iri_prefix: "https://doi.org/" + DOID: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/DOID_" + ENA: + type: identifier + default: + map: ENA + map: + ENA: + iri_prefix: "https://www.ebi.ac.uk/ena/browser/view/" + ENSG: + type: identifier + default: + map: ENSG + map: + ENSG: + iri_prefix: "https://www.ensembl.org/Multi/Search/Results?q=" + GAZ: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/GAZ_" + IEDB_RECEPTOR: + type: identifier + default: + map: IEDB + provider: IEDB + map: + IEDB: + iri_prefix: "https://www.iedb.org/receptor/" + MRO: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/MRO_" + NCBITAXON: + type: taxonomy + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/NCBITaxon_" + BioPortal: + iri_prefix: "http://purl.bioontology.org/ontology/NCBITAXON/" + NCIT: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/NCIT_" + ORCID: + type: catalog + default: + map: ORCID + provider: ORCID + map: + ORCID: + iri_prefix: "https://orcid.org/" + ROR: + type: catalog + default: + map: ROR + provider: ROR + map: + ROR: + iri_prefix: "https://ror.org/" + SRA: + type: identifier + default: + map: SRA + map: + SRA: + iri_prefix: 
"https://trace.ncbi.nlm.nih.gov/Traces/sra/?run=" + UBERON: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/UBERON_" + UNIPROT: + type: identifier + default: + map: UNIPROT + map: + UniProt: + iri_prefix: "http://purl.uniprot.org/uniprot/" + UO: + type: ontology + default: + map: OBO + provider: OLS + map: + OBO: + iri_prefix: "http://purl.obolibrary.org/obo/UO_" + +InformationProvider: + provider: + ENA: + request: + url: "{iri}" + response: text/html + IEDB: + request: + url: "https://query-api.iedb.org/tcr_search?receptor_group_id=eq.{local_id}" + response: application/json + OLS: + request: + url: "https://www.ebi.ac.uk/ols/api/ontologies/{ontology_id}/terms?iri={iri}" + response: application/json + Ontobee: + request: + url: "http://www.ontobee.org/ontology/rdf/{ontology_id}?iri={iri}" + response: application/rdf+xml + ORCID: + request: + url: "https://pub.orcid.org/v2.1/{local_id}" + header: + Accept: application/json + response: application/json + ROR: + request: + url: "https://api.ror.org/organizations/{iri}" + response: application/json + SRA: + request: + url: "{iri}" + response: text/html + parameter: + CHEBI: + Ontobee: + ontology_id: CHEBI + OLS: + ontology_id: chebi + CL: + Ontobee: + ontology_id: CL + OLS: + ontology_id: cl + DOID: + Ontobee: + ontology_id: DOID + OLS: + ontology_id: doid + GAZ: + Ontobee: + ontology_id: GAZ + OLS: + ontology_id: gaz + MRO: + Ontobee: + ontology_id: MRO + OLS: + ontology_id: mro + NCBITAXON: + Ontobee: + ontology_id: NCBITaxon + OLS: + ontology_id: ncbitaxon + BioPortal: + ontology_id: NCBITAXON + NCIT: + Ontobee: + ontology_id: NCIT + OLS: + ontology_id: ncit + UBERON: + Ontobee: + ontology_id: UBERON + OLS: + ontology_id: uberon + UO: + Ontobee: + ontology_id: UO + OLS: + ontology_id: uo + +# AIRR specification extensions +# +# The schema definitions for AIRR standards objects is extended to +# provide a number of AIRR specific attributes. 
This schema definition +# specifies the structure, property names and data types. These +# attributes are attached to an AIRR field with the x-airr property. + +Attributes: + type: object + properties: + miairr: + type: string + description: MiAIRR requirement level. + enum: + - essential + - important + - defined + default: defined + identifier: + type: boolean + description: > + True if the field is an identifier required to link metadata and/or individual + sequence records across objects in the complete AIRR Data Model and ADC API. + default: false + adc-query-support: + type: boolean + description: > + True if an ADC API implementation must support queries on the field. + If false, query support for the field in ADC API implementations is optional. + default: false + adc-api-optional: + type: boolean + description: > + If false, repositories must implement these fields both for queries and query response. + Only applies to fields in the ADC API spec that are extensions to the AIRR Standard, + targeted at "convenience query fields" that make queries against repositories more + efficient than if queries were limited to AIRR fields only. + If true, repositories can choose to support the field or not. + default: false + deprecated: + type: boolean + description: True if the field has been deprecated from the schema. + default: false + deprecated-description: + type: string + description: Information regarding the deprecation of the field. + deprecated-replaced-by: + type: array + items: + type: string + description: The deprecated field is replaced by this list of fields. + set: + type: integer + description: MiAIRR set + subset: + type: string + description: MiAIRR subset + name: + type: string + description: MiAIRR name + format: + type: string + description: Field format. 
If null then assume the full range of the field data type + enum: + - ontology + - controlled_vocabulary + - physical_quantity + - CURIE + ontology: + type: object + description: Ontology definition for field + properties: + draft: + type: boolean + description: Indicates if ontology definition is a draft + top_node: + type: object + description: > + Concept to use as top node for ontology. Note that this must have the same CURIE namespace + as the actually annotated concept. + properties: + id: + type: string + description: CURIE for the top node term + label: + type: string + description: Ontology name for the top node term + +# AIRR Data File +# +# A JSON data file that holds Repertoire metadata, data processing +# analysis objects, or any object in the AIRR Data Model. +# +# It is presumed that the objects gathered together in an AIRR Data File are related +# or relevant to each other, e.g. part of the same study; thus, the ID fields can be +# internally resolved unless the ID contains an external PID. This implies that AIRR +# Data Files cannot be merged simply by concatenating arrays; any merge program +# would need to manage duplicate or conflicting ID values. +# +# While the properties in an AIRR Data File are not required, if one is provided then +# the value should not be null. 
+ +DataFile: + type: object + properties: + Info: + nullable: false + $ref: '#/InfoObject' + Repertoire: + type: array + nullable: false + description: List of repertoires + items: + $ref: '#/Repertoire' + RepertoireGroup: + type: array + nullable: false + description: List of repertoire collections + items: + $ref: '#/RepertoireGroup' + Rearrangement: + type: array + nullable: false + description: List of rearrangement records + items: + $ref: '#/Rearrangement' + Cell: + type: array + nullable: false + description: List of cells + items: + $ref: '#/Cell' + Clone: + type: array + nullable: false + description: List of clones + items: + $ref: '#/Clone' + GermlineSet: + type: array + nullable: false + description: List of germline sets + items: + $ref: '#/GermlineSet' + GenotypeSet: + type: array + nullable: false + description: List of genotype sets + items: + $ref: '#/GenotypeSet' + +# AIRR Info object, should be similar to openapi +# should we point to an openapi schema? +InfoObject: + type: object + description: Provides information about data and API responses. + required: + - title + - version + properties: + title: + type: string + nullable: false + version: + type: string + nullable: false + description: + type: string + nullable: true + contact: + type: object + nullable: true + properties: + name: + type: string + nullable: true + url: + type: string + nullable: true + email: + type: string + nullable: true + license: + type: object + nullable: true + required: + - name + properties: + name: + type: string + nullable: false + url: + type: string + nullable: true + +# A time point +TimePoint: + description: Time point at which an observation or other action was performed. 
+ type: object + properties: + label: + type: string + nullable: true + description: Informative label for the time point + example: Pre-operative sampling of cancer tissue + x-airr: + adc-query-support: true + value: + type: number + nullable: true + description: Value of the time point + example: -5.0 + x-airr: + adc-query-support: true + unit: + $ref: '#/Ontology' + nullable: true + description: Unit of the time point + title: Unit of immunization schedule + example: + id: UO:0000033 + label: day + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + +# +# General objects +# + +# An individual +Acknowledgement: + description: Individual whose contribution to this work should be acknowledged + type: object + required: + - acknowledgement_id + - name + - institution_name + properties: + acknowledgement_id: + type: string + description: unique identifier of this Acknowledgement within the file + x-airr: + identifier: true + miairr: important + nullable: true + name: + type: string + nullable: true + description: Full name of individual + institution_name: + type: string + nullable: true + description: Individual's department and institution name + orcid_id: + type: string + nullable: true + description: Individual's ORCID identifier + +# +# Germline gene schema +# + +# Rearranged and genomic germline sequences +RearrangedSequence: + type: object + description: > + Details of a directly observed rearranged sequence or an inference from rearranged sequences + contributing support for a gene or allele. 
+ required: + - sequence_id + - sequence + - derivation + - observation_type + - repository_name + - repository_id + - deposited_version + - seq_start + - seq_end + properties: + sequence_id: + type: string + nullable: true + description: > + Unique identifier of this RearrangedSequence within the file, typically generated by the repository + hosting the schema, for example from the underlying ID of the database record. + x-airr: + identifier: true + miairr: important + sequence: + type: string + nullable: false + x-airr: + miairr: essential + description: nucleotide sequence + derivation: + type: string + nullable: true + enum: + - DNA + - RNA + - null + description: The class of nucleic acid that was used as primary starting material + x-airr: + miairr: important + observation_type: + type: string + nullable: false + enum: + - direct_sequencing + - inference_from_repertoire + description: > + The type of observation from which this sequence was drawn, such as direct sequencing or + inference from repertoire sequencing data. 
+ x-airr: + miairr: essential + curation: + type: string + nullable: true + description: Curational notes on the sequence + repository_name: + type: string + nullable: true + x-airr: + miairr: defined + description: Name of the repository in which the sequence has been deposited + repository_ref: + type: string + nullable: true + x-airr: + miairr: defined + description: Queryable id or accession number of the sequence published by the repository + deposited_version: + type: string + nullable: true + x-airr: + miairr: defined + description: Version number of the sequence within the repository + sequence_start: + type: integer + nullable: false + x-airr: + miairr: essential + description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited + sequence_end: + type: integer + nullable: false + x-airr: + miairr: essential + description: End co-ordinate of the sequence detailed in this record, within the sequence deposited + +UnrearrangedSequence: + description: Details of an unrearranged sequence contributing support for a gene or allele + type: object + required: + - sequence_id + - sequence + - repository_name + - assembly_id + - gff_seqid + - gff_start + - gff_end + - strand + properties: + sequence_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: unique identifier of this UnrearrangedSequence within the file + sequence: + type: string + nullable: false + description: > + Sequence of interest described in this record. Typically, this will include gene and promoter region. 
+ x-airr: + miairr: essential + curation: + type: string + nullable: true + description: Curational notes on the sequence + repository_name: + type: string + nullable: true + x-airr: + miairr: defined + description: Name of the repository in which the assembly or contig is deposited + repository_ref: + type: string + nullable: true + x-airr: + miairr: defined + description: Queryable id or accession number of the sequence published by the repository + patch_no: + type: string + nullable: true + description: Genome assembly patch number in which this gene was determined + gff_seqid: + type: string + nullable: true + description: > + Sequence (from the assembly) of a window including the gene and preferably also the promoter region. + gff_start: + type: integer + nullable: true + description: > + Genomic co-ordinates of the start of the sequence of interest described in this record in + Ensemble GFF version 3. + gff_end: + type: integer + nullable: true + description: > + Genomic co-ordinates of the end of the sequence of interest described in this record in + Ensemble GFF version 3. + strand: + type: string + nullable: true + enum: + - + + - "-" + - null + description: sense (+ or -) + +# V gene delineation +SequenceDelineationV: + description: Delineation of a V-gene in a particular system + type: object + required: + - sequence_delineation_id + - delineation_scheme + - fwr1_start + - fwr1_end + - cdr1_start + - cdr1_end + - fwr2_start + - fwr2_end + - cdr2_start + - cdr2_end + - fwr3_start + - fwr3_end + - cdr3_start + properties: + sequence_delineation_id: + type: string + nullable: true + description: > + Unique identifier of this SequenceDelineationV within the file. Typically, generated by the + repository hosting the record. 
+ x-airr: + identifier: true + miairr: important + + delineation_scheme: + type: string + nullable: true + x-airr: + miairr: important + description: Name of the delineation scheme + example: Chothia + unaligned_sequence: + type: string + nullable: true + x-airr: + miairr: important + description: entire V-sequence covered by this delineation + aligned_sequence: + type: string + nullable: true + description: > + Aligned sequence if this delineation provides an alignment. An aligned sequence should always be + provided for IMGT delineations. + fwr1_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR1 start co-ordinate in the 'unaligned sequence' field + fwr1_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR1 end co-ordinate in the 'unaligned sequence' field + cdr1_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR1 start co-ordinate in the 'unaligned sequence' field + cdr1_end: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR1 end co-ordinate in the 'unaligned sequence' field + fwr2_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR2 start co-ordinate in the 'unaligned sequence' field + fwr2_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR2 end co-ordinate in the 'unaligned sequence' field + cdr2_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR2 start co-ordinate in the 'unaligned sequence' field + cdr2_end: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR2 end co-ordinate in the 'unaligned sequence' field + fwr3_start: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR3 start co-ordinate in the 'unaligned sequence' field + fwr3_end: + type: integer + nullable: true + x-airr: + miairr: important + description: FWR3 end co-ordinate in the 
'unaligned sequence' field + cdr3_start: + type: integer + nullable: true + x-airr: + miairr: important + description: CDR3 start co-ordinate in the 'unaligned sequence' field + alignment_labels: + type: array + nullable: true + items: + type: string + description: > + One string for each codon in the aligned_sequence indicating the label of that codon according to + the numbering of the delineation scheme if it provides one. + +# Description of a putative or confirmed Ig receptor gene/allele +AlleleDescription: + description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations + type: object + required: + - allele_description_id + - maintainer + - lab_address + - release_version + - release_date + - release_description + - sequence + - coding_sequence + - locus + - sequence_type + - functional + - inference_type + - species + properties: + allele_description_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + Unique identifier of this AlleleDescription within the file. Typically, generated by the + repository hosting the record. 
+ allele_description_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Unique reference to the allele description, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:IGHV1-69*01.001 + maintainer: + type: string + nullable: true + x-airr: + miairr: defined + description: Maintainer of this sequence record + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the gene description should be acknowledged + items: + $ref: '#/Acknowledgement' + lab_address: + type: string + nullable: true + x-airr: + miairr: defined + description: Institution and full address of corresponding author + release_version: + type: integer + nullable: true + x-airr: + miairr: important + description: Version number of this record, updated whenever a revised version is published or released + release_date: + type: string + nullable: true + format: date-time + x-airr: + miairr: important + description: Date of this release + title: Release Date + example: "2021-02-02" + release_description: + type: string + nullable: true + x-airr: + miairr: important + description: Brief descriptive notes of the reason for this release and the changes embodied + label: + type: string + nullable: true + x-airr: + miairr: important + description: > + The accepted name for this gene or allele following the relevant nomenclature. + The value in this field should correspond to values in acceptable name fields of other schemas, + such as v_call, d_call, and j_call fields. + example: IGHV1-69*01 + sequence: + type: string + nullable: false + x-airr: + miairr: essential + description: > + Nucleotide sequence of the gene. This should cover the full length that is available, + including where possible RSS, and 5' UTR and lead-in for V-gene sequences. 
+ coding_sequence: + type: string + nullable: true + x-airr: + miairr: important + description: > + Nucleotide sequence of the core coding region, such as the coding region of a D-, J- or C- gene + or the coding region of a V-gene excluding the leader. + aliases: + type: array + nullable: true + items: + type: string + description: Alternative names for this sequence + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + miairr: essential + chromosome: + type: integer + nullable: true + description: chromosome on which the gene is located + sequence_type: + type: string + nullable: false + enum: + - V + - D + - J + - C + description: Sequence type (V, D, J, C) + x-airr: + miairr: essential + functional: + type: boolean + nullable: true + x-airr: + miairr: important + description: True if the gene is functional, false if it is a pseudogene + inference_type: + type: string + nullable: true + enum: + - genomic_and_rearranged + - genomic_only + - rearranged_only + - null + description: Type of inference(s) from which this gene sequence was inferred + x-airr: + miairr: important + species: + $ref: '#/Ontology' + nullable: false + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: essential + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/c + species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + - null + status: + type: string + nullable: true + enum: + - active + - draft + - retired + - withdrawn + - null + description: Status of record, assumed active if the field is not present + subgroup_designation: + type: string + nullable: true + description: Identifier of the gene subgroup or clade, as (and if) defined 
+ gene_designation: + type: string + nullable: true + description: Gene number or other identifier, as (and if) defined + allele_designation: + type: string + nullable: true + description: Allele number or other identifier, as (and if) defined + allele_similarity_cluster_designation: + type: string + nullable: true + description: ID of the similarity cluster used in this germline set, if designated + allele_similarity_cluster_member_id: + type: string + nullable: true + description: Membership ID of the allele within the similarity cluster, if a cluster is designated + j_codon_frame: + type: integer + nullable: true + enum: + - 1 + - 2 + - 3 + - null + description: > + Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. + Not used for V or D genes. '1' means the sequence is in-frame, '2' means that the first bp is + missing from the first codon, and '3' means that the first 2 bp are missing. + gene_start: + type: integer + nullable: true + description: > + Co-ordinate in the sequence field of the first nucleotide in the coding_sequence field. + x-airr: + miairr: important + gene_end: + type: integer + nullable: true + description: > + Co-ordinate in the sequence field of the last gene-coding nucleotide in the coding_sequence field. + x-airr: + miairr: important + utr_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 5 prime UTR (V-genes only). + utr_5_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the 5 prime UTR (V-genes only). + leader_1_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of L-PART1 (V-genes only). + leader_1_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of L-PART1 (V-genes only). 
+ leader_2_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of L-PART2 (V-genes only). + leader_2_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of L-PART2 (V-genes only). + v_rs_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the V recombination site (V-genes only). + v_rs_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the V recombination site (V-genes only). + d_rs_3_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only). + d_rs_3_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only). + d_rs_5_prime_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only). + d_rs_5_prime_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only). + j_cdr3_end: + type: integer + nullable: true + description: > + In the case of a J-gene, the co-ordinate in the sequence field of the first nucleotide of the + conserved PHE or TRP (IMGT codon position 118). + j_rs_start: + type: integer + nullable: true + description: Start co-ordinate in the sequence field of J recombination site (J-genes only). + j_rs_end: + type: integer + nullable: true + description: End co-ordinate in the sequence field of J recombination site (J-genes only). + j_donor_splice: + type: integer + nullable: true + description: Co-ordinate in the sequence field of the final 3' nucleotide of the J-REGION (J-genes only). 
+ v_gene_delineations: + type: array + nullable: true + items: + $ref: '#/SequenceDelineationV' + unrearranged_support: + type: array + nullable: true + items: + $ref: '#/UnrearrangedSequence' + rearranged_support: + type: array + nullable: true + items: + $ref: '#/RearrangedSequence' + paralogs: + type: array + nullable: true + items: + type: string + description: Gene symbols of any paralogs + curation: + type: string + nullable: true + description: > + Curational notes on the AlleleDescription. This can be used to give more extensive notes on the + decisions taken than are provided in the release_description. + curational_tags: + type: array + nullable: true + items: + type: string + enum: + - likely_truncated + - likely_full_length + description: Controlled-vocabulary tags applied to this description + +# Collection of gene descriptions into a germline set +GermlineSet: + type: object + description: > + A germline object set bringing together multiple AlleleDescriptions from the same strain or species. + All genes in a GermlineSet should be from a single locus. + required: + - germline_set_id + - author + - lab_name + - lab_address + - release_version + - release_description + - release_date + - germline_set_name + - germline_set_ref + - species + - locus + - allele_descriptions + properties: + germline_set_id: + type: string + nullable: true + description: > + Unique identifier of the GermlineSet within this file. Typically, generated by the + repository hosting the record. 
+ x-airr: + identifier: true + miairr: important + author: + type: string + nullable: true + x-airr: + miairr: important + description: Corresponding author + lab_name: + type: string + nullable: true + x-airr: + miairr: important + description: Department of corresponding author + lab_address: + type: string + nullable: true + x-airr: + miairr: important + description: Institutional address of corresponding author + acknowledgements: + type: array + nullable: true + description: List of individuals whose contribution to the germline set should be acknowledged + items: + $ref: '#/Acknowledgement' + release_version: + type: number + nullable: true + x-airr: + miairr: important + description: Version number of this record, allocated automatically + release_description: + type: string + nullable: true + x-airr: + miairr: important + description: Brief descriptive notes of the reason for this release and the changes embodied + release_date: + type: string + nullable: true + format: date-time + x-airr: + miairr: important + description: Date of this release + title: Release Date + example: "2021-02-02" + germline_set_name: + type: string + nullable: true + x-airr: + miairr: important + description: descriptive name of this germline set + germline_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + pub_ids: + type: array + items: + type: string + nullable: true + description: Publications describing the germline set + example: ["PMID:35720344"] + species: + $ref: '#/Ontology' + nullable: false + x-airr: + miairr: essential + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + species_subgroup: + type: string + nullable: true + description: Race, strain or other species subgroup to which this subject belongs + example: BALB/c + 
species_subgroup_type: + type: string + nullable: true + enum: + - breed + - strain + - inbred + - outbred + - locational + - null + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRG + - TRD + description: Gene locus + x-airr: + miairr: essential + allele_descriptions: + type: array + nullable: true + items: + $ref: '#/AlleleDescription' + description: list of allele_descriptions in the germline set + x-airr: + miairr: important + curation: + type: string + nullable: true + description: > + Curational notes on the GermlineSet. This can be used to give more extensive notes on the + decisions taken than are provided in the release_description. + +# +# Genotype schema +# + +# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject + +GenotypeSet: + type: object + required: + - receptor_genotype_set_id + properties: + receptor_genotype_set_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + A unique identifier for this Receptor Genotype Set, typically generated by the repository + hosting the schema, for example from the underlying ID of the database record. + genotype_class_list: + description: List of Genotypes included in this Receptor Genotype Set. + type: array + nullable: true + items: + $ref: '#/Genotype' + +# Genotype of adaptive immune receptors +# This enumerates the alleles and gene deletions inferred in a single subject. 
+# Included alleles may either be listed by reference to a GermlineSet, or +# listed as 'undocumented', in which case the inferred sequence is provided + +Genotype: + type: object + required: + - receptor_genotype_id + - locus + properties: + receptor_genotype_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: > + A unique identifier within the file for this Receptor Genotype, typically generated by the + repository hosting the schema, for example from the underlying ID of the database record. + locus: + type: string + nullable: false + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + description: Gene locus + example: IGH + x-airr: + adc-query-support: true + format: controlled_vocabulary + miairr: essential + documented_alleles: + type: array + nullable: true + description: List of alleles documented in reference set(s) + items: + $ref: '#/DocumentedAllele' + x-airr: + miairr: important + undocumented_alleles: + type: array + nullable: true + description: List of alleles inferred to be present and not documented in an identified GermlineSet + items: + $ref: '#/UndocumentedAllele' + x-airr: + adc-query-support: true + deleted_genes: + type: array + nullable: true + description: Array of genes identified as being deleted in this genotype + items: + $ref: '#/DeletedGene' + x-airr: + adc-query-support: true + inference_process: + type: string + nullable: true + enum: + - genomic_sequencing + - repertoire_sequencing + - null + description: Information on how the genotype was acquired. Controlled vocabulary. 
+ title: Genotype acquisition process + example: repertoire_sequencing + x-airr: + adc-query-support: true + format: controlled_vocabulary + +# Documented Allele +# This describes a 'known' allele found in a genotype +# It is 'known' in the sense that it is documented in a reference set + +DocumentedAllele: + type: object + required: + - label + - germline_set_ref + properties: + label: + type: string + nullable: true + x-airr: + miairr: important + description: The accepted name for this allele, taken from the GermlineSet + germline_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome. + +# Undocumented Allele +# This describes an 'undocumented' allele found in a genotype +# It is 'undocumented' in the sense that it was not found in reference sets consulted for the analysis + +UndocumentedAllele: + required: + - allele_name + - sequence + type: object + properties: + allele_name: + type: string + nullable: true + description: Allele name as allocated by the inference pipeline + x-airr: + miairr: important + sequence: + type: string + nullable: false + description: nt sequence of the allele, as provided by the inference pipeline + x-airr: + miairr: essential + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome.
+ +# Deleted Gene +# It is regarded as 'deleted' in the sense that it was not identified during inference of the genotype + +DeletedGene: + required: + - label + - germline_set_ref + type: object + properties: + label: + type: string + nullable: false + description: The accepted name for this gene, taken from the GermlineSet + x-airr: + miairr: essential + germline_set_ref: + type: string + nullable: true + description: GermlineSet from which it was taken (issuer/name/version) + x-airr: + miairr: important + phasing: + type: integer + nullable: true + description: > + Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the + same chromosome. + + +# List of MHCGenotypes describing a subject's genotype +MHCGenotypeSet: + type: object + required: + - mhc_genotype_set_id + - mhc_genotype_list + properties: + mhc_genotype_set_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: A unique identifier for this MHCGenotypeSet + mhc_genotype_list: + description: List of MHCGenotypes included in this set + type: array + nullable: true + x-airr: + miairr: important + items: + $ref: '#/MHCGenotype' + +# Genotype of major histocompatibility complex (MHC) class I, class II and non-classical loci +MHCGenotype: + type: object + required: + - mhc_genotype_id + - mhc_class + - mhc_alleles + properties: + mhc_genotype_id: + type: string + nullable: true + x-airr: + identifier: true + miairr: important + description: A unique identifier for this MHCGenotype, assumed to be unique in the context of the study + mhc_class: + type: string + nullable: false + enum: + - MHC-I + - MHC-II + - MHC-nonclassical + description: Class of MHC alleles described by the MHCGenotype + example: MHC-I + x-airr: + miairr: essential + adc-query-support: true + format: controlled_vocabulary + mhc_alleles: + type: array + nullable: true + description: List of MHC alleles of the indicated mhc_class identified in an individual + 
items: + $ref: '#/MHCAllele' + x-airr: + miairr: important + adc-query-support: true + mhc_genotyping_method: + type: string + nullable: true + description: > + Information on how the genotype was determined. The content of this field should come from a list of + recommended terms provided in the AIRR Schema documentation. + title: MHC genotyping method + example: pcr_low_resolution + x-airr: + adc-query-support: true + miairr: important + + +# Allele of an MHC gene +MHCAllele: + type: object + properties: + allele_designation: + type: string + nullable: true + x-airr: + miairr: important + description: > + The accepted designation of an allele, usually its gene symbol plus allele/sub-allele/etc + identifiers, if provided by the mhc_typing method + gene: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the described allele belongs + title: MHC gene + example: + id: MRO:0000046 + label: HLA-A + x-airr: + adc-query-support: false + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + miairr: important + reference_set_ref: + type: string + nullable: true + x-airr: + miairr: important + description: Repository and list from which it was taken (issuer/name/version) + + +SubjectGenotype: + type: object + properties: + receptor_genotype_set: + nullable: true + $ref: '#/GenotypeSet' + description: Immune receptor genotype set for this subject. + mhc_genotype_set: + nullable: true + $ref: '#/MHCGenotypeSet' + description: MHC genotype set for this subject. 
+ +# +# Repertoire metadata schema +# + +# The overall study with a globally unique study_id +Study: + type: object + required: + - study_id + - study_title + - study_type + - inclusion_exclusion_criteria + - grants + - collected_by + - lab_name + - lab_address + - submitted_by + - pub_ids + - keywords_study + properties: + study_id: + type: string + nullable: true + description: > + Unique ID assigned by study registry such as one of the International Nucleotide Sequence Database + Collaboration (INSDC) repositories. + title: Study ID + example: PRJNA001 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study ID + study_title: + type: string + nullable: true + description: Descriptive study title + title: Study title + example: Effects of sun light exposure of the Treg repertoire + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study title + study_type: + $ref: '#/Ontology' + nullable: true + description: Type of study design + title: Study type + example: + id: NCIT:C15197 + label: Case-Control Study + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study type + format: ontology + ontology: + draft: false + top_node: + id: NCIT:C63536 + label: Study + study_description: + type: string + nullable: true + description: Generic study description + title: Study description + example: Longer description + x-airr: + name: Study description + adc-query-support: true + inclusion_exclusion_criteria: + type: string + nullable: true + description: List of criteria for inclusion/exclusion for the study + title: Study inclusion/exclusion criteria + example: "Include: Clinical P. 
falciparum infection; Exclude: Seropositive for HIV" + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Study inclusion/exclusion criteria + grants: + type: string + nullable: true + description: Funding agencies and grant numbers + title: Grant funding agency + example: NIH, award number R01GM987654 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Grant funding agency + study_contact: + type: string + nullable: true + description: > + Full contact information of the contact persons for this study. This should include an e-mail address + and a persistent identifier such as an ORCID ID. + title: Contact information (study) + example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + adc-query-support: true + name: Contact information (study) + collected_by: + type: string + nullable: true + description: > + Full contact information of the data collector, i.e. the person who is legally responsible for data + collection and release. This should include an e-mail address and a persistent identifier such as an + ORCID ID. + title: Contact information (data collection) + example: Dr. P.
Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Contact information (data collection) + lab_name: + type: string + nullable: true + description: Department of data collector + title: Lab name + example: Department for Planar Immunology + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Lab name + lab_address: + type: string + nullable: true + description: Institution and institutional address of data collector + title: Lab address + example: School of Medicine, Unseen University, Ankh-Morpork, Disk World + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Lab address + submitted_by: + type: string + nullable: true + description: > + Full contact information of the data depositor, i.e., the person submitting the data to a repository. + This should include an e-mail address and a persistent identifier such as an ORCID ID. This is + supposed to be a short-lived and technical role until the submission is released. + title: Contact information (data deposition) + example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Contact information (data deposition) + pub_ids: + type: array + items: + type: string + nullable: true + description: > + Array of publications describing the rationale and/or outcome of the study as an array of CURIE objects such as + a DOI or Pubmed ID. Where more than one publication is given, if there is a primary publication for the study it + should come first.
+ title: Relevant publications + example: ["PMID:29144493", "DOI:10.1038/ni.3873"] + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Relevant publications + keywords_study: + type: array + items: + type: string + enum: + - contains_ig + - contains_tr + - contains_paired_chain + - contains_schema_rearrangement + - contains_schema_clone + - contains_schema_cell + - contains_schema_receptor + - contains_schema_cellexpression + - contains_schema_receptorreactivity + nullable: true + description: > + Keywords describing properties of one or more data sets in a study. "contains_schema" keywords indicate that + the study contains data objects from the AIRR Schema of that type (Rearrangement, Clone, Cell, Receptor) while + the other keywords indicate that the study design considers the type of data indicated (e.g. it is possible to have + a study that "contains_paired_chain" but does not "contains_schema_cell"). + title: Keywords for study + example: + - contains_ig + - contains_schema_rearrangement + - contains_schema_clone + - contains_schema_cell + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: study + name: Keywords for study + format: controlled_vocabulary + adc_publish_date: + type: string + format: date-time + nullable: true + description: > + Date the study was first published in the AIRR Data Commons. + title: ADC Publish Date + example: "2021-02-02" + x-airr: + adc-query-support: true + name: ADC Publish Date + adc_update_date: + type: string + format: date-time + nullable: true + description: > + Date the study data was updated in the AIRR Data Commons. 
+ title: ADC Update Date + example: "2021-02-02" + x-airr: + adc-query-support: true + name: ADC Update Date + +# 1-to-n relationship between a study and its subjects +# subject_id is unique within a study +Subject: + type: object + required: + - subject_id + - synthetic + - species + - sex + - age_min + - age_max + - age_unit + - age_event + - ancestry_population + - ethnicity + - race + - strain_name + - linked_subjects + - link_type + properties: + subject_id: + type: string + nullable: true + description: > + Subject ID assigned by submitter, unique within study. If possible, a persistent subject ID linked to + an INSDC or similar repository study should be used. + title: Subject ID + example: SUB856413 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Subject ID + synthetic: + type: boolean + nullable: false + description: TRUE for libraries in which the diversity has been synthetically generated (e.g. phage display) + title: Synthetic library + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: subject + name: Synthetic library + species: + $ref: '#/Ontology' + nullable: false + description: Binomial designation of subject's species + title: Organism + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: subject + name: Species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + organism: + $ref: '#/Ontology' + nullable: true + description: Binomial designation of subject's species + x-airr: + deprecated: true + deprecated-description: Field was renamed to species for clarity. 
+ deprecated-replaced-by: + - species + sex: + type: string + enum: + - male + - female + - pooled + - hermaphrodite + - intersex + - null + nullable: true + description: Biological sex of subject + title: Sex + example: female + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Sex + format: controlled_vocabulary + age_min: + type: number + nullable: true + description: Specific age or lower boundary of age range. + title: Age minimum + example: 60 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age minimum + age_max: + type: number + nullable: true + description: > + Upper boundary of age range or equal to age_min for specific age. + This field should only be null if age_min is null. + title: Age maximum + example: 80 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age maximum + age_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of age range + title: Age unit + example: + id: UO:0000036 + label: year + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + age_event: + type: string + nullable: true + description: > + Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other + implementations submitters need to be aware that there is currently no mechanism to encode to potential + delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity. + title: Age event + example: enrollment + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Age event + age: + type: string + nullable: true + x-airr: + deprecated: true + deprecated-description: Split into two fields to specify as an age range. 
+ deprecated-replaced-by: + - age_min + - age_max + - age_unit + ancestry_population: + $ref: '#/Ontology' + nullable: true + description: Broad geographic origin of ancestry (continent) + title: Ancestry population + example: + id: GAZ:00000459 + label: South America + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Ancestry population + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + location_birth: + $ref: '#/Ontology' + nullable: true + description: Self-reported location of birth of the subject, preferred granularity is country-level + example: + id: GAZ:00002939 + label: Poland + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Location of birth + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + ethnicity: + type: string + nullable: true + description: Ethnic group of subject (defined as cultural/language-based membership) + title: Ethnicity + example: English, Kurds, Manchu, Yakuts (and other fields from Wikipedia) + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Ethnicity + race: + type: string + nullable: true + description: Racial group of subject (as defined by NIH) + title: Race + example: White, American Indian or Alaska Native, Black, Asian, Native Hawaiian or Other Pacific Islander, Other + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Race + strain_name: + type: string + nullable: true + description: Non-human designation of the strain or breed of animal used + title: Strain name + example: C57BL/6J + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Strain name + linked_subjects: + type: string + nullable: true + description: Subject ID to which `Relation type` refers + title: Relation to other subjects + example: SUB1355648 + x-airr: + 
miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Relation to other subjects + link_type: + type: string + nullable: true + description: Relation between subject and `linked_subjects`, can be genetic or environmental (e.g.exposure) + title: Relation type + example: father, daughter, household + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: subject + name: Relation type + diagnosis: + type: array + nullable: false + description: Diagnosis information for subject + items: + $ref: '#/Diagnosis' + x-airr: + adc-query-support: true + genotype: + nullable: true + $ref: '#/SubjectGenotype' + title: SubjectGenotype + +# 1-to-n relationship between a subject and its diagnoses +Diagnosis: + type: object + required: + - study_group_description + - disease_diagnosis + - disease_length + - disease_stage + - prior_therapies + - immunogen + - intervention + - medical_history + properties: + study_group_description: + type: string + nullable: true + description: Designation of study arm to which the subject is assigned to + title: Study group description + example: control + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Study group description + disease_diagnosis: + $ref: '#/Ontology' + nullable: true + description: Diagnosis of subject + title: Diagnosis + example: + id: DOID:9538 + label: multiple myeloma + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Diagnosis + format: ontology + ontology: + draft: false + top_node: + id: DOID:4 + label: disease + disease_length: + type: string + nullable: true + description: Time duration between initial diagnosis and current intervention + title: Length of disease + example: 23 months + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Length of disease + format: physical_quantity + disease_stage: + type: 
string + nullable: true + description: Stage of disease at current intervention + title: Disease stage + example: Stage II + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Disease stage + prior_therapies: + type: string + nullable: true + description: List of all relevant previous therapies applied to subject for treatment of `Diagnosis` + title: Prior therapies for primary disease under study + example: melphalan/prednisone + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Prior therapies for primary disease under study + immunogen: + type: string + nullable: true + description: Antigen, vaccine or drug applied to subject at this intervention + title: Immunogen/agent + example: bortezomib + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Immunogen/agent + intervention: + type: string + nullable: true + description: Description of intervention + title: Intervention definition + example: systemic chemotherapy, 6 cycles, 1.25 mg/m2 + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Intervention definition + medical_history: + type: string + nullable: true + description: Medical history of subject that is relevant to assess the course of disease and/or treatment + title: Other relevant medical history + example: MGUS, first diagnosed 5 years prior + x-airr: + miairr: important + adc-query-support: true + set: 1 + subset: diagnosis and intervention + name: Other relevant medical history + +# 1-to-n relationship between a subject and its samples +# sample_id is unique within a study +Sample: + type: object + required: + - sample_id + - sample_type + - tissue + - anatomic_site + - disease_state_sample + - collection_time_point_relative + - collection_time_point_relative_unit + - collection_time_point_reference + - biomaterial_provider 
+ properties: + sample_id: + type: string + nullable: true + description: > + Sample ID assigned by submitter, unique within study. If possible, a persistent sample ID linked to + INSDC or similar repository study should be used. + title: Biological sample ID + example: SUP52415 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Biological sample ID + sample_type: + type: string + nullable: true + description: The way the sample was obtained, e.g. fine-needle aspirate, organ harvest, peripheral venous puncture + title: Sample type + example: Biopsy + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample type + tissue: + $ref: '#/Ontology' + nullable: true + description: The actual tissue sampled, e.g. lymph node, liver, peripheral blood + title: Tissue + example: + id: UBERON:0002371 + label: bone marrow + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Tissue + format: ontology + ontology: + draft: false + top_node: + id: UBERON:0010000 + label: multicellular anatomical structure + anatomic_site: + type: string + nullable: true + description: The anatomic location of the tissue, e.g. 
Inguinal, femur + title: Anatomic site + example: Iliac crest + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Anatomic site + disease_state_sample: + type: string + nullable: true + description: Histopathologic evaluation of the sample + title: Disease state of sample + example: Tumor infiltration + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Disease state of sample + collection_time_point_relative: + type: number + nullable: true + description: Time point at which sample was taken, relative to `Collection time event` + title: Sample collection time + example: 14 + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample collection time + collection_time_point_relative_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of Sample collection time + title: Sample collection time unit + example: + id: UO:0000033 + label: day + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Sample collection time unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000003 + label: time unit + collection_time_point_reference: + type: string + nullable: true + description: Event in the study schedule to which `Sample collection time` relates to + title: Collection time event + example: Primary vaccination + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Collection time event + collection_location: + $ref: '#/Ontology' + nullable: true + description: Location where the sample was taken, preferred granularity is country-level + title: Sample collection location + example: + id: GAZ:00002939 + label: Poland + x-airr: + miairr: important + set: 2 + subset: sample + name: Sample collection location + format: ontology + ontology: + draft: true + top_node: + id: GAZ:00000448 + label: geographic location + biomaterial_provider: + type: string + nullable: true + 
description: Name and address of the entity providing the sample + title: Biomaterial provider + example: Tissues-R-Us, Tampa, FL, USA + x-airr: + miairr: important + adc-query-support: true + set: 2 + subset: sample + name: Biomaterial provider + +# 1-to-n relationship between a sample and processing of its cells +CellProcessing: + type: object + required: + - tissue_processing + - cell_subset + - cell_phenotype + - single_cell + - cell_number + - cells_per_reaction + - cell_storage + - cell_quality + - cell_isolation + - cell_processing_protocol + properties: + tissue_processing: + type: string + nullable: true + description: Enzymatic digestion and/or physical methods used to isolate cells from sample + title: Tissue processing + example: Collagenase A/Dnase I digested, followed by Percoll gradient + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Tissue processing + cell_subset: + $ref: '#/Ontology' + nullable: true + description: Commonly-used designation of isolated cell population + title: Cell subset + example: + id: CL:0000972 + label: class switched memory B cell + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell subset + format: ontology + ontology: + draft: false + top_node: + id: CL:0000542 + label: lymphocyte + cell_phenotype: + type: string + nullable: true + description: List of cellular markers and their expression levels used to isolate the cell population + title: Cell subset phenotype + example: CD19+ CD38+ CD27+ IgM- IgD- + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell subset phenotype + cell_species: + $ref: '#/Ontology' + nullable: true + description: > + Binomial designation of the species from which the analyzed cells originate. Typically, this value + should be identical to `species`, in which case it SHOULD NOT be set explicitly. 
However, there are + valid experimental setups in which the two might differ, e.g., chimeric animal models. If set, this + key will overwrite the `species` information for all lower layers of the schema. + title: Cell species + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: defined + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + single_cell: + type: boolean + nullable: true + description: TRUE if single cells were isolated into separate compartments + title: Single-cell sort + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Single-cell sort + cell_number: + type: integer + nullable: true + description: Total number of cells that went into the experiment + title: Number of cells in experiment + example: 1000000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Number of cells in experiment + cells_per_reaction: + type: integer + nullable: true + description: Number of cells for each biological replicate + title: Number of cells per sequencing reaction + example: 50000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Number of cells per sequencing reaction + cell_storage: + type: boolean + nullable: true + description: TRUE if cells were cryo-preserved between isolation and further processing + title: Cell storage + example: TRUE + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell storage + cell_quality: + type: string + nullable: true + description: Relative amount of viable cells after preparation and (if applicable) thawing + title: Cell quality + example: 90% viability as determined by 7-AAD + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell quality + 
cell_isolation: + type: string + nullable: true + description: Description of the procedure used for marker-based isolation or enrich cells + title: Cell isolation / enrichment procedure + example: > + Cells were stained with fluorochrome labeled antibodies and then sorted on a FlowMerlin (CE) cytometer. + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Cell isolation / enrichment procedure + cell_processing_protocol: + type: string + nullable: true + description: > + Description of the methods applied to the sample including cell preparation/ isolation/enrichment and + nucleic acid extraction. This should closely mirror the Materials and methods section in the manuscript. + title: Processing protocol + example: Stimulated wih anti-CD3/anti-CD28 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (cell) + name: Processing protocol + +# object for PCR primer targets +PCRTarget: + type: object + required: + - pcr_target_locus + - forward_pcr_primer_target_location + - reverse_pcr_primer_target_location + properties: + pcr_target_locus: + type: string + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + - null + nullable: true + description: > + Designation of the target locus. Note that this field uses a controlled vocubulary that is meant to + provide a generic classification of the locus, not necessarily the correct designation according to + a specific nomenclature. 
+ title: Target locus for PCR + example: IGK + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Target locus for PCR + format: controlled_vocabulary + forward_pcr_primer_target_location: + type: string + nullable: true + description: Position of the most distal nucleotide templated by the forward primer or primer mix + title: Forward PCR primer target location + example: IGHV, +23 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Forward PCR primer target location + reverse_pcr_primer_target_location: + type: string + nullable: true + description: Position of the most proximal nucleotide templated by the reverse primer or primer mix + title: Reverse PCR primer target location + example: IGHG, +57 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid [pcr]) + name: Reverse PCR primer target location + +# generally, a 1-to-1 relationship between a CellProcessing and processing of its nucleic acid +# but may be 1-to-n for technical replicates. 
+NucleicAcidProcessing: + type: object + required: + - template_class + - template_quality + - template_amount + - template_amount_unit + - library_generation_method + - library_generation_protocol + - library_generation_kit_version + - complete_sequences + - physical_linkage + properties: + template_class: + type: string + enum: + - DNA + - RNA + nullable: false + description: > + The class of nucleic acid that was used as primary starting material for the following procedures + title: Target substrate + example: RNA + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Target substrate + format: controlled_vocabulary + template_quality: + type: string + nullable: true + description: Description and results of the quality control performed on the template material + title: Target substrate quality + example: RIN 9.2 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Target substrate quality + template_amount: + type: number + nullable: true + description: Amount of template that went into the process + title: Template amount + example: 1000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Template amount + template_amount_unit: + $ref: '#/Ontology' + nullable: true + description: Unit of template amount + title: Template amount time unit + example: + id: UO:0000024 + label: nanogram + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Template amount time unit + format: ontology + ontology: + draft: false + top_node: + id: UO:0000002 + label: physical quantity + library_generation_method: + type: string + enum: + - "PCR" + - "RT(RHP)+PCR" + - "RT(oligo-dT)+PCR" + - "RT(oligo-dT)+TS+PCR" + - "RT(oligo-dT)+TS(UMI)+PCR" + - "RT(specific)+PCR" + - "RT(specific)+TS+PCR" + - "RT(specific)+TS(UMI)+PCR" + - "RT(specific+UMI)+PCR" + - "RT(specific+UMI)+TS+PCR" 
+ - "RT(specific)+TS" + - "other" + nullable: false + description: Generic type of library generation + title: Library generation method + example: RT(oligo-dT)+TS(UMI)+PCR + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Library generation method + format: controlled_vocabulary + library_generation_protocol: + type: string + nullable: true + description: Description of processes applied to substrate to obtain a library that is ready for sequencing + title: Library generation protocol + example: cDNA was generated using + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Library generation protocol + library_generation_kit_version: + type: string + nullable: true + description: When using a library generation protocol from a commercial provider, provide the protocol version number + title: Protocol IDs + example: v2.1 (2016-09-15) + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Protocol IDs + pcr_target: + type: array + nullable: false + description: > + If a PCR step was performed that specifically targets the IG/TR loci, the target and primer locations + need to be provided here. This field holds an array of PCRTarget objects, so that multiplex PCR setups + amplifying multiple loci at the same time can be annotated using one record per locus. PCR setups not + targeting any specific locus must not annotate this field but select the appropriate + library_generation_method instead. + items: + $ref: '#/PCRTarget' + x-airr: + adc-query-support: true + complete_sequences: + type: string + enum: + - partial + - complete + - "complete+untemplated" + - mixed + nullable: false + description: > + To be considered `complete`, the procedure used for library construction MUST generate sequences that + 1) include the first V gene codon that encodes the mature polypeptide chain (i.e. 
after the + leader sequence) and 2) include the last complete codon of the J gene (i.e. 1 bp 5' of the J->C + splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered + `complete & untemplated`, the sections of the sequences defined in points 1) to 3) of the previous + sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation. + `mixed` should only be used if the procedure used for library construction will likely produce multiple + categories of sequences in the given experiment. It SHOULD NOT be used as a replacement of a NULL value. + title: Complete sequences + example: partial + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Complete sequences + format: controlled_vocabulary + physical_linkage: + type: string + enum: + - none + - "hetero_head-head" + - "hetero_tail-head" + - "hetero_prelinked" + nullable: false + description: > + In case an experimental setup is used that physically links nucleic acids derived from distinct + `Rearrangements` before library preparation, this field describes the mode of that linkage. All + `hetero_*` terms indicate that in case of paired-read sequencing, the two reads should be expected + to map to distinct IG/TR loci. `*_head-head` refers to techniques that link the 5' ends of transcripts + in a single-cell context. `*_tail-head` refers to techniques that link the 3' end of one transcript to + the 5' end of another one in a single-cell context. This term does not provide any information whether + a continuous reading-frame between the two is generated. `*_prelinked` refers to constructs in which + the linkage was already present on the DNA level (e.g. scFv). 
+ title: Physical linkage of different rearrangements + example: hetero_head-head + x-airr: + miairr: essential + adc-query-support: true + set: 3 + subset: process (nucleic acid) + name: Physical linkage of different rearrangements + format: controlled_vocabulary + +# 1-to-n relationship between a NucleicAcidProcessing and SequencingRun with resultant raw sequence file(s) +SequencingRun: + type: object + required: + - sequencing_run_id + - total_reads_passing_qc_filter + - sequencing_platform + - sequencing_facility + - sequencing_run_date + - sequencing_kit + properties: + sequencing_run_id: + type: string + nullable: true + description: ID of sequencing run assigned by the sequencing facility + title: Batch number + example: 160101_M01234 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Batch number + total_reads_passing_qc_filter: + type: integer + nullable: true + description: Number of usable reads for analysis + title: Total reads passing QC filter + example: 10365118 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Total reads passing QC filter + sequencing_platform: + type: string + nullable: true + description: Designation of sequencing instrument used + title: Sequencing platform + example: Alumina LoSeq 1000 + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing platform + sequencing_facility: + type: string + nullable: true + description: Name and address of sequencing facility + title: Sequencing facility + example: Seqs-R-Us, Vancouver, BC, Canada + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing facility + sequencing_run_date: + type: string + nullable: true + description: Date of sequencing run + title: Date of sequencing run + format: date + example: 2016-12-16 + x-airr: + miairr: important + 
adc-query-support: true + set: 3 + subset: process (sequencing) + name: Date of sequencing run + sequencing_kit: + type: string + nullable: true + description: Name, manufacturer, order and lot numbers of sequencing kit + title: Sequencing kit + example: "FullSeq 600, Alumina, #M123456C0, 789G1HK" + x-airr: + miairr: important + adc-query-support: true + set: 3 + subset: process (sequencing) + name: Sequencing kit + sequencing_files: + $ref: '#/SequencingData' + nullable: false + description: Set of sequencing files produced by the sequencing run + x-airr: + adc-query-support: true + +# Resultant raw sequencing files from a SequencingRun +SequencingData: + type: object + required: + - sequencing_data_id + - file_type + - filename + - read_direction + - read_length + - paired_filename + - paired_read_direction + - paired_read_length + properties: + sequencing_data_id: + type: string + nullable: true + description: > + Persistent identifier of raw data stored in an archive (e.g. INSDC run ID). Data archive should + be identified in the CURIE prefix. + title: Raw sequencing data persistent identifier + example: "SRA:SRR11610494" + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + format: CURIE + file_type: + type: string + nullable: true + description: File format for the raw reads or sequences + title: Raw sequencing data file type + enum: + - fasta + - fastq + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Raw sequencing data file type + format: controlled_vocabulary + filename: + type: string + nullable: true + description: File name for the raw reads or sequences. The first file in paired-read sequencing. 
+ title: Raw sequencing data file name + example: MS10R-NMonson-C7JR9_S1_R1_001.fastq + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Raw sequencing data file name + read_direction: + type: string + nullable: true + description: Read direction for the raw reads or sequences. The first file in paired-read sequencing. + title: Read direction + example: forward + enum: + - forward + - reverse + - mixed + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Read direction + format: controlled_vocabulary + read_length: + type: integer + nullable: true + description: Read length in bases for the first file in paired-read sequencing + title: Forward read length + example: 300 + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Forward read length + paired_filename: + type: string + nullable: true + description: File name for the second file in paired-read sequencing + title: Paired raw sequencing data file name + example: MS10R-NMonson-C7JR9_S1_R2_001.fastq + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired raw sequencing data file name + paired_read_direction: + type: string + nullable: true + description: Read direction for the second file in paired-read sequencing + title: Paired read direction + example: reverse + enum: + - forward + - reverse + - mixed + - null + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired read direction + format: controlled_vocabulary + paired_read_length: + type: integer + nullable: true + description: Read length in bases for the second file in paired-read sequencing + title: Paired read length + example: 300 + x-airr: + miairr: important + adc-query-support: true + set: 4 + subset: data (raw reads) + name: Paired read length + index_filename: + type: string + nullable: true + 
description: File name for the index file + title: Sequencing index file name + example: MS10R-NMonson-C7JR9_S1_R3_001.fastq + x-airr: + adc-query-support: true + index_length: + type: integer + nullable: true + description: Read length in bases for the index file + title: Index read length + example: 8 + x-airr: + adc-query-support: true + +# 1-to-n relationship between a repertoire and data processing +# +# Set of annotated rearrangement sequences produced by +# data processing upon the raw sequence data for a repertoire. +DataProcessing: + type: object + required: + - software_versions + - paired_reads_assembly + - quality_thresholds + - primer_match_cutoffs + - collapsing_method + - data_processing_protocols + - germline_database + properties: + data_processing_id: + type: string + nullable: true + description: Identifier for the data processing object. + title: Data processing ID + x-airr: + name: Data processing ID + adc-query-support: true + identifier: true + primary_annotation: + type: boolean + default: false + nullable: false + description: > + If true, indicates this is the primary or default data processing for + the repertoire and its rearrangements. If false, indicates this is a secondary + or additional data processing. 
+ title: Primary annotation + x-airr: + adc-query-support: true + identifier: true + software_versions: + type: string + nullable: true + description: Version number and / or date, include company pipelines + title: Software tools and version numbers + example: IgBLAST 1.6 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Software tools and version numbers + paired_reads_assembly: + type: string + nullable: true + description: How paired end reads were assembled into a single receptor sequence + title: Paired read assembly + example: PandaSeq (minimal overlap 50, threshold 0.8) + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Paired read assembly + quality_thresholds: + type: string + nullable: true + description: How/if sequences were removed from (4) based on base quality scores + title: Quality thresholds + example: Average Phred score >=20 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Quality thresholds + primer_match_cutoffs: + type: string + nullable: true + description: How primers were identified in the sequences, were they removed/masked/etc? + title: Primer match cutoffs + example: Hamming distance <= 2 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Primer match cutoffs + collapsing_method: + type: string + nullable: true + description: The method used for combining multiple sequences from (4) into a single sequence in (5) + title: Collapsing method + example: MUSCLE 3.8.31 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Collapsing method + data_processing_protocols: + type: string + nullable: true + description: General description of how QC is performed + title: Data processing protocols + example: Data was processed using [...] 
+ x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: process (computational) + name: Data processing protocols + data_processing_files: + type: array + items: + type: string + nullable: true + description: Array of file names for data produced by this data processing. + title: Processed data file names + example: + - 'ERR1278153_aa.txz' + - 'ERR1278153_ab.txz' + - 'ERR1278153_ac.txz' + x-airr: + adc-query-support: true + name: Processed data file names + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + title: V(D)J germline reference database + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + miairr: important + adc-query-support: true + set: 5 + subset: data (processed sequence) + name: V(D)J germline reference database + germline_set_ref: + type: string + nullable: true + description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version) + example: OGRDB:Human_IGH:2021.11 + x-airr: + adc-query-support: true + analysis_provenance_id: + type: string + nullable: true + description: Identifier for machine-readable PROV model of analysis provenance + title: Analysis provenance ID + x-airr: + adc-query-support: true + +SampleProcessing: + allOf: + - type: object + properties: + sample_processing_id: + type: string + nullable: true + description: > + Identifier for the sample processing object. This field should be unique within the repertoire. + This field can be used to uniquely identify the combination of sample, cell processing, + nucleic acid processing and sequencing run information for the repertoire. 
+ title: Sample processing ID + x-airr: + name: Sample processing ID + adc-query-support: true + identifier: true + - $ref: '#/Sample' + - $ref: '#/CellProcessing' + - $ref: '#/NucleicAcidProcessing' + - $ref: '#/SequencingRun' + + +# The composite schema for the repertoire object +# +# This represents a sample repertoire as defined by the study +# and experimentally observed by raw sequence data. A repertoire +# can only be for one subject but may include multiple samples. +Repertoire: + type: object + required: + - study + - subject + - sample + - data_processing + properties: + repertoire_id: + type: string + nullable: true + description: > + Identifier for the repertoire object. This identifier should be globally unique so that repertoires + from multiple studies can be combined together without conflict. The repertoire_id is used to link + other AIRR data to a Repertoire. Specifically, the Rearrangements Schema includes repertoire_id for + referencing the specific Repertoire for that Rearrangement. 
+ title: Repertoire ID + x-airr: + adc-query-support: true + identifier: true + repertoire_name: + type: string + nullable: true + description: Short generic display name for the repertoire + title: Repertoire name + x-airr: + name: Repertoire name + adc-query-support: true + repertoire_description: + type: string + nullable: true + description: Generic repertoire description + title: Repertoire description + x-airr: + name: Repertoire description + adc-query-support: true + study: + $ref: '#/Study' + nullable: false + description: Study object + x-airr: + adc-query-support: true + subject: + $ref: '#/Subject' + nullable: false + description: Subject object + x-airr: + adc-query-support: true + sample: + type: array + nullable: false + description: List of Sample Processing objects + items: + $ref: '#/SampleProcessing' + x-airr: + adc-query-support: true + data_processing: + type: array + nullable: false + description: List of Data Processing objects + items: + $ref: '#/DataProcessing' + x-airr: + adc-query-support: true + +# A collection of repertoires for analysis purposes, includes optional time course +RepertoireGroup: + type: object + required: + - repertoire_group_id + - repertoires + properties: + repertoire_group_id: + type: string + nullable: true + description: Identifier for this repertoire collection + x-airr: + identifier: true + repertoire_group_name: + type: string + nullable: true + description: Short display name for this repertoire collection + repertoire_group_description: + type: string + nullable: true + description: Repertoire collection description + repertoires: + type: array + nullable: true + description: > + List of repertoires in this collection with an associated description and time point designation + items: + type: object + properties: + repertoire_id: + type: string + nullable: false + description: Identifier to the repertoire + x-airr: + adc-query-support: true + repertoire_description: + type: string + nullable: true + 
description: Description of this repertoire within the group + x-airr: + adc-query-support: true + time_point: + $ref: '#/TimePoint' + nullable: true + description: Time point designation for this repertoire within the group + x-airr: + adc-query-support: true + +Alignment: + type: object + required: + - sequence_id + - segment + - call + - score + - cigar + properties: + sequence_id: + type: string + nullable: true + description: > + Unique query sequence identifier within the file. Most often this will be the input sequence + header or a substring thereof, but may also be a custom identifier defined by the tool in + cases where query sequences have been combined in some fashion prior to alignment. + x-airr: + identifier: true + segment: + type: string + nullable: true + description: > + The segment for this alignment. One of V, D, J or C. + rev_comp: + type: boolean + nullable: true + description: > + Alignment result is from the reverse complement of the query sequence. + call: + type: string + nullable: true + description: > + Gene assignment with allele. + score: + type: number + nullable: true + description: > + Alignment score. + identity: + type: number + nullable: true + description: > + Alignment fractional identity. + support: + type: number + nullable: true + description: > + Alignment E-value, p-value, likelihood, probability or other similar measure of + support for the gene assignment as defined by the alignment tool. + cigar: + type: string + nullable: true + description: > + Alignment CIGAR string. + sequence_start: + type: integer + nullable: true + description: > + Start position of the segment in the query sequence (1-based closed interval). + sequence_end: + type: integer + nullable: true + description: > + End position of the segment in the query sequence (1-based closed interval). + germline_start: + type: integer + nullable: true + description: > + Alignment start position in the reference sequence (1-based closed interval). 
+ germline_end: + type: integer + nullable: true + description: > + Alignment end position in the reference sequence (1-based closed interval). + rank: + type: integer + nullable: true + description: > + Alignment rank. + rearrangement_id: + type: string + nullable: true + description: > + Identifier for the Rearrangement object. May be identical to sequence_id, + but will usually be a universally unique record locator for database applications. + x-airr: + deprecated: true + deprecated-description: Field has been merged with sequence_id to avoid confusion. + deprecated-replaced-by: + - sequence_id + data_processing_id: + type: string + nullable: true + description: > + Identifier to the data processing object in the repertoire metadata + for this rearrangement. If this field is empty than the primary data processing object is assumed. + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + deprecated: true + deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication. + deprecated-replaced-by: + - "DataProcessing:germline_database" + + +# The extended rearrangement object +Rearrangement: + type: object + required: + - sequence_id + - sequence + - rev_comp + - productive + - v_call + - d_call + - j_call + - sequence_alignment + - germline_alignment + - junction + - junction_aa + - v_cigar + - d_cigar + - j_cigar + properties: + sequence_id: + type: string + nullable: true + description: > + Unique query sequence identifier for the Rearrangement. Most often this will be the input sequence + header or a substring thereof, but may also be a custom identifier defined by the tool in + cases where query sequences have been combined in some fashion prior to alignment. 
When + downloaded from an AIRR Data Commons repository, this will usually be a universally unique + record locator for linking with other objects in the AIRR Data Model. + x-airr: + adc-query-support: true + identifier: true + sequence: + type: string + nullable: true + description: > + The query nucleotide sequence. Usually, this is the unmodified input sequence, which may be + reverse complemented if necessary. In some cases, this field may contain consensus sequences or + other types of collapsed input sequences if these steps are performed prior to alignment. + quality: + type: string + nullable: true + description: > + The Sanger/Phred quality scores for assessment of sequence quality. + Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (Used by Illumina from v1.8.) + sequence_aa: + type: string + nullable: true + description: > + Amino acid translation of the query nucleotide sequence. + rev_comp: + type: boolean + nullable: true + description: > + True if the alignment is on the opposite strand (reverse complemented) with respect to the + query sequence. If True then all output data, such as alignment coordinates and sequences, + are based on the reverse complement of 'sequence'. + productive: + type: boolean + nullable: true + description: > + True if the V(D)J sequence is predicted to be productive. + x-airr: + adc-query-support: true + vj_in_frame: + type: boolean + nullable: true + description: True if the V and J gene alignments are in-frame. + stop_codon: + type: boolean + nullable: true + description: True if the aligned sequence contains a stop codon. + complete_vdj: + type: boolean + nullable: true + description: > + True if the sequence alignment spans the entire V(D)J region. Meaning, + sequence_alignment includes both the first V gene codon that encodes the + mature polypeptide chain (i.e., after the leader sequence) and the last + complete codon of the J gene (i.e., before the J-C splice site). 
+ This does not require an absence of deletions within the internal + FWR and CDR regions of the alignment. + locus: + type: string + enum: + - IGH + - IGI + - IGK + - IGL + - TRA + - TRB + - TRD + - TRG + - null + nullable: true + description: > + Gene locus (chain type). Note that this field uses a controlled vocabulary that is meant to provide a + generic classification of the locus, not necessarily the correct designation according to a specific + nomenclature. + title: Gene locus + example: IGH + x-airr: + adc-query-support: true + name: Gene locus + format: controlled_vocabulary + locus_species: + $ref: '#/Ontology' + nullable: true + description: > + Binomial designation of the species from which the locus originates. Typically, this value should be + identical to `organism`, if which case it SHOULD NOT be set explicitly. However, there are valid + experimental setups in which the two might differ, e.g. transgenic animal models. If set, this key + will overwrite the `organism` information for all lower layers of the schema. + title: Locus species + example: + id: NCBITAXON:9606 + label: Homo sapiens + x-airr: + miairr: defined + adc-query-support: true + name: Locus species + format: ontology + ontology: + draft: false + top_node: + id: NCBITAXON:7776 + label: Gnathostomata + v_call: + type: string + nullable: true + description: > + V gene with allele. If referring to a known reference sequence in a database + the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB). + title: V gene with allele + example: IGHV4-59*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: V gene with allele + d_call: + type: string + nullable: true + description: > + First or only D gene with allele. If referring to a known reference sequence in a database + the relevant gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB). 
+ title: D gene with allele + example: IGHD3-10*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: D gene with allele + d2_call: + type: string + nullable: true + description: > + Second D gene with allele. If referring to a known reference sequence in a database the relevant + gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB). + example: IGHD3-10*01 + j_call: + type: string + nullable: true + description: > + J gene with allele. If referring to a known reference sequence in a database the relevant + gene/allele nomenclature should be followed (e.g., IGHJ4*02 if using IMGT/GENE-DB). + title: J gene with allele + example: IGHJ4*02 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: J gene with allele + c_call: + type: string + nullable: true + description: > + Constant region gene with allele. If referring to a known reference sequence in a database the + relevant gene/allele nomenclature should be followed (e.g., IGHG1*01 if using IMGT/GENE-DB). + title: C region + example: IGHG1*01 + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: C region + sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence, including any indel corrections or numbering spacers, + such as IMGT-gaps. Typically, this will include only the V(D)J region, but that is not + a requirement. + quality_alignment: + type: string + nullable: true + description: > + Sanger/Phred quality scores for assessment of sequence_alignment quality. + Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (Used by Illumina from v1.8.) + sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the aligned query sequence. 
+ germline_alignment: + type: string + nullable: true + description: > + Assembled, aligned, full-length inferred germline sequence spanning the same region + as the sequence_alignment field (typically the V(D)J region) and including the same set + of corrections and spacers (if any). + germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the assembled germline sequence. + junction: + type: string + nullable: true + description: > + Junction region nucleotide sequence, where the junction is defined as + the CDR3 plus the two flanking conserved codons. + title: IMGT-JUNCTION nucleotide sequence + example: TGTGCAAGAGCGGGAGTTTACGACGGATATACTATGGACTACTGG + x-airr: + miairr: important + set: 6 + subset: data (processed sequence) + name: IMGT-JUNCTION nucleotide sequence + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction. + title: IMGT-JUNCTION amino acid sequence + example: CARAGVYDGYTMDYW + x-airr: + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: IMGT-JUNCTION amino acid sequence + np1: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between the V gene and + first D gene alignment or between the V gene and J gene alignments. + np1_aa: + type: string + nullable: true + description: > + Amino acid translation of the np1 field. + np2: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between either the first D gene and J gene + alignments or the first D gene and second D gene alignments. + np2_aa: + type: string + nullable: true + description: > + Amino acid translation of the np2 field. + np3: + type: string + nullable: true + description: > + Nucleotide sequence of the combined N/P region between the second D gene + and J gene alignments. 
+ np3_aa: + type: string + nullable: true + description: > + Amino acid translation of the np3 field. + cdr1: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR1 region. + cdr1_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr1 field. + cdr2: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR2 region. + cdr2_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr2 field. + cdr3: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned CDR3 region. + cdr3_aa: + type: string + nullable: true + description: > + Amino acid translation of the cdr3 field. + fwr1: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR1 region. + fwr1_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr1 field. + fwr2: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR2 region. + fwr2_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr2 field. + fwr3: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR3 region. + fwr3_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr3 field. + fwr4: + type: string + nullable: true + description: > + Nucleotide sequence of the aligned FWR4 region. + fwr4_aa: + type: string + nullable: true + description: > + Amino acid translation of the fwr4 field. + v_score: + type: number + nullable: true + description: Alignment score for the V gene. + v_identity: + type: number + nullable: true + description: Fractional identity for the V gene alignment. + v_support: + type: number + nullable: true + description: > + V gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the V gene assignment as defined by the alignment tool. 
+ v_cigar: + type: string + nullable: true + description: CIGAR string for the V gene alignment. + d_score: + type: number + nullable: true + description: Alignment score for the first or only D gene alignment. + d_identity: + type: number + nullable: true + description: Fractional identity for the first or only D gene alignment. + d_support: + type: number + nullable: true + description: > + D gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the first or only D gene as defined by the alignment tool. + d_cigar: + type: string + nullable: true + description: CIGAR string for the first or only D gene alignment. + d2_score: + type: number + nullable: true + description: Alignment score for the second D gene alignment. + d2_identity: + type: number + nullable: true + description: Fractional identity for the second D gene alignment. + d2_support: + type: number + nullable: true + description: > + D gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the second D gene as defined by the alignment tool. + d2_cigar: + type: string + nullable: true + description: CIGAR string for the second D gene alignment. + j_score: + type: number + nullable: true + description: Alignment score for the J gene alignment. + j_identity: + type: number + nullable: true + description: Fractional identity for the J gene alignment. + j_support: + type: number + nullable: true + description: > + J gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the J gene assignment as defined by the alignment tool. + j_cigar: + type: string + nullable: true + description: CIGAR string for the J gene alignment. + c_score: + type: number + nullable: true + description: Alignment score for the C gene alignment. + c_identity: + type: number + nullable: true + description: Fractional identity for the C gene alignment. 
+ c_support: + type: number + nullable: true + description: > + C gene alignment E-value, p-value, likelihood, probability or other similar measure of + support for the C gene assignment as defined by the alignment tool. + c_cigar: + type: string + nullable: true + description: CIGAR string for the C gene alignment. + v_sequence_start: + type: integer + nullable: true + description: > + Start position of the V gene in the query sequence (1-based closed interval). + v_sequence_end: + type: integer + nullable: true + description: > + End position of the V gene in the query sequence (1-based closed interval). + v_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the V gene reference sequence (1-based closed interval). + v_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the V gene reference sequence (1-based closed interval). + v_alignment_start: + type: integer + nullable: true + description: > + Start position of the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + v_alignment_end: + type: integer + nullable: true + description: > + End position of the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_sequence_start: + type: integer + nullable: true + description: > + Start position of the first or only D gene in the query sequence. + (1-based closed interval). + d_sequence_end: + type: integer + nullable: true + description: > + End position of the first or only D gene in the query sequence. + (1-based closed interval). + d_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the D gene reference sequence for the first or only + D gene (1-based closed interval). 
+ d_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the D gene reference sequence for the first or only + D gene (1-based closed interval). + d_alignment_start: + type: integer + nullable: true + description: > + Start position of the first or only D gene in both the sequence_alignment + and germline_alignment fields (1-based closed interval). + d_alignment_end: + type: integer + nullable: true + description: > + End position of the first or only D gene in both the sequence_alignment + and germline_alignment fields (1-based closed interval). + d2_sequence_start: + type: integer + nullable: true + description: > + Start position of the second D gene in the query sequence (1-based closed interval). + d2_sequence_end: + type: integer + nullable: true + description: > + End position of the second D gene in the query sequence (1-based closed interval). + d2_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the second D gene reference sequence (1-based closed interval). + d2_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the second D gene reference sequence (1-based closed interval). + d2_alignment_start: + type: integer + nullable: true + description: > + Start position of the second D gene alignment in both the sequence_alignment and + germline_alignment fields (1-based closed interval). + d2_alignment_end: + type: integer + nullable: true + description: > + End position of the second D gene alignment in both the sequence_alignment and + germline_alignment fields (1-based closed interval). + j_sequence_start: + type: integer + nullable: true + description: > + Start position of the J gene in the query sequence (1-based closed interval). + j_sequence_end: + type: integer + nullable: true + description: > + End position of the J gene in the query sequence (1-based closed interval). 
+ j_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the J gene reference sequence (1-based closed interval). + j_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the J gene reference sequence (1-based closed interval). + j_alignment_start: + type: integer + nullable: true + description: > + Start position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_end: + type: integer + nullable: true + description: > + End position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + c_sequence_start: + type: integer + nullable: true + description: > + Start position of the C gene in the query sequence (1-based closed interval). + c_sequence_end: + type: integer + nullable: true + description: > + End position of the C gene in the query sequence (1-based closed interval). + c_germline_start: + type: integer + nullable: true + description: > + Alignment start position in the C gene reference sequence (1-based closed interval). + c_germline_end: + type: integer + nullable: true + description: > + Alignment end position in the C gene reference sequence (1-based closed interval). + c_alignment_start: + type: integer + nullable: true + description: > + Start position of the C gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + c_alignment_end: + type: integer + nullable: true + description: > + End position of the C gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + cdr1_start: + type: integer + nullable: true + description: CDR1 start position in the query sequence (1-based closed interval). + cdr1_end: + type: integer + nullable: true + description: CDR1 end position in the query sequence (1-based closed interval). 
+ cdr2_start: + type: integer + nullable: true + description: CDR2 start position in the query sequence (1-based closed interval). + cdr2_end: + type: integer + nullable: true + description: CDR2 end position in the query sequence (1-based closed interval). + cdr3_start: + type: integer + nullable: true + description: CDR3 start position in the query sequence (1-based closed interval). + cdr3_end: + type: integer + nullable: true + description: CDR3 end position in the query sequence (1-based closed interval). + fwr1_start: + type: integer + nullable: true + description: FWR1 start position in the query sequence (1-based closed interval). + fwr1_end: + type: integer + nullable: true + description: FWR1 end position in the query sequence (1-based closed interval). + fwr2_start: + type: integer + nullable: true + description: FWR2 start position in the query sequence (1-based closed interval). + fwr2_end: + type: integer + nullable: true + description: FWR2 end position in the query sequence (1-based closed interval). + fwr3_start: + type: integer + nullable: true + description: FWR3 start position in the query sequence (1-based closed interval). + fwr3_end: + type: integer + nullable: true + description: FWR3 end position in the query sequence (1-based closed interval). + fwr4_start: + type: integer + nullable: true + description: FWR4 start position in the query sequence (1-based closed interval). + fwr4_end: + type: integer + nullable: true + description: FWR4 end position in the query sequence (1-based closed interval). + v_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the V gene, including any + indel corrections or numbering spacers. + v_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the v_sequence_alignment field. 
+ d_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the first or only D gene, including any + indel corrections or numbering spacers. + d_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d_sequence_alignment field. + d2_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the second D gene, including any + indel corrections or numbering spacers. + d2_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d2_sequence_alignment field. + j_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the J gene, including any + indel corrections or numbering spacers. + j_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the j_sequence_alignment field. + c_sequence_alignment: + type: string + nullable: true + description: > + Aligned portion of query sequence assigned to the constant region, including + any indel corrections or numbering spacers. + c_sequence_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the c_sequence_alignment field. + v_germline_alignment: + type: string + nullable: true + description: > + Aligned V gene germline sequence spanning the same region + as the v_sequence_alignment field and including the same set + of corrections and spacers (if any). + v_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the v_germline_alignment field. + d_germline_alignment: + type: string + nullable: true + description: > + Aligned D gene germline sequence spanning the same region + as the d_sequence_alignment field and including the same set + of corrections and spacers (if any). 
+ d_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d_germline_alignment field. + d2_germline_alignment: + type: string + nullable: true + description: > + Aligned D gene germline sequence spanning the same region + as the d2_sequence_alignment field and including the same set + of corrections and spacers (if any). + d2_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the d2_germline_alignment field. + j_germline_alignment: + type: string + nullable: true + description: > + Aligned J gene germline sequence spanning the same region + as the j_sequence_alignment field and including the same set + of corrections and spacers (if any). + j_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the j_germline_alignment field. + c_germline_alignment: + type: string + nullable: true + description: > + Aligned constant region germline sequence spanning the same region + as the c_sequence_alignment field and including the same set + of corrections and spacers (if any). + c_germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of the c_germline_alignment field. + junction_length: + type: integer + nullable: true + description: Number of nucleotides in the junction sequence. + junction_aa_length: + type: integer + nullable: true + description: Number of amino acids in the junction sequence. + x-airr: + adc-query-support: true + np1_length: + type: integer + nullable: true + description: > + Number of nucleotides between the V gene and first D gene alignments or + between the V gene and J gene alignments. + np2_length: + type: integer + nullable: true + description: > + Number of nucleotides between either the first D gene and J gene alignments + or the first D gene and second D gene alignments.
+ np3_length: + type: integer + nullable: true + description: > + Number of nucleotides between the second D gene and J gene alignments. + n1_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 5' of the first or only D gene alignment. + n2_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 3' of the first or only D gene alignment. + n3_length: + type: integer + nullable: true + description: Number of untemplated nucleotides 3' of the second D gene alignment. + p3v_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the V gene alignment. + p5d_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the first or only D gene alignment. + p3d_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the first or only D gene alignment. + p5d2_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the second D gene alignment. + p3d2_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 3' of the second D gene alignment. + p5j_length: + type: integer + nullable: true + description: Number of palindromic nucleotides 5' of the J gene alignment. + v_frameshift: + type: boolean + nullable: true + description: > + True if the V gene in the query nucleotide sequence contains a translational + frameshift relative to the frame of the V gene reference sequence. + j_frameshift: + type: boolean + nullable: true + description: > + True if the J gene in the query nucleotide sequence contains a translational + frameshift relative to the frame of the J gene reference sequence. + d_frame: + type: integer + nullable: true + description: > + Numerical reading frame (1, 2, 3) of the first or only D gene in the query nucleotide sequence, + where frame 1 is relative to the first codon of D gene reference sequence. 
+ d2_frame: + type: integer + nullable: true + description: > + Numerical reading frame (1, 2, 3) of the second D gene in the query nucleotide sequence, + where frame 1 is relative to the first codon of D gene reference sequence. + consensus_count: + type: integer + nullable: true + description: > + Number of reads contributing to the UMI consensus or contig assembly for this sequence. + For example, the sum of the number of reads for all UMIs that contribute to + the query sequence. + duplicate_count: + type: integer + nullable: true + description: > + Copy number or number of duplicate observations for the query sequence. + For example, the number of identical reads observed for this sequence. + title: Read count + example: 123 + x-airr: + miairr: important + set: 6 + subset: data (processed sequence) + name: Read count + umi_count: + type: integer + nullable: true + description: > + Number of distinct UMIs represented by this sequence. + For example, the total number of UMIs that contribute to + the contig assembly for the query sequence. + cell_id: + type: string + nullable: true + description: > + Identifier defining the cell of origin for the query sequence. + title: Cell index + example: W06_046_091 + x-airr: + identifier: true + miairr: important + adc-query-support: true + set: 6 + subset: data (processed sequence) + name: Cell index + clone_id: + type: string + nullable: true + description: Clonal cluster assignment for the query sequence. + x-airr: + adc-query-support: true + identifier: true + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + x-airr: + adc-query-support: true + identifier: true + sample_processing_id: + type: string + nullable: true + description: > + Identifier to the sample processing object in the repertoire metadata + for this rearrangement. If the repertoire has a single sample then + this field may be empty or missing. 
If the repertoire has multiple samples then + this field may be empty or missing if the sample cannot be differentiated or + the relationship is not maintained by the data processing. + x-airr: + adc-query-support: true + identifier: true + data_processing_id: + type: string + nullable: true + description: > + Identifier to the data processing object in the repertoire metadata + for this rearrangement. If this field is empty then the primary data processing object is assumed. + x-airr: + adc-query-support: true + identifier: true + rearrangement_id: + type: string + nullable: true + description: > + Identifier for the Rearrangement object. May be identical to sequence_id, + but will usually be a universally unique record locator for database applications. + x-airr: + deprecated: true + deprecated-description: Field has been merged with sequence_id to avoid confusion. + deprecated-replaced-by: + - sequence_id + rearrangement_set_id: + type: string + nullable: true + description: > + Identifier for grouping Rearrangement objects. + x-airr: + deprecated: true + deprecated-description: Field has been replaced by other specialized identifiers. + deprecated-replaced-by: + - repertoire_id + - sample_processing_id + - data_processing_id + germline_database: + type: string + nullable: true + description: Source of germline V(D)J genes with version number or date accessed. + example: ENSEMBL, Homo sapiens build 90, 2017-10-01 + x-airr: + deprecated: true + deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication. + deprecated-replaced-by: + - "DataProcessing:germline_database" + +# A unique inferred clone object that has been constructed within a single data processing +# for a single repertoire and a subset of its sequences and/or rearrangements. +Clone: + type: object + required: + - clone_id + - germline_alignment + properties: + clone_id: + type: string + nullable: true + description: Identifier for the clone.
+ x-airr: + identifier: true + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + x-airr: + adc-query-support: true + data_processing_id: + type: string + nullable: true + description: Identifier of the data processing object in the repertoire metadata for this clone. + x-airr: + adc-query-support: true + sequences: + type: array + items: + type: string + nullable: true + description: > + List of sequence_id strings that act as keys to the Rearrangement records for members of the clone. + v_call: + type: string + nullable: true + description: > + V gene with allele of the inferred ancestor of the clone. For example, IGHV4-59*01. + example: IGHV4-59*01 + d_call: + type: string + nullable: true + description: > + D gene with allele of the inferred ancestor of the clone. For example, IGHD3-10*01. + example: IGHD3-10*01 + j_call: + type: string + nullable: true + description: > + J gene with allele of the inferred ancestor of the clone. For example, IGHJ4*02. + example: IGHJ4*02 + junction: + type: string + nullable: true + description: > + Nucleotide sequence for the junction region of the inferred ancestor of the clone, + where the junction is defined as the CDR3 plus the two flanking conserved codons. + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction. + junction_length: + type: integer + nullable: true + description: Number of nucleotides in the junction. + junction_aa_length: + type: integer + nullable: true + description: Number of amino acids in junction_aa. + germline_alignment: + type: string + nullable: true + description: > + Assembled, aligned, full-length inferred ancestor of the clone spanning the same region + as the sequence_alignment field of nodes (typically the V(D)J region) and including the + same set of corrections and spacers (if any).
+ germline_alignment_aa: + type: string + nullable: true + description: > + Amino acid translation of germline_alignment. + v_alignment_start: + type: integer + nullable: true + description: > + Start position in the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + v_alignment_end: + type: integer + nullable: true + description: > + End position in the V gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_alignment_start: + type: integer + nullable: true + description: > + Start position of the D gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + d_alignment_end: + type: integer + nullable: true + description: > + End position of the D gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_start: + type: integer + nullable: true + description: > + Start position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + j_alignment_end: + type: integer + nullable: true + description: > + End position of the J gene alignment in both the sequence_alignment and germline_alignment + fields (1-based closed interval). + junction_start: + type: integer + nullable: true + description: Junction region start position in the alignment (1-based closed interval). + junction_end: + type: integer + nullable: true + description: Junction region end position in the alignment (1-based closed interval). + umi_count: + type: integer + nullable: true + description: > + Number of distinct UMIs observed across all sequences (Rearrangement records) in this clone. + clone_count: + type: integer + nullable: true + description: > + Absolute count of the size (number of members) of this clone in the repertoire. 
+ This could simply be the number of sequences (Rearrangement records) observed in this clone, + the number of distinct cell barcodes (unique cell_id values), + or a more sophisticated calculation appropriate to the experimental protocol. + Absolute count is provided versus a frequency so that downstream analysis tools can perform their own normalization. + seed_id: + type: string + nullable: true + description: sequence_id of the seed sequence. Empty string (or null) if there is no seed sequence. + +# 1-to-n relationship for a clone to its trees. +Tree: + type: object + required: + - tree_id + - clone_id + - newick + properties: + tree_id: + type: string + nullable: true + description: Identifier for the tree. + x-airr: + identifier: true + clone_id: + type: string + nullable: true + description: Identifier for the clone. + newick: + type: string + nullable: true + description: Newick string of the tree edges. + nodes: + type: object + nullable: true + description: Dictionary of nodes in the tree, keyed by sequence_id string + additionalProperties: + $ref: '#/Node' + +# 1-to-n relationship between a tree and its nodes +Node: + type: object + required: + - sequence_id + properties: + sequence_id: + type: string + nullable: true + description: > + Identifier for this node that matches the identifier in the newick string and, where possible, + the sequence_id in the source repertoire. + x-airr: + identifier: true + sequence_alignment: + type: string + nullable: true + description: > + Nucleotide sequence of the node, aligned to the germline_alignment for this clone, + including any indel corrections or spacers. + junction: + type: string + nullable: true + description: > + Junction region nucleotide sequence for the node, where the junction is defined as + the CDR3 plus the two flanking conserved codons. + junction_aa: + type: string + nullable: true + description: > + Amino acid translation of the junction.
+ +# The cell object acts as a point of reference for all data that can be related +# to an individual cell, either by direct observation or inference. +Cell: + type: object + required: + - cell_id + - rearrangements + - repertoire_id + - virtual_pairing + properties: + cell_id: + type: string + nullable: false + description: > + Identifier defining the cell of origin for the query sequence. + title: Cell index + example: W06_046_091 + x-airr: + identifier: true + miairr: defined + adc-query-support: true + name: Cell index + rearrangements: + type: array + nullable: true + description: > + Array of sequence identifiers defined for the Rearrangement object + title: Cell-associated rearrangements + items: + type: string + example: [id1, id2] #empty vs NULL? + x-airr: + miairr: defined + adc-query-support: true + name: Cell-associated rearrangements + receptors: + type: array + nullable: true + description: > + Array of receptor identifiers defined for the Receptor object + title: Cell-associated receptors + items: + type: string + example: [id1, id2] #empty vs NULL? + x-airr: + miairr: defined + adc-query-support: true + name: Cell-associated receptors + repertoire_id: + type: string + nullable: true + description: Identifier to the associated repertoire in study metadata. + title: Parental repertoire of cell + x-airr: + miairr: defined + adc-query-support: true + name: Parental repertoire of cell + data_processing_id: + type: string + nullable: true + description: Identifier of the data processing object in the repertoire metadata for this clone. + title: Data processing for cell + x-airr: + miairr: defined + adc-query-support: true + name: Data processing for cell + expression_study_method: + type: string + enum: + - flow_cytometry + - single-cell_transcriptome + - null + nullable: true + description: > + Keyword describing the methodology used to assess expression. The values for this field MUST + come from a controlled vocabulary.
+ x-airr: + miairr: defined + adc-query-support: true + expression_raw_doi: + type: string + nullable: true + description: > + DOI of raw data set containing the current event + x-airr: + miairr: defined + adc-query-support: true + expression_index: + type: string + nullable: true + description: > + Index addressing the current event within the raw data set. + x-airr: + miairr: defined + virtual_pairing: + type: boolean + nullable: true + description: > + boolean to indicate if pairing was inferred. + title: Virtual pairing + x-airr: + miairr: defined + adc-query-support: true + name: Virtual pairing + +# The CellExpression object acts as a container to hold a single expression level measurement from +# an experiment. Expression data is associated with a cell_id and the related repertoire_id and +# data_processing_id as cell_id is not guaranteed to be unique outside the data processing for +# a single repertoire. +CellExpression: + type: object + required: + - expression_id + - repertoire_id + - data_processing_id + - cell_id + - property + - property_type + - value + properties: + expression_id: + type: string + description: > + Identifier of this expression property measurement. + title: Expression property measurement identifier + nullable: false + x-airr: + identifier: true + miairr: defined + adc-query-support: true + name: Expression measurement identifier + cell_id: + type: string + description: > + Identifier of the cell to which this expression data is related. + title: Cell identifier + nullable: false + example: W06_046_091 + x-airr: + miairr: defined + adc-query-support: true + name: Cell identifier + repertoire_id: + type: string + description: Identifier for the associated repertoire in study metadata. 
+ title: Parental repertoire of cell + nullable: true + x-airr: + miairr: defined + adc-query-support: true + name: Parental repertoire of cell + data_processing_id: + type: string + description: Identifier of the data processing object in the repertoire metadata for this clone. + title: Data processing for cell + nullable: true + x-airr: + miairr: defined + adc-query-support: true + name: Data processing for cell + property_type: + type: string + description: > + Keyword describing the property type and detection method used to measure the property value. + The following keywords are recommended, but custom property types are also valid: + "mrna_expression_by_read_count", + "protein_expression_by_fluorescence_intensity", "antigen_bait_binding_by_fluorescence_intensity", + "protein_expression_by_dna_barcode_count" and "antigen_bait_binding_by_dna_barcode_count". + nullable: false + title: Property type and detection method + x-airr: + miairr: defined + adc-query-support: true + name: Property type and detection method + property: + $ref: '#/Ontology' + nullable: true + title: Property information + description: > + Name of the property observed, typically a gene or antibody identifier (and label) from a + canonical resource such as Ensembl (e.g. ENSG00000275747, IGHV3-79) or + Antibody Registry (ABREG:1236456, Purified anti-mouse/rat/human CD27 antibody). + example: + id: ENSG:ENSG00000275747 + label: IGHV3-79 + x-airr: + miairr: defined + adc-query-support: true + format: ontology + name: Property information + value: + type: number + description: Level at which the property was observed in the experiment (non-normalized). + title: Property value + nullable: true + example: 3 + x-airr: + miairr: defined + adc-query-support: true + name: Property value + + +# The Receptor object hold information about a receptor and its reactivity. 
+# +Receptor: + type: object + required: + - receptor_id + - receptor_hash + - receptor_type + - receptor_variable_domain_1_aa + - receptor_variable_domain_1_locus + - receptor_variable_domain_2_aa + - receptor_variable_domain_2_locus + properties: + receptor_id: + type: string + nullable: false + description: ID of the current Receptor object, unique within the local repository. + title: Receptor ID + example: TCR-MM-012345 + x-airr: + identifier: true + adc-query-support: true + receptor_hash: + type: string + nullable: false + description: > + The SHA256 hash of the receptor amino acid sequence, calculated on the concatenated + ``receptor_variable_domain_*_aa`` sequences and represented as base16-encoded string. + title: Receptor hash ID + example: aa1c4b77a6f4927611ab39f5267415beaa0ba07a952c233d803b07e52261f026 + x-airr: + adc-query-support: true + receptor_type: + type: string + nullable: false + enum: + - Ig + - TCR + description: The top-level receptor type, either Immunoglobulin (Ig) or T Cell Receptor (TCR). + x-airr: + adc-query-support: true + receptor_variable_domain_1_aa: + type: string + nullable: false + description: > + Complete amino acid sequence of the mature variable domain of the Ig heavy, TCR beta or TCR delta chain. + The mature variable domain is defined as encompassing all AA from and including first AA after the the + signal peptide to and including the last AA that is completely encoded by the J gene. 
+ example: > + QVQLQQPGAELVKPGASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGSSYFDYWGQGTTLTVSS + x-airr: + adc-query-support: true + receptor_variable_domain_1_locus: + type: string + nullable: false + enum: + - IGH + - TRB + - TRD + description: Locus from which the variable domain in receptor_variable_domain_1_aa originates + example: IGH + x-airr: + adc-query-support: true + receptor_variable_domain_2_aa: + type: string + nullable: false + description: > + Complete amino acid sequence of the mature variable domain of the Ig light, TCR alpha or TCR gamma chain. + The mature variable domain is defined as encompassing all AA from and including first AA after the the + signal peptide to and including the last AA that is completely encoded by the J gene. + example: > + QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLFTGLIGGTNNRAPGVPARFSGSLIGDKAALTITGAQTEDEAIYFCALWYSNHWVFGGGTKLTVL + x-airr: + adc-query-support: true + receptor_variable_domain_2_locus: + type: string + nullable: false + enum: + - IGI + - IGK + - IGL + - TRA + - TRG + description: Locus from which the variable domain in receptor_variable_domain_2_aa originates + example: IGL + x-airr: + adc-query-support: true + receptor_ref: + type: array + nullable: true + description: Array of receptor identifiers defined for the Receptor object + title: Receptor cross-references + items: + type: string + example: ["IEDB_RECEPTOR:10"] + x-airr: + adc-query-support: true + reactivity_measurements: + type: array + nullable: true + description: Records of reactivity measurement + items: + $ref: '#/ReceptorReactivity' + + +ReceptorReactivity: + type: object + required: + - ligand_type + - antigen_type + - antigen + - reactivity_method + - reactivity_readout + - reactivity_value + - reactivity_unit + properties: + ligand_type: + type: string + nullable: false + enum: + - "MHC:peptide" + - "MHC:non-peptide" + - protein + - peptide + - non-peptidic + description: Classification of 
ligand binding to receptor + example: non-peptide + antigen_type: + type: string + nullable: false + enum: + - protein + - peptide + - non-peptidic + description: > + The type of antigen before processing by the immune system. + example: protein + antigen: + $ref: '#/Ontology' + nullable: false + description: > + The substance against which the receptor was tested. This can be any substance that + stimulates an adaptive immune response in the host, either through antibody production + or by T cell activation after presentation via an MHC molecule. + title: Antigen + example: + id: UNIPROT:P19597 + label: Circumsporozoite protein + x-airr: + adc-query-support: true + format: ontology + antigen_source_species: + $ref: '#/Ontology' + nullable: true + description: The species from which the antigen was isolated + title: Source species of antigen + example: + id: NCBITAXON:5843 + label: Plasmodium falciparum NF54 + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: NCBITAXON:1 + label: root + peptide_start: + type: integer + nullable: true + description: Start position of the peptide within the reference protein sequence + peptide_end: + type: integer + nullable: true + description: End position of the peptide within the reference protein sequence + mhc_class: + type: string + nullable: true + enum: + - MHC-I + - MHC-II + - MHC-nonclassical + - null + description: Class of MHC molecule, only present for MHC:x ligand types + example: MHC-II + mhc_gene_1: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the mhc_allele_1 belongs + title: MHC gene 1 + example: + id: MRO:0000055 + label: HLA-DRA + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + mhc_allele_1: + type: string + nullable: true + description: Allele designation of the MHC alpha chain + example: HLA-DRA + mhc_gene_2: + $ref: '#/Ontology' + nullable: true + description: The MHC gene to which the mhc_allele_2 belongs 
+ title: MHC gene 2 + example: + id: MRO:0000057 + label: HLA-DRB1 + x-airr: + format: ontology + ontology: + draft: true + top_node: + id: MRO:0000004 + label: MHC gene + mhc_allele_2: + type: string + nullable: true + description: > + Allele designation of the MHC class II beta chain or the invariant beta2-microglobin chain + example: HLA-DRB1*04:01 + reactivity_method: + type: string + nullable: false + enum: + - SPR + - ITC + - ELISA + - cytometry + - biological_activity + description: The methodology used to assess expression (assay implemented in experiment) + reactivity_readout: + type: string + nullable: false + enum: + - binding_strength + - cytokine_release + - dissociation_constant_kd + - on_rate + - off_rate + - pathogen_inhibition + description: Reactivity measurement read-out + example: cytokine release + reactivity_value: + type: number + nullable: false + description: The absolute (processed) value of the measurement + example: 162.26 + reactivity_unit: + type: string + nullable: false + description: The unit of the measurement + example: pg/ml diff --git a/tests/check-consistency-formats.py b/tests/check-consistency-formats.py index cd3d76423..98bec18fd 100755 --- a/tests/check-consistency-formats.py +++ b/tests/check-consistency-formats.py @@ -17,7 +17,9 @@ spec_files = {basename(f): f for f in glob('specs/airr-schema.yaml')} v3spec_files = {basename(f): f for f in glob('specs/airr-schema-openapi3.yaml')} py_files = {basename(f): f for f in glob('lang/python/airr/specs/airr-schema.yaml')} +py_v3_files = {basename(f): f for f in glob('lang/python/airr/specs/airr-schema-openapi3.yaml')} r_files = {basename(f): f for f in glob('lang/R/inst/extdata/airr-schema.yaml')} +r_v3_files = {basename(f): f for f in glob('lang/R/inst/extdata/airr-schema-openapi3.yaml')} # Check python package specs if set(spec_files.keys()) != set(py_files.keys()): @@ -26,6 +28,12 @@ for spec in set(py_files.keys()) - set(spec_files.keys()): print('{} found in python package 
but missing from specs/'.format(spec), file=sys.stderr) sys.exit(1) +if set(v3spec_files.keys()) != set(py_v3_files.keys()): + for spec in set(v3spec_files.keys()) - set(py_v3_files.keys()): + print('{} missing from python package'.format(spec), file=sys.stderr) + for spec in set(py_v3_files.keys()) - set(v3spec_files.keys()): + print('{} found in python package but missing from specs/'.format(spec), file=sys.stderr) + sys.exit(1) # Check R package specs if set(spec_files.keys()) != set(r_files.keys()): @@ -34,7 +42,36 @@ for spec in set(r_files.keys()) - set(spec_files.keys()): print('{} found in R package but missing from specs/'.format(spec), file=sys.stderr) sys.exit(1) +if set(v3spec_files.keys()) != set(r_v3_files.keys()): + for spec in set(v3spec_files.keys()) - set(r_v3_files.keys()): + print('{} missing from R package'.format(spec), file=sys.stderr) + for spec in set(r_v3_files.keys()) - set(v3spec_files.keys()): + print('{} found in R package but missing from specs/'.format(spec), file=sys.stderr) + sys.exit(1) + +# V3 spec against lang +for spec_name in v3spec_files: + # check equality of specs + with open(v3spec_files[spec_name], 'r') as ip: + gold_spec = yaml.safe_load(ip) + with open(py_v3_files[spec_name], 'r') as ip: + py_spec = yaml.safe_load(ip) + with open(r_v3_files[spec_name], 'r') as ip: + r_spec = yaml.safe_load(ip) + + # Check python package + if jsondiff.diff(gold_spec, py_spec) != {}: + print('{} openapi v3 spec is different from python version'.format(spec_name), file=sys.stderr) + print(jsondiff.diff(gold_spec, py_spec, syntax='explicit'), file=sys.stderr) + sys.exit(1) + + # Check R package + if jsondiff.diff(gold_spec, r_spec) != {}: + print('{} openapi v3 spec is different from R version'.format(spec_name), file=sys.stderr) + print(jsondiff.diff(gold_spec, r_spec), file=sys.stderr) + sys.exit(1) +# V2 spec against lang for spec_name in spec_files: # check equality of specs with open(spec_files[spec_name], 'r') as ip: @@ -46,13 +83,13 
@@ # Check python package if jsondiff.diff(gold_spec, py_spec) != {}: - print('{} spec is different from python version'.format(spec_name), file=sys.stderr) + print('{} openapi v2 spec is different from python version'.format(spec_name), file=sys.stderr) print(jsondiff.diff(gold_spec, py_spec, syntax='explicit'), file=sys.stderr) sys.exit(1) # Check R package if jsondiff.diff(gold_spec, r_spec) != {}: - print('{} spec is different from R version'.format(spec_name), file=sys.stderr) + print('{} openapi v2 spec is different from R version'.format(spec_name), file=sys.stderr) print(jsondiff.diff(gold_spec, r_spec), file=sys.stderr) sys.exit(1) From 9412cb3459a4f910ba490417db9a345b8d24d5a8 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Mon, 26 Feb 2024 16:06:21 -0600 Subject: [PATCH 03/15] centralize test data --- .github/workflows/py-unittest.yaml | 5 + .github/workflows/r-check.yaml | 4 + tests/data/bad_genotype_set.json | 44 ++ tests/data/bad_germline_set.json | 351 +++++++++++ tests/data/bad_rearrangement.tsv | 10 + tests/data/bad_repertoire.yaml | 148 +++++ tests/data/extra_rearrangement.tsv | 2 + tests/data/good_combined_airr.json | 933 +++++++++++++++++++++++++++++ tests/data/good_combined_airr.yaml | 834 ++++++++++++++++++++++++++ tests/data/good_genotype_set.json | 38 ++ tests/data/good_germline_set.json | 358 +++++++++++ tests/data/good_rearrangement.tsv | 10 + tests/data/good_repertoire.yaml | 403 +++++++++++++ tests/data/output_blank.json | 231 +++++++ tests/data/output_data.json | 913 ++++++++++++++++++++++++++++ tests/data/output_rep.json | 506 ++++++++++++++++ tests/data/warning_repertoire.json | 1 + 17 files changed, 4791 insertions(+) create mode 100644 tests/data/bad_genotype_set.json create mode 100644 tests/data/bad_germline_set.json create mode 100644 tests/data/bad_rearrangement.tsv create mode 100644 tests/data/bad_repertoire.yaml create mode 100644 tests/data/extra_rearrangement.tsv create mode 100644 tests/data/good_combined_airr.json 
create mode 100644 tests/data/good_combined_airr.yaml create mode 100644 tests/data/good_genotype_set.json create mode 100644 tests/data/good_germline_set.json create mode 100644 tests/data/good_rearrangement.tsv create mode 100644 tests/data/good_repertoire.yaml create mode 100644 tests/data/output_blank.json create mode 100644 tests/data/output_data.json create mode 100644 tests/data/output_rep.json create mode 100644 tests/data/warning_repertoire.json diff --git a/.github/workflows/py-unittest.yaml b/.github/workflows/py-unittest.yaml index f531fd12f..4d24c2020 100644 --- a/.github/workflows/py-unittest.yaml +++ b/.github/workflows/py-unittest.yaml @@ -26,6 +26,11 @@ jobs: python-version: [ '3.8' ] steps: - uses: actions/checkout@v2 + + - name: Check test data matches the global test data files + run: diff -rc tests/data ../../tests/data + shell: bash + - name: Set up Python uses: actions/setup-python@v2 with: diff --git a/.github/workflows/r-check.yaml b/.github/workflows/r-check.yaml index 8732673a4..1127abf90 100644 --- a/.github/workflows/r-check.yaml +++ b/.github/workflows/r-check.yaml @@ -26,6 +26,10 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Check test data matches the global test data files + run: diff -rc tests/data-tests ../../tests/data + shell: bash + - name: Install dependencies run: | install.packages(c("remotes", "testthat", "roxygen2", "devtools", "rcmdcheck")) diff --git a/tests/data/bad_genotype_set.json b/tests/data/bad_genotype_set.json new file mode 100644 index 000000000..c58a39027 --- /dev/null +++ b/tests/data/bad_genotype_set.json @@ -0,0 +1,44 @@ +{ + "GenotypeSet": [{ + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + }, + { + "label": 
"IGHV1-69*02", + "name": "1234", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": "1" + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] +} \ No newline at end of file diff --git a/tests/data/bad_germline_set.json b/tests/data/bad_germline_set.json new file mode 100644 index 000000000..168cc1fa5 --- /dev/null +++ b/tests/data/bad_germline_set.json @@ -0,0 +1,351 @@ +{ + "GermlineSet": [{ + "germline_set_id": "OGRDB:G00007", + "author": "William Lees", + "lab_name": "", + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_description": "", + "release_date": "2021-11-24", + "germline_set_name": "CAST IGH", + "germline_set_ref": "OGRDB:G00007.1", + "pub_ids": [""], + "species": ["Mouse"], + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "locus": "IGH", + "allele_descriptions": [ + { + "allele_description_id": "OGRDB:A00301", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2DBF", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV5-3" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": "Mouse", + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "fwr1_start": 1, + "fwr1_end": 78, + "cdr1_start": 79, + "cdr1_end": 114, + "fwr2_start": 115, + "fwr2_end": 165, + "cdr2_start": 166, + "cdr2_end": 195, + "fwr3_start": 196, + "fwr3_end": 312, + "cdr3_start": 313, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + 
"92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3", + "curational_tags": null + }, + { + "allele_description_id": "OGRDB:A00314", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2ETO", + "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV8-2" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": "Mouse", + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", 
+ "fwr1_start": 1, + "fwr1_end": 78, + "cdr1_start": 79, + "cdr1_end": 114, + "fwr2_start": 115, + "fwr2_end": 165, + "cdr2_start": 166, + "cdr2_end": 195, + "fwr3_start": 196, + "fwr3_end": 312, + "cdr3_start": 313, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2", + "curational_tags": null + } + ], + "notes": "" + }] +} diff --git a/tests/data/bad_rearrangement.tsv b/tests/data/bad_rearrangement.tsv new file mode 100644 index 000000000..d12fc79fe --- /dev/null +++ b/tests/data/bad_rearrangement.tsv @@ -0,0 +1,10 @@ +rearrangement_id rearrangement_set_id sequence_id wrong_name rev_comp productive sequence_alignment germline_alignment v_call d_call j_call c_call junction junction_length junction_aa v_score d_score j_score c_score v_cigar d_cigar j_cigar c_cigar v_identity v_evalue d_identity d_evalue j_identity j_evalue v_sequence_start v_sequence_end v_germline_start v_germline_end d_sequence_start d_sequence_end d_germline_start d_germline_end j_sequence_start j_sequence_end j_germline_start j_germline_end 
np1_length np2_length duplicate_count +IVKNQEJ01BVGQ6 1 IVKNQEJ01BVGQ6 GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 430 16.4 75.8 22N1S275= 11N280S8= 6N292S32=1X9= 1 1E-122 1 2.7 0.9762 6E-18 0 275 0 317 279 287 10 18 291 333 5 47 4 4 1247 +IVKNQEJ01AQVWS 1 IVKNQEJ01AQVWS GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 420 16.4 83.8 22N1S156=1X10=1X17=1X89= 11N280S8= 6N292S42= 0.9891 8E-120 1 2.7 1 2E-20 0 275 0 317 279 287 10 18 291 333 5 47 4 4 4 +IVKNQEJ01AOYFZ 1 IVKNQEJ01AOYFZ GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG 37 CASGVAGNF*LLX 430 20.4 83.8 22N1S275= 11N280S10= 6N293S42= 1 1E-122 1 0.17 1 2E-20 0 275 0 317 279 289 10 20 292 334 5 47 4 3 92 +IVKNQEJ01EI5S4 1 IVKNQEJ01EI5S4 
GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 430 16.4 83.8 22N1S275= 11N280S8= 6N292S42= 1 1E-122 1 2.7 1 2E-20 0 275 0 317 279 287 10 18 291 333 5 47 4 4 2913 +IVKNQEJ01DGRRI 1 IVKNQEJ01DGRRI GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-34*09 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 389 16.4 83.8 22N1S23=2X85=1X15=1X1=1X3=1X2=1X1=1X5=1X6=1X118= 11N274S8= 6N286S42= 0.9628 2E-110 1 2.6 1 2E-20 0 269 0 317 273 281 10 18 285 327 5 47 4 4 1 +IVKNQEJ01APN5N 1 IVKNQEJ01APN5N GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T F IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAG 36 CASGVAGTFDY* 430 16.4 67.9 22N1S275= 11N280S8= 6N292S10=1X21=1X9= 1 1E-122 1 2.7 0.9524 1E-15 0 275 0 317 279 287 10 18 291 333 5 47 4 4 1 +IVKNQEJ01B0TT2 1 IVKNQEJ01B0TT2 
GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG 37 CASGVAGNF*LLX 430 20.4 75.8 22N1S275= 11N280S10= 6N293S32=1X9= 1 1E-122 1 0.17 0.9762 6E-18 0 275 0 317 279 289 10 20 292 334 5 47 4 3 30 +IVKNQEJ01AIS74 1 IVKNQEJ01AIS74 GGCGCAGGACTGTTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGG 38 CARRGGW*LLTTG 424 20.4 83.8 22N1S3=1X8=1X262= 11N281S10= 6N294S42= 0.9927 9E-121 1 0.17 1 2E-20 0 275 0 317 280 290 10 20 293 335 5 47 5 3 4 +IVKNQEJ01AJ44V 1 IVKNQEJ01AJ44V GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T T IGHV4-59*06 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 386 16.4 75.8 22N1S45=1X5=2X6=1X3=1X5=1X22=1X4=1X1=1X1=1X165= 11N274S8= 6N286S32=1X9= 0.9625 2E-109 1 2.6 0.9762 5E-18 0 267 0 315 273 281 10 18 285 327 5 47 6 4 12 diff --git a/tests/data/bad_repertoire.yaml b/tests/data/bad_repertoire.yaml new file mode 100644 index 000000000..2de377cb3 --- /dev/null +++ b/tests/data/bad_repertoire.yaml @@ -0,0 +1,148 @@ +# +# Example metadata +# + 
+Repertoire: + - repertoire_id: 1841923116114776551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + lab_name: "Mark M. 
Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + value: year + linked_subjects: TW01B + link_type: twin + sample: + - sample_id: TW01A_B_naive + tissue: PBMC + cell_subset: "Naive B cell" + cell_phenotype: "expression of CD20 and the absence of CD27" + cell_species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + sequencing_platform: "Illumina MiSeq" + read_length: "300" + sequencing_files: + file_type: fastq + filename: SRR2905656_R1.fastq.gz + read_direction: forward + paired_filename: SRR2905656_R2.fastq.gz + paired_read_direction: reverse + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + + - repertoire_id: 1602908186092376551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. 
Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + lab_name: "Mark M. Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + value: year + linked_subjects: TW01B + link_type: twin + sample: + - sample_id: TW01A_B_memory + tissue: PBMC + cell_subset: "Memory B cell" + cell_phenotype: "expression of CD20 and CD27" + cell_species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + sequencing_platform: "Illumina MiSeq" + read_length: "300" + sequencing_files: + file_type: fastq + filename: SRR2905655_R1.fastq.gz + read_direction: forward + paired_filename: SRR2905655_R2.fastq.gz + paired_read_direction: reverse + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + + - repertoire_id: 2366080924918616551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + lab_name: "Mark M. Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + value: year + linked_subjects: TW01B + link_type: twin + sample: + - sample_id: TW01A_T_naive_CD4 + tissue: PBMC + cell_subset: "Naive CD4+ T cell" + cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" + cell_species: + id: "NCBITaxon_9606" + value: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: TRB + sequencing_platform: "Illumina MiSeq" + read_length: "300" + sequencing_files: + file_type: fastq + filename: SRR2905659_R1.fastq.gz + read_direction: forward + paired_filename: SRR2905659_R2.fastq.gz + paired_read_direction: reverse + data_processing: + - data_processing_id: 651223970338378216-242ac11b-0001-007 + analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 diff --git a/tests/data/extra_rearrangement.tsv 
b/tests/data/extra_rearrangement.tsv new file mode 100644 index 000000000..8bedb960f --- /dev/null +++ b/tests/data/extra_rearrangement.tsv @@ -0,0 +1,2 @@ +sequence_id sequence rev_comp productive v_call d_call j_call sequence_alignment germline_alignment junction junction junction_aa v_cigar d_cigar j_cigar +1 2 F F 5 6 7 8 9 10 11 12 13 14 15 not_in_header not_in diff --git a/tests/data/good_combined_airr.json b/tests/data/good_combined_airr.json new file mode 100644 index 000000000..9101b24a9 --- /dev/null +++ b/tests/data/good_combined_airr.json @@ -0,0 +1,933 @@ +{ + "Repertoire": [ + { + "repertoire_id": "1841923116114776551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. 
Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": ["PMID:27005435"], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ], + "genotype": { + "receptor_genotype_set": { + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }, + "mhc_genotype_set": { + "mhc_genotype_set_id": "this is a unique identifier", + "mhc_genotype_list": [ + { + "mhc_genotype_id": "unique", + "mhc_class": "MHC-I", + 
"mhc_genotyping_method": "pcr_low_resolution", + "mhc_alleles": [ + { + "allele_designation": "01:01", + "gene": { + "id": "MRO-0000046", + "label": "HLA-A" + }, + "reference_set_ref": "blah" + } + ] + } + ] + } + } + }, + "sample": [ + { + "sample_id": "TW01A_B_naive", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000788", + "label": "naive B cell" + }, + "cell_phenotype": "expression of CD20 and the absence of CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRR2905656", + "file_type": "fastq", + "filename": "SRR2905656_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905656_R2.fastq.gz", + "paired_read_direction": "reverse", + "paired_read_length": 300 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": 
"none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + { + "repertoire_id": "1602908186092376551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": ["PMID:27005435"], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_B_memory", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000787", + "label": "memory B cell" + }, + "cell_phenotype": "expression of CD20 and CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": 
"Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRR2905655", + "file_type": "fastq", + "filename": "SRR2905655_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905655_R2.fastq.gz", + "paired_read_direction": "reverse", + "paired_read_length": 300 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + { + "repertoire_id": "2366080924918616551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to 
protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": ["PMID:27005435"], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_T_naive_CD4", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000895", + "label": "naive thymus-derived CD4-positive, alpha-beta T cell" + }, + "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "TRB", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRR2905659", + "file_type": "fastq", + "filename": "SRR2905659_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905659_R2.fastq.gz", 
+ "paired_read_direction": "reverse", + "paired_read_length": 300 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "651223970338378216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012" + } + ] + } + ], + + + "GermlineSet": [{ + "germline_set_id": "OGRDB:G00007", + "author": "William Lees", + "lab_name": "", + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [], + "release_version": 1, + "release_description": "", + "release_date": "2021-11-24", + "germline_set_name": "CAST IGH", + "germline_set_ref": "OGRDB:G00007.1", + "pub_ids": [""], + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + 
"locus": "IGH", + "allele_descriptions": [ + { + "allele_description_id": "OGRDB:A00301", + "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2DBF", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV5-3" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3", + "curational_tags": null + }, + { + "allele_description_id": "OGRDB:A00314", + "allele_description_ref": 
"OGRDB:Mouse_IGH:IGHV-2ETO", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2ETO", + "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV8-2" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2", + "curational_tags": null + } + ], + "curation": null + }], + "GenotypeSet": [{ + 
"receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] +} diff --git a/tests/data/good_combined_airr.yaml b/tests/data/good_combined_airr.yaml new file mode 100644 index 000000000..80d0fe3a2 --- /dev/null +++ b/tests/data/good_combined_airr.yaml @@ -0,0 +1,834 @@ +Repertoire: + - repertoire_id: 1841923116114776551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: Homo sapiens B and T cell repertoire - MZ twins + study_type: + id: + label: + study_description: The adaptive immune system's capability to protect the body + requires a highly diverse lymphocyte antigen receptor repertoire. However, the + influence of individual genetic and epigenetic differences on these repertoires + is not typically measured. By leveraging the unique characteristics of B, CD4+ + T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified + the impact of heritable factors on both the V(D)J recombination process and + thymic selection in the case of T cell receptors, and show that the repertoires + of both naive and antigen experienced cells are subject to biases resulting + from differences in recombination. We show that biases in V(D)J usage, as well + as biased N/P additions, contribute to significant variation in the CDR3 region. 
+ Moreover, we show that the relative usage of V and J gene segments is chromosomally + biased, with approximately 1.5 times as many rearrangements originating from + a single chromosome. These data refine our understanding of the heritable mechanisms + affecting the repertoire, and show that biases are evident on a chromosome-wide + level. + study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X + inclusion_exclusion_criteria: + lab_name: Mark M. Davis + lab_address: Stanford University + submitted_by: Florian Rubelt + pub_ids: ["PMID:27005435"] + collected_by: + grants: + keywords_study: + - contains_ig + - contains_tr + subject: + subject_id: TW01A + synthetic: false + species: + id: NCBITaxon_9606 + label: Homo sapiens + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: + ancestry_population: + id: + label: + location_birth: + id: + label: + ethnicity: + race: + strain_name: + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: + disease_diagnosis: + id: + label: + disease_length: + disease_stage: + prior_therapies: + immunogen: + intervention: + medical_history: + genotype: + receptor_genotype_set: + receptor_genotype_set_id: '1' + genotype_class_list: + - receptor_genotype_id: '1' + locus: IGH + documented_alleles: + - label: IGHV1-69*01 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + - label: IGHV1-69*02 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 2 + undocumented_alleles: + - allele_name: IGHD3-1*01_S1234 + sequence: agtagtagtagt + phasing: 1 + deleted_genes: + - label: IGHV3-30-3 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + inference_process: repertoire_sequencing + mhc_genotype_set: + mhc_genotype_set_id: "this is a unique identifier" + mhc_genotype_list: + - mhc_genotype_id: unique + mhc_class: MHC-I + mhc_genotyping_method: pcr_low_resolution + mhc_alleles: + - allele_designation: "01:01" + gene: + id: 
"MRO-0000046" + label: "HLA-A" + reference_set_ref: blah + sample: + - sample_id: TW01A_B_naive + sample_processing_id: + sample_type: peripheral venous puncture + tissue: + id: UBERON_0000178 + label: blood + tissue_processing: Ficoll gradient + cell_subset: + id: CL_0000788 + label: naive B cell + cell_phenotype: expression of CD20 and the absence of CD27 + cell_species: + id: NCBITaxon_9606 + label: Homo sapiens + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + forward_pcr_primer_target_location: + reverse_pcr_primer_target_location: + sequencing_platform: Illumina MiSeq + sequencing_files: + sequencing_data_id: SRR2905656 + file_type: fastq + filename: SRR2905656_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905656_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + anatomic_site: + disease_state_sample: + collection_time_point_relative: + collection_time_point_relative_unit: + id: + label: + collection_time_point_reference: + collection_location: + id: + label: + biomaterial_provider: + cell_number: + cells_per_reaction: + cell_storage: false + cell_quality: + cell_processing_protocol: + template_quality: + template_amount: + template_amount_unit: + id: + label: + library_generation_method: RT(oligo-dT)+PCR + library_generation_protocol: + library_generation_kit_version: + complete_sequences: partial + physical_linkage: none + sequencing_run_id: + total_reads_passing_qc_filter: + sequencing_facility: + sequencing_run_date: + sequencing_kit: + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + primary_annotation: true + software_versions: + paired_reads_assembly: + quality_thresholds: + primer_match_cutoffs: + collapsing_method: + data_processing_protocols: + data_processing_files: + germline_database: + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + - repertoire_id: 1602908186092376551-242ac11c-0001-012 
+ study: + study_id: PRJNA300878 + study_title: Homo sapiens B and T cell repertoire - MZ twins + study_type: + id: + label: + study_description: The adaptive immune system's capability to protect the body + requires a highly diverse lymphocyte antigen receptor repertoire. However, the + influence of individual genetic and epigenetic differences on these repertoires + is not typically measured. By leveraging the unique characteristics of B, CD4+ + T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified + the impact of heritable factors on both the V(D)J recombination process and + thymic selection in the case of T cell receptors, and show that the repertoires + of both naive and antigen experienced cells are subject to biases resulting + from differences in recombination. We show that biases in V(D)J usage, as well + as biased N/P additions, contribute to significant variation in the CDR3 region. + Moreover, we show that the relative usage of V and J gene segments is chromosomally + biased, with approximately 1.5 times as many rearrangements originating from + a single chromosome. These data refine our understanding of the heritable mechanisms + affecting the repertoire, and show that biases are evident on a chromosome-wide + level. + study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X + inclusion_exclusion_criteria: + lab_name: Mark M. 
Davis + lab_address: Stanford University + submitted_by: Florian Rubelt + pub_ids: ["PMID:27005435"] + collected_by: + grants: + keywords_study: + - contains_ig + - contains_tr + subject: + subject_id: TW01A + synthetic: false + species: + id: NCBITaxon_9606 + label: Homo sapiens + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: + ancestry_population: + id: + label: + location_birth: + id: + label: + ethnicity: + race: + strain_name: + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: + disease_diagnosis: + id: + label: + disease_length: + disease_stage: + prior_therapies: + immunogen: + intervention: + medical_history: + sample: + - sample_id: TW01A_B_memory + sample_processing_id: + sample_type: peripheral venous puncture + tissue: + id: UBERON_0000178 + label: blood + tissue_processing: Ficoll gradient + cell_subset: + id: CL_0000787 + label: memory B cell + cell_phenotype: expression of CD20 and CD27 + cell_species: + id: NCBITaxon_9606 + label: Homo sapiens + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + forward_pcr_primer_target_location: + reverse_pcr_primer_target_location: + sequencing_platform: Illumina MiSeq + sequencing_files: + sequencing_data_id: SRR2905655 + file_type: fastq + filename: SRR2905655_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905655_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + anatomic_site: + disease_state_sample: + collection_time_point_relative: + collection_time_point_relative_unit: + id: + label: + collection_time_point_reference: + collection_location: + id: + label: + biomaterial_provider: + cell_number: + cells_per_reaction: + cell_storage: false + cell_quality: + cell_processing_protocol: + template_quality: + template_amount: + template_amount_unit: + id: + label: + library_generation_method: RT(oligo-dT)+PCR + 
library_generation_protocol: + library_generation_kit_version: + complete_sequences: partial + physical_linkage: none + sequencing_run_id: + total_reads_passing_qc_filter: + sequencing_facility: + sequencing_run_date: + sequencing_kit: + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + primary_annotation: true + software_versions: + paired_reads_assembly: + quality_thresholds: + primer_match_cutoffs: + collapsing_method: + data_processing_protocols: + data_processing_files: + germline_database: + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + - repertoire_id: 2366080924918616551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: Homo sapiens B and T cell repertoire - MZ twins + study_type: + id: + label: + study_description: The adaptive immune system's capability to protect the body + requires a highly diverse lymphocyte antigen receptor repertoire. However, the + influence of individual genetic and epigenetic differences on these repertoires + is not typically measured. By leveraging the unique characteristics of B, CD4+ + T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified + the impact of heritable factors on both the V(D)J recombination process and + thymic selection in the case of T cell receptors, and show that the repertoires + of both naive and antigen experienced cells are subject to biases resulting + from differences in recombination. We show that biases in V(D)J usage, as well + as biased N/P additions, contribute to significant variation in the CDR3 region. + Moreover, we show that the relative usage of V and J gene segments is chromosomally + biased, with approximately 1.5 times as many rearrangements originating from + a single chromosome. These data refine our understanding of the heritable mechanisms + affecting the repertoire, and show that biases are evident on a chromosome-wide + level. + study_contact: Mark M. 
Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X + inclusion_exclusion_criteria: + lab_name: Mark M. Davis + lab_address: Stanford University + submitted_by: Florian Rubelt + pub_ids: ["PMID:27005435"] + collected_by: + grants: + keywords_study: + - contains_ig + - contains_tr + subject: + subject_id: TW01A + synthetic: false + species: + id: NCBITaxon_9606 + label: Homo sapiens + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: + ancestry_population: + id: + label: + location_birth: + id: + label: + ethnicity: + race: + strain_name: + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: + disease_diagnosis: + id: + label: + disease_length: + disease_stage: + prior_therapies: + immunogen: + intervention: + medical_history: + sample: + - sample_id: TW01A_T_naive_CD4 + sample_processing_id: + sample_type: peripheral venous puncture + tissue: + id: UBERON_0000178 + label: blood + tissue_processing: Ficoll gradient + cell_subset: + id: CL_0000895 + label: naive thymus-derived CD4-positive, alpha-beta T cell + cell_phenotype: expression of CD8 and absence of CD4 and CD45RO + cell_species: + id: NCBITaxon_9606 + label: Homo sapiens + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: TRB + forward_pcr_primer_target_location: + reverse_pcr_primer_target_location: + sequencing_platform: Illumina MiSeq + sequencing_files: + sequencing_data_id: SRR2905659 + file_type: fastq + filename: SRR2905659_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905659_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + anatomic_site: + disease_state_sample: + collection_time_point_relative: + collection_time_point_relative_unit: + id: + label: + collection_time_point_reference: + collection_location: + id: + label: + biomaterial_provider: + cell_number: + cells_per_reaction: + cell_storage: false + 
cell_quality: + cell_processing_protocol: + template_quality: + template_amount: + template_amount_unit: + id: + label: + library_generation_method: RT(oligo-dT)+PCR + library_generation_protocol: + library_generation_kit_version: + complete_sequences: partial + physical_linkage: none + sequencing_run_id: + total_reads_passing_qc_filter: + sequencing_facility: + sequencing_run_date: + sequencing_kit: + data_processing: + - data_processing_id: 651223970338378216-242ac11b-0001-007 + primary_annotation: true + software_versions: + paired_reads_assembly: + quality_thresholds: + primer_match_cutoffs: + collapsing_method: + data_processing_protocols: + data_processing_files: + germline_database: + analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 + +GermlineSet: +- acknowledgements: [] + allele_descriptions: + - acknowledgements: [] + aliases: + - watson_et_al:CAST_EiJ_IGHV5-3 + allele_description_id: OGRDB:A00301 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF + allele_designation: null + chromosome: null + coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3' + curational_tags: null + functional: true + gene_designation: null + gene_end: null + gene_start: null + inference_type: rearranged_only + lab_address: Birkbeck College, University of London, Malet Street, London + label: IGHV-2DBF + leader_1_end: null + leader_1_start: null + leader_2_end: null + leader_2_start: null + locus: IGH + maintainer: William Lees + paralogs: [] + rearranged_support: [] + release_date: 24-Nov-2021 + release_description: First release + release_version: 1 + sequence: 
GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + sequence_type: V + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + status: active + subgroup_designation: null + unrearranged_support: [] + utr_5_prime_end: null + utr_5_prime_start: null + v_gene_delineations: + - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + alignment: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + cdr1_end: 110 + cdr1_start: 76 + cdr2_end: 160 + cdr2_start: 151 + cdr3_start: 295 + delineation_scheme: IMGT + fwr1_end: 75 + 
fwr1_start: 1 + fwr2_end: 150 + fwr2_start: 111 + fwr3_end: 294 + fwr3_start: 161 + sequence_delineation_id: '1' + unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + v_rs_end: null + v_rs_start: null + - acknowledgements: [] + aliases: + - watson_et_al:CAST_EiJ_IGHV8-2 + allele_description_id: OGRDB:A00314 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO + allele_designation: null + chromosome: null + coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2' + curational_tags: null + functional: true + gene_designation: null + gene_end: null + gene_start: null + inference_type: rearranged_only + lab_address: Birkbeck College, University of London, Malet Street, London + label: IGHV-2ETO + leader_1_end: null + leader_1_start: null + leader_2_end: null + leader_2_start: null + locus: IGH + maintainer: William Lees + paralogs: [] + rearranged_support: [] + release_date: 24-Nov-2021 + release_description: First release + release_version: 1 + sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + sequence_type: V + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + 
species_subgroup_type: strain + status: active + subgroup_designation: null + unrearranged_support: [] + utr_5_prime_end: null + utr_5_prime_start: null + v_gene_delineations: + - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + alignment: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + cdr1_end: 110 + cdr1_start: 76 + cdr2_end: 160 + cdr2_start: 151 + cdr3_start: 295 + delineation_scheme: IMGT + fwr1_end: 75 + fwr1_start: 1 + fwr2_end: 150 + fwr2_start: 111 + fwr3_end: 294 + fwr3_start: 161 + sequence_delineation_id: '1' + unaligned_sequence: 
GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + v_rs_end: null + v_rs_start: null + author: William Lees + curation: null + germline_set_id: OGRDB:G00007 + germline_set_name: CAST IGH + germline_set_ref: OGRDB:G00007.1 + lab_address: Birkbeck College, University of London, Malet Street, London + lab_name: '' + locus: IGH + pub_ids: [''] + release_date: '2021-11-24' + release_description: '' + release_version: 1 + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + + +GenotypeSet: + - receptor_genotype_set_id: '1' + genotype_class_list: + - receptor_genotype_id: '1' + locus: IGH + documented_alleles: + - label: IGHV1-69*01 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + - label: IGHV1-69*02 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 2 + undocumented_alleles: + - allele_name: IGHD3-1*01_S1234 + sequence: agtagtagtagt + phasing: 1 + deleted_genes: + - label: IGHV3-30-3 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + inference_process: repertoire_sequencing diff --git a/tests/data/good_genotype_set.json b/tests/data/good_genotype_set.json new file mode 100644 index 000000000..ba10f56e9 --- /dev/null +++ b/tests/data/good_genotype_set.json @@ -0,0 +1,38 @@ +{ + "GenotypeSet": [{ + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": 
"agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] +} \ No newline at end of file diff --git a/tests/data/good_germline_set.json b/tests/data/good_germline_set.json new file mode 100644 index 000000000..41ecf5f7d --- /dev/null +++ b/tests/data/good_germline_set.json @@ -0,0 +1,358 @@ +{ + "GermlineSet": [{ + "germline_set_id": "OGRDB:G00007", + "author": "William Lees", + "lab_name": "", + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [], + "release_version": 1, + "release_description": "", + "release_date": "2021-11-24", + "germline_set_name": "CAST IGH", + "germline_set_ref": "OGRDB:G00007.1", + "pub_ids": [""], + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "locus": "IGH", + "allele_descriptions": [ + { + "allele_description_id": "OGRDB:A00301", + "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2DBF", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV5-3" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + 
"4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3", + "curational_tags": null + }, + { + "allele_description_id": "OGRDB:A00314", + "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2ETO", + "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "coding_sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV8-2" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + "4", + 
"5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2", + "curational_tags": null + } + ], + "curation": null + }] +} diff --git a/tests/data/good_rearrangement.tsv b/tests/data/good_rearrangement.tsv new file mode 100644 index 000000000..e8521767d --- /dev/null +++ b/tests/data/good_rearrangement.tsv @@ -0,0 +1,10 @@ +rearrangement_id rearrangement_set_id sequence_id sequence rev_comp productive sequence_alignment germline_alignment v_call d_call j_call c_call junction junction_length junction_aa v_score d_score j_score c_score v_cigar d_cigar j_cigar c_cigar v_identity v_evalue d_identity d_evalue j_identity j_evalue v_sequence_start v_sequence_end v_germline_start v_germline_end d_sequence_start d_sequence_end d_germline_start d_germline_end j_sequence_start j_sequence_end j_germline_start j_germline_end np1_length np2_length duplicate_count +IVKNQEJ01BVGQ6 1 IVKNQEJ01BVGQ6 
GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 430 16.4 75.8 22N1S275= 11N280S8= 6N292S32=1X9= 1 1E-122 1 2.7 0.9762 6E-18 0 275 0 317 279 287 10 18 291 333 5 47 4 4 1247 +IVKNQEJ01AQVWS 1 IVKNQEJ01AQVWS GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 420 16.4 83.8 22N1S156=1X10=1X17=1X89= 11N280S8= 6N292S42= 0.9891 8E-120 1 2.7 1 2E-20 0 275 0 317 279 287 10 18 291 333 5 47 4 4 4 +IVKNQEJ01AOYFZ 1 IVKNQEJ01AOYFZ GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG 37 CASGVAGNF*LLX 430 20.4 83.8 22N1S275= 11N280S10= 6N293S42= 1 1E-122 1 0.17 1 2E-20 0 275 0 317 279 289 10 20 292 334 5 47 4 3 92 +IVKNQEJ01EI5S4 1 IVKNQEJ01EI5S4 
GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 430 16.4 83.8 22N1S275= 11N280S8= 6N292S42= 1 1E-122 1 2.7 1 2E-20 0 275 0 317 279 287 10 18 291 333 5 47 4 4 2913 +IVKNQEJ01DGRRI 1 IVKNQEJ01DGRRI GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T T IGHV4-34*09 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 389 16.4 83.8 22N1S23=2X85=1X15=1X1=1X3=1X2=1X1=1X5=1X6=1X118= 11N274S8= 6N286S42= 0.9628 2E-110 1 2.6 1 2E-20 0 269 0 317 273 281 10 18 285 327 5 47 4 4 1 +IVKNQEJ01APN5N 1 IVKNQEJ01APN5N GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T F IGHV4-31*03 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAG 36 CASGVAGTFDY* 430 16.4 67.9 22N1S275= 11N280S8= 6N292S10=1X21=1X9= 1 1E-122 1 2.7 0.9524 1E-15 0 275 0 317 279 287 10 18 291 333 5 47 4 4 1 +IVKNQEJ01B0TT2 1 IVKNQEJ01B0TT2 
GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG 37 CASGVAGNF*LLX 430 20.4 75.8 22N1S275= 11N280S10= 6N293S32=1X9= 1 1E-122 1 0.17 0.9762 6E-18 0 275 0 317 279 289 10 20 292 334 5 47 4 3 30 +IVKNQEJ01AIS74 1 IVKNQEJ01AIS74 GGCGCAGGACTGTTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA T F IGHV4-31*03 IGHD6-19*01 IGHJ4*02 TGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGG 38 CARRGGW*LLTTG 424 20.4 83.8 22N1S3=1X8=1X262= 11N281S10= 6N294S42= 0.9927 9E-121 1 0.17 1 2E-20 0 275 0 317 280 290 10 20 293 335 5 47 5 3 4 +IVKNQEJ01AJ44V 1 IVKNQEJ01AJ44V GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA T T IGHV4-59*06 IGHD1-7*01,IGHD6-19*01 IGHJ4*02 TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG 36 CASGVAGTFDYW 386 16.4 75.8 22N1S45=1X5=2X6=1X3=1X5=1X22=1X4=1X1=1X1=1X165= 11N274S8= 6N286S32=1X9= 0.9625 2E-109 1 2.6 0.9762 5E-18 0 267 0 315 273 281 10 18 285 327 5 47 6 4 12 diff --git a/tests/data/good_repertoire.yaml b/tests/data/good_repertoire.yaml new file mode 100644 index 000000000..9bf3a4653 --- /dev/null +++ b/tests/data/good_repertoire.yaml @@ -0,0 +1,403 @@ +# +# Example metadata +# + 
+Repertoire: + - repertoire_id: 1841923116114776551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_type: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" + inclusion_exclusion_criteria: null + lab_name: "Mark M. 
Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + collected_by: null + grants: null + keywords_study: + - "contains_ig" + - "contains_tr" + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null + ethnicity: null + race: null + strain_name: null + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: null + disease_diagnosis: + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + + sample: + - sample_id: TW01A_B_naive + sample_processing_id: null + sample_type: "peripheral venous puncture" + tissue: + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" + cell_subset: + id: "CL_0000788" + label: "naive B cell" + cell_phenotype: "expression of CD20 and the absence of CD27" + cell_species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" + sequencing_files: + sequencing_data_id: SRA:SRR2905656 + file_type: fastq + filename: SRR2905656_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905656_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + index_filename: SRR2905656_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null + collection_time_point_relative_unit: + id: null + label: null + collection_time_point_reference: null + collection_location: + id: null + label: null + 
biomaterial_provider: null + cell_number: null + cells_per_reaction: null + cell_storage: false + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null + template_amount_unit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + primary_annotation: true + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + + - repertoire_id: 1602908186092376551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_type: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. 
Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" + inclusion_exclusion_criteria: null + lab_name: "Mark M. Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + collected_by: null + grants: null + keywords_study: + - "contains_ig" + - "contains_tr" + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null + ethnicity: null + race: null + strain_name: null + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: null + disease_diagnosis: + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + + sample: + - sample_id: TW01A_B_memory + sample_processing_id: null + sample_type: "peripheral venous puncture" + tissue: + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" + cell_subset: + id: "CL_0000787" + label: "memory B cell" + cell_phenotype: "expression of CD20 and CD27" + cell_species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: IGH + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" + sequencing_files: + sequencing_data_id: SRA:SRR2905655 + file_type: fastq + filename: 
SRR2905655_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905655_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + index_filename: SRR2905655_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null + collection_time_point_relative_unit: + id: null + label: null + collection_time_point_reference: null + collection_location: + id: null + label: null + biomaterial_provider: null + cell_number: null + cells_per_reaction: null + cell_storage: false + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null + template_amount_unit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null + data_processing: + - data_processing_id: 3059369183532618216-242ac11b-0001-007 + primary_annotation: true + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null + analysis_provenance_id: 6623294219256599016-242ac11c-0001-012 + + - repertoire_id: 2366080924918616551-242ac11c-0001-012 + study: + study_id: PRJNA300878 + study_title: "Homo sapiens B and T cell repertoire - MZ twins" + study_type: + id: null + label: null + study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." + study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" + inclusion_exclusion_criteria: null + lab_name: "Mark M. 
Davis" + lab_address: "Stanford University" + submitted_by: "Florian Rubelt" + pub_ids: ["PMID:27005435"] + collected_by: null + grants: null + keywords_study: + - "contains_ig" + - "contains_tr" + subject: + subject_id: TW01A + synthetic: false + species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + sex: female + age_min: 27 + age_max: 27 + age_unit: + id: UO_0000036 + label: year + age_event: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null + ethnicity: null + race: null + strain_name: null + linked_subjects: TW01B + link_type: twin + diagnosis: + - study_group_description: null + disease_diagnosis: + id: null + label: null + disease_length: null + disease_stage: null + prior_therapies: null + immunogen: null + intervention: null + medical_history: null + + sample: + - sample_id: TW01A_T_naive_CD4 + sample_processing_id: null + sample_type: "peripheral venous puncture" + tissue: + id: "UBERON_0000178" + label: "blood" + tissue_processing: "Ficoll gradient" + cell_subset: + id: "CL_0000895" + label: "naive thymus-derived CD4-positive, alpha-beta T cell" + cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" + cell_species: + id: "NCBITaxon_9606" + label: "Homo sapiens" + single_cell: false + cell_isolation: FACS + template_class: RNA + pcr_target: + - pcr_target_locus: TRB + forward_pcr_primer_target_location: null + reverse_pcr_primer_target_location: null + sequencing_platform: "Illumina MiSeq" + sequencing_files: + sequencing_data_id: SRA:SRR2905659 + file_type: fastq + filename: SRR2905659_R1.fastq.gz + read_direction: forward + read_length: 300 + paired_filename: SRR2905659_R2.fastq.gz + paired_read_direction: reverse + paired_read_length: 300 + index_filename: SRR2905659_R3.fastq.gz + index_length: 8 + anatomic_site: null + disease_state_sample: null + collection_time_point_relative: null + collection_time_point_relative_unit: + id: null + label: null + collection_time_point_reference: null + 
collection_location: + id: null + label: null + biomaterial_provider: null + cell_number: null + cells_per_reaction: null + cell_storage: false + cell_quality: null + cell_processing_protocol: null + template_quality: null + template_amount: null + template_amount_unit: + id: null + label: null + library_generation_method: "RT(oligo-dT)+PCR" + library_generation_protocol: null + library_generation_kit_version: null + complete_sequences: "partial" + physical_linkage: "none" + sequencing_run_id: null + total_reads_passing_qc_filter: null + sequencing_facility: null + sequencing_run_date: null + sequencing_kit: null + data_processing: + - data_processing_id: 651223970338378216-242ac11b-0001-007 + primary_annotation: true + software_versions: null + paired_reads_assembly: null + quality_thresholds: null + primer_match_cutoffs: null + collapsing_method: null + data_processing_protocols: null + data_processing_files: null + germline_database: null + analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 diff --git a/tests/data/output_blank.json b/tests/data/output_blank.json new file mode 100644 index 000000000..3476903ad --- /dev/null +++ b/tests/data/output_blank.json @@ -0,0 +1,231 @@ +{ + "Info": { + "title": "AIRR Data File", + "description": "AIRR Data File written by AIRR Standards Python Library", + "version": 1.4, + "contact": { + "name": "AIRR Community", + "url": "https://github.com/airr-community" + }, + "license": { + "name": "Creative Commons Attribution 4.0 International", + "url": "https://creativecommons.org/licenses/by/4.0/" + } + }, + "Repertoire": { + "repertoire_id": null, + "repertoire_name": null, + "repertoire_description": null, + "study": { + "study_id": null, + "study_title": null, + "study_type": { + "id": null, + "label": null + }, + "study_description": null, + "inclusion_exclusion_criteria": null, + "grants": null, + "study_contact": null, + "collected_by": null, + "lab_name": null, + "lab_address": null, + "submitted_by": null, + 
"pub_ids": [], + "keywords_study": [], + "adc_publish_date": null, + "adc_update_date": null + }, + "subject": { + "subject_id": null, + "synthetic": false, + "species": { + "id": null, + "label": null + }, + "sex": null, + "age_min": null, + "age_max": null, + "age_unit": { + "id": null, + "label": null + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": null, + "link_type": null, + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ], + "genotype": { + "receptor_genotype_set": { + "receptor_genotype_set_id": null, + "genotype_class_list": [ + { + "receptor_genotype_id": null, + "locus": "IGH", + "documented_alleles": [ + { + "label": null, + "germline_set_ref": null, + "phasing": null + } + ], + "undocumented_alleles": [ + { + "allele_name": null, + "sequence": "", + "phasing": null + } + ], + "deleted_genes": [ + { + "label": "", + "germline_set_ref": null, + "phasing": null + } + ], + "inference_process": null + } + ] + }, + "mhc_genotype_set": { + "mhc_genotype_set_id": null, + "mhc_genotype_list": [ + { + "mhc_genotype_id": null, + "mhc_class": "MHC-I", + "mhc_alleles": [ + { + "allele_designation": null, + "gene": { + "id": null, + "label": null + }, + "reference_set_ref": null + } + ], + "mhc_genotyping_method": null + } + ] + } + } + }, + "sample": [ + { + "sample_processing_id": null, + "sample_id": null, + "sample_type": null, + "tissue": { + "id": null, + "label": null + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + 
"collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "tissue_processing": null, + "cell_subset": { + "id": null, + "label": null + }, + "cell_phenotype": null, + "cell_species": { + "id": null, + "label": null + }, + "single_cell": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": null, + "cell_quality": null, + "cell_isolation": null, + "cell_processing_protocol": null, + "template_class": "DNA", + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "pcr_target": [ + { + "pcr_target_locus": null, + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_platform": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null, + "sequencing_files": { + "sequencing_data_id": null, + "file_type": null, + "filename": null, + "read_direction": null, + "read_length": null, + "paired_filename": null, + "paired_read_direction": null, + "paired_read_length": null, + "index_filename": null, + "index_length": null + } + } + ], + "data_processing": [ + { + "data_processing_id": null, + "primary_annotation": false, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": [], + "germline_database": null, + "germline_set_ref": null, + "analysis_provenance_id": null + } + ] + } +} \ No newline at end of file diff --git a/tests/data/output_data.json b/tests/data/output_data.json new file mode 100644 index 
000000000..f43d9973e --- /dev/null +++ b/tests/data/output_data.json @@ -0,0 +1,913 @@ +{ + "Info": { + "title": "AIRR Data File", + "description": "AIRR Data File written by AIRR Standards Python Library", + "version": 1.4, + "contact": { + "name": "AIRR Community", + "url": "https://github.com/airr-community" + }, + "license": { + "name": "Creative Commons Attribution 4.0 International", + "url": "https://creativecommons.org/licenses/by/4.0/" + } + }, + "Repertoire": { + "1841923116114776551-242ac11c-0001-012": { + "repertoire_id": "1841923116114776551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. 
Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_B_naive", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000788", + "label": "naive B cell" + }, + "cell_phenotype": "expression of CD20 and the absence of CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905656", + "file_type": "fastq", + "filename": "SRR2905656_R1.fastq.gz", + "read_direction": 
"forward", + "read_length": 300, + "paired_filename": "SRR2905656_R2.fastq.gz", + "paired_read_direction": "reverse", + "paired_read_length": 300, + "index_filename": "SRR2905656_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + "1602908186092376551-242ac11c-0001-012": { + "repertoire_id": "1602908186092376551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte 
antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_B_memory", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000787", + "label": "memory B cell" + }, + "cell_phenotype": "expression of CD20 and CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905655", + "file_type": "fastq", + "filename": "SRR2905655_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905655_R2.fastq.gz", + "paired_read_direction": "reverse", + 
"paired_read_length": 300, + "index_filename": "SRR2905655_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + "2366080924918616551-242ac11c-0001-012": { + "repertoire_id": "2366080924918616551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. 
However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_T_naive_CD4", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000895", + "label": "naive thymus-derived CD4-positive, alpha-beta T cell" + }, + "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "TRB", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905659", + "file_type": "fastq", + "filename": "SRR2905659_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": 
"SRR2905659_R2.fastq.gz", + "paired_read_direction": "reverse", + "paired_read_length": 300, + "index_filename": "SRR2905659_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "651223970338378216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012" + } + ] + } + }, + "GermlineSet": { + "OGRDB:G00007": { + "germline_set_id": "OGRDB:G00007", + "author": "William Lees", + "lab_name": "", + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [], + "release_version": 1, + "release_description": "", + "release_date": "2021-11-24", + "germline_set_name": "CAST IGH", + "germline_set_ref": "OGRDB:G00007.1", + "pub_ids": [ + "" + ], + "species": { + "id": 
"NCBITAXON:10090", + "label": "Mus musculus" + }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "locus": "IGH", + "allele_descriptions": [ + { + "allele_description_id": "OGRDB:A00301", + "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2DBF", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV5-3" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3", + "curational_tags": null + }, + { + "allele_description_id": "OGRDB:A00314", + "allele_description_ref": 
"OGRDB:Mouse_IGH:IGHV-2ETO", + "maintainer": "William Lees", + "acknowledgements": [], + "lab_address": "Birkbeck College, University of London, Malet Street, London", + "release_version": 1, + "release_date": "24-Nov-2021", + "release_description": "First release", + "label": "IGHV-2ETO", + "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aliases": [ + "watson_et_al:CAST_EiJ_IGHV8-2" + ], + "locus": "IGH", + "chromosome": null, + "sequence_type": "V", + "functional": true, + "inference_type": "rearranged_only", + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, + "species_subgroup": "CAST_EiJ", + "species_subgroup_type": "strain", + "status": "active", + "gene_designation": null, + "subgroup_designation": null, + "allele_designation": null, + "gene_start": null, + "gene_end": null, + "utr_5_prime_start": null, + "utr_5_prime_end": null, + "leader_1_start": null, + "leader_1_end": null, + "leader_2_start": null, + "leader_2_end": null, + "v_rs_start": null, + "v_rs_end": null, + "v_gene_delineations": [ + { + "sequence_delineation_id": "1", + "delineation_scheme": "IMGT", + "aligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "fwr1_start": 1, + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment": [ + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "87", + "88", + "89", + "90", + "91", + "92", + "93", + "94", + "95", + "96", + "97", + "98", + "99", + "100", + "101", + "102", + "103", + "104" + ] + } + ], + "unrearranged_support": [], + "rearranged_support": [], + "paralogs": [], + "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2", + "curational_tags": null + } + ], + "curation": null + } + }, + "GenotypeSet": { + "1": { + 
"receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + } + } +} \ No newline at end of file diff --git a/tests/data/output_rep.json b/tests/data/output_rep.json new file mode 100644 index 000000000..fa17a056d --- /dev/null +++ b/tests/data/output_rep.json @@ -0,0 +1,506 @@ +{ + "Info": { + "title": "Repertoire metadata", + "description": "Repertoire metadata written by AIRR Standards Python Library", + "version": 1.4, + "contact": { + "name": "AIRR Community", + "url": "https://github.com/airr-community" + }, + "license": { + "name": "Creative Commons Attribution 4.0 International", + "url": "https://creativecommons.org/licenses/by/4.0/" + } + }, + "Repertoire": [ + { + "repertoire_id": "1841923116114776551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_B_naive", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000788", + "label": "naive B cell" + }, + "cell_phenotype": "expression of CD20 and the absence of CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905656", + "file_type": "fastq", + "filename": "SRR2905656_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905656_R2.fastq.gz", + "paired_read_direction": "reverse", + 
"paired_read_length": 300, + "index_filename": "SRR2905656_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + { + "repertoire_id": "1602908186092376551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_B_memory", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000787", + "label": "memory B cell" + }, + "cell_phenotype": "expression of CD20 and CD27", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "IGH", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905655", + "file_type": "fastq", + "filename": "SRR2905655_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": "SRR2905655_R2.fastq.gz", + "paired_read_direction": "reverse", + 
"paired_read_length": 300, + "index_filename": "SRR2905655_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "3059369183532618216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012" + } + ] + }, + { + "repertoire_id": "2366080924918616551-242ac11c-0001-012", + "study": { + "study_id": "PRJNA300878", + "study_title": "Homo sapiens B and T cell repertoire - MZ twins", + "study_type": { + "id": null, + "label": null + }, + "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", + "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", + "inclusion_exclusion_criteria": null, + "lab_name": "Mark M. 
Davis", + "lab_address": "Stanford University", + "submitted_by": "Florian Rubelt", + "pub_ids": [ + "PMID:27005435" + ], + "collected_by": null, + "grants": null, + "keywords_study": [ + "contains_ig", + "contains_tr" + ] + }, + "subject": { + "subject_id": "TW01A", + "synthetic": false, + "species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "sex": "female", + "age_min": 27, + "age_max": 27, + "age_unit": { + "id": "UO_0000036", + "label": "year" + }, + "age_event": null, + "ancestry_population": { + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, + "ethnicity": null, + "race": null, + "strain_name": null, + "linked_subjects": "TW01B", + "link_type": "twin", + "diagnosis": [ + { + "study_group_description": null, + "disease_diagnosis": { + "id": null, + "label": null + }, + "disease_length": null, + "disease_stage": null, + "prior_therapies": null, + "immunogen": null, + "intervention": null, + "medical_history": null + } + ] + }, + "sample": [ + { + "sample_id": "TW01A_T_naive_CD4", + "sample_processing_id": null, + "sample_type": "peripheral venous puncture", + "tissue": { + "id": "UBERON_0000178", + "label": "blood" + }, + "tissue_processing": "Ficoll gradient", + "cell_subset": { + "id": "CL_0000895", + "label": "naive thymus-derived CD4-positive, alpha-beta T cell" + }, + "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO", + "cell_species": { + "id": "NCBITaxon_9606", + "label": "Homo sapiens" + }, + "single_cell": false, + "cell_isolation": "FACS", + "template_class": "RNA", + "pcr_target": [ + { + "pcr_target_locus": "TRB", + "forward_pcr_primer_target_location": null, + "reverse_pcr_primer_target_location": null + } + ], + "sequencing_platform": "Illumina MiSeq", + "sequencing_files": { + "sequencing_data_id": "SRA:SRR2905659", + "file_type": "fastq", + "filename": "SRR2905659_R1.fastq.gz", + "read_direction": "forward", + "read_length": 300, + "paired_filename": 
"SRR2905659_R2.fastq.gz", + "paired_read_direction": "reverse", + "paired_read_length": 300, + "index_filename": "SRR2905659_R3.fastq.gz", + "index_length": 8 + }, + "anatomic_site": null, + "disease_state_sample": null, + "collection_time_point_relative": null, + "collection_time_point_relative_unit": { + "id": null, + "label": null + }, + "collection_time_point_reference": null, + "collection_location": { + "id": null, + "label": null + }, + "biomaterial_provider": null, + "cell_number": null, + "cells_per_reaction": null, + "cell_storage": false, + "cell_quality": null, + "cell_processing_protocol": null, + "template_quality": null, + "template_amount": null, + "template_amount_unit": { + "id": null, + "label": null + }, + "library_generation_method": "RT(oligo-dT)+PCR", + "library_generation_protocol": null, + "library_generation_kit_version": null, + "complete_sequences": "partial", + "physical_linkage": "none", + "sequencing_run_id": null, + "total_reads_passing_qc_filter": null, + "sequencing_facility": null, + "sequencing_run_date": null, + "sequencing_kit": null + } + ], + "data_processing": [ + { + "data_processing_id": "651223970338378216-242ac11b-0001-007", + "primary_annotation": true, + "software_versions": null, + "paired_reads_assembly": null, + "quality_thresholds": null, + "primer_match_cutoffs": null, + "collapsing_method": null, + "data_processing_protocols": null, + "data_processing_files": null, + "germline_database": null, + "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012" + } + ] + } + ] +} \ No newline at end of file diff --git a/tests/data/warning_repertoire.json b/tests/data/warning_repertoire.json new file mode 100644 index 000000000..30699a4c6 --- /dev/null +++ b/tests/data/warning_repertoire.json @@ -0,0 +1 @@ +{"Info":{"title":"AIRR Data Commons API for VDJServer Community Data Portal","description":"VDJServer ADC API response for repertoire 
query","version":"1.3","contact":{"name":"VDJServer","url":"http://vdjserver.org/","email":"vdjserver@utsouthwestern.edu"}},"Repertoire":[{"repertoire_id":"1329892364225474070-242ac113-0001-012","study":{"study_id":"PRJNA593622","study_title":"Determinants governing T cell receptor α/β-chain pairing in repertoire formation of identical twins","study_type":{"id":"NCIT:C16084","label":"Observational Study"},"study_description":"The T cell repertoire in each individual includes T cell receptors (TCRs) of enormous sequence diversity through the pairing of diverse TCR α- and β-chains, each generated by somatic recombination of paralogous gene segments. Whether the TCR repertoire contributes to susceptibility to infectious or autoimmune diseases in concert with disease-associated major histocompatibility complex (MHC) polymorphisms is unknown. Due to a lack in high-throughput technologies to sequence TCR α-β pairs, current studies on whether the TCR repertoire is shaped by host genetics have so far relied only on single-chain analysis. Using a high-throughput single T cell sequencing technology, we obtained the largest paired TCRαβ dataset so far, comprising 965,523 clonotypes from 15 healthy individuals including 6 monozygotic twin pairs. Public TCR α- and, to a lesser extent, TCR β-chain sequences were common in all individuals. In contrast, sharing of entirely identical TCRαβ amino acid sequences was very infrequent in unrelated individuals, but highly increased in twins, in particular in CD4 memory T cells. Based on nucleotide sequence identity, a subset of these shared clonotypes appeared to be the progeny of T cells that had been generated during fetal development and had persisted for more than 50 y. 
Additional shared TCRαβ in twins were encoded by different nucleotide sequences, implying that genetic determinants impose structural constraints on thymic selection that favor the selection of TCR α-β pairs with entire sequence identities.\n","inclusion_exclusion_criteria":" ","lab_name":"Jörg J Goronzy","lab_address":"Stanford University School of Medicine","submitted_by":"Scott Christley, scott.christley@utsouthwestern.edu","collected_by":"Hidetaka Tanno, hidetakatanno@utexas.edu","grants":"This work was supported by NIH Grants U19 AI057266 (to G.G. and J.J.G.) and R01 AI129191 (to J.J.G.) and US Defense Threat Reduction Agency Grant HDTRA1-12-C-0105 (to G.G.). H.T. was supported by University of Texas Health Innovation for Cancer Prevention Research Training Program Postdoctoral Fellowship (Cancer Prevention and Research Institute of Texas Grant RP160015), Japan Society for the Promotion of Science Postdoctoral Fellowships for Research Abroad, and Uehara Memorial Foundation Research Fellowship.","pub_ids":"PMID:31879353","keywords_study":["contains_tcr","contains_paired_chain"],"vdjserver_uuid":"1400363782577197546-242ac113-0001-012"},"subject":{"subject_id":"A1","synthetic":false,"species":{"id":"NCBITaxon:9606","label":"Homo sapiens"},"sex":"female","age_min":61,"age_max":61,"age_unit":{"id":"UO:0000036","label":"year"},"linked_subjects":"A2","link_type":"twin","diagnosis":[{"disease_diagnosis":{}}],"mhc":["HLA-A*30:02","HLA-A*31:01","HLA-B*35:02","HLA-B*38:01","HLA-C*04:01","HLA-C*12:03","HLA-DRB1*04:02","HLA-DRB1*04:03","HLA-DRB4*01:03","HLA-DQB1*03:02","HLA-DQB1*03:05"],"vdjserver_uuid":"4743918918142914070-242ac113-0001-012"},"sample":[{"sample_id":"A1_CD4_naive_TRB","tissue":{"id":"UBERON:0013756","label":"venous blood"},"biomaterial_provider":"Stanford University, CA","tissue_processing":"Peripheral blood mononuclear cells (PBMCs) were isolated by density centrifugation using Ficoll media at a density of 1.077 
g/mL.","cell_subset":{"id":"CL:0000895","label":"naive thymus-derived CD4-positive, alpha-beta T cell"},"cell_phenotype":"CD4+CD45RA+CCR7+","cell_species":{},"single_cell":false,"cell_storage":true,"cell_isolation":"magnetic-bead–based negative EasySep selection reagents","template_class":"RNA","library_generation_method":"RT(oligo-dT)+PCR","pcr_target":[{"pcr_target_locus":"TRB"}],"complete_sequences":"partial","physical_linkage":"hetero_head-head","sequencing_run_id":"SRR10600326","sequencing_platform":"Illumina MiSeq","sequencing_files":{"file_type":"fastq","filename":"SRR10600326.sra_1.fastq.gz","read_length":300},"vdjserver_uuid":"4055006163864514070-242ac113-0001-012"},{"sample_id":"A1_CD4_naive_TRA","tissue":{"id":"UBERON:0013756","label":"venous blood"},"biomaterial_provider":"Stanford University, CA","tissue_processing":"Peripheral blood mononuclear cells (PBMCs) were isolated by density centrifugation using Ficoll media at a density of 1.077 g/mL.","cell_subset":{"id":"CL:0000895","label":"naive thymus-derived CD4-positive, alpha-beta T cell"},"cell_phenotype":"CD4+CD45RA+CCR7+","cell_species":{},"single_cell":false,"cell_storage":true,"cell_isolation":"magnetic-bead–based negative EasySep selection reagents","template_class":"RNA","library_generation_method":"RT(oligo-dT)+PCR","pcr_target":[{"pcr_target_locus":"TRA"}],"complete_sequences":"partial","physical_linkage":"hetero_head-head","sequencing_run_id":"SRR10600326","sequencing_platform":"Illumina MiSeq","sequencing_files":{"file_type":"fastq","filename":"SRR10600326.sra_2.fastq.gz","read_length":300},"vdjserver_uuid":"3987789925682114070-242ac113-0001-012"}],"data_processing":[{"data_processing_id":"65112922-e976-40d9-9dff-6b581acc745f-007","primary_annotation":true,"software_versions":"IgBlast 1.14","data_processing_files":["SRR10600326.sra_1.igblast.airr.tsv.gz","SRR10600326.sra_2.igblast.airr.tsv.gz"],"germline_database":"VDJServer IMGT 
2019.01.23","vdjserver_uuid":"2248499969493954070-242ac113-0001-012"}]}]} \ No newline at end of file From 6927e1125bb81b9df8efd2f5e2f398141f0c5c1b Mon Sep 17 00:00:00 2001 From: Brian Corrie Date: Sun, 11 Feb 2024 09:34:20 -0800 Subject: [PATCH 04/15] Minor change Mostly so I can create a pull request... --- specs/airr-schema.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index 606f773f5..260a37636 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -3196,7 +3196,7 @@ Repertoire: nullable: false adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# A group of repertoires for analysis purposes, includes optional time course RepertoireGroup: type: object required: From 9616de2bbc22ba514830a793b027009a7b2799a7 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Sat, 17 Feb 2024 18:27:34 -0600 Subject: [PATCH 05/15] Update descriptions --- specs/airr-schema.yaml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index 260a37636..e4b6e34b8 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -371,7 +371,8 @@ DataFile: nullable: false RepertoireGroup: type: array - description: List of repertoire collections + description: List of repertoire groups + items: $ref: '#/RepertoireGroup' x-airr: @@ -3196,7 +3197,8 @@ Repertoire: nullable: false adc-query-support: true -# A group of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3205,19 +3207,19 @@ RepertoireGroup: properties: repertoire_group_id: type: string - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: 
identifier: true repertoire_group_name: type: string - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string - description: Repertoire collection description + description: Repertoire group description repertoires: type: array description: > - List of repertoires in this collection with an associated description and time point designation + List of repertoires in this group with an associated description and time point designation items: type: object properties: From 559075400f30f785da17bc41b18974e805b3ca13 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Wed, 21 Feb 2024 19:19:55 -0600 Subject: [PATCH 06/15] update descriptions --- lang/R/inst/extdata/airr-schema.yaml | 13 +++++++------ lang/python/airr/specs/airr-schema.yaml | 13 +++++++------ specs/airr-schema-openapi3.yaml | 13 +++++++------ specs/airr-schema.yaml | 1 - 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml index 606f773f5..fe21bbb75 100644 --- a/lang/R/inst/extdata/airr-schema.yaml +++ b/lang/R/inst/extdata/airr-schema.yaml @@ -371,7 +371,7 @@ DataFile: nullable: false RepertoireGroup: type: array - description: List of repertoire collections + description: List of repertoire groups items: $ref: '#/RepertoireGroup' x-airr: @@ -3196,7 +3196,8 @@ Repertoire: nullable: false adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3205,19 +3206,19 @@ RepertoireGroup: properties: repertoire_group_id: type: string - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: identifier: true 
repertoire_group_name: type: string - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string - description: Repertoire collection description + description: Repertoire group description repertoires: type: array description: > - List of repertoires in this collection with an associated description and time point designation + List of repertoires in this group with an associated description and time point designation items: type: object properties: diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml index 606f773f5..fe21bbb75 100644 --- a/lang/python/airr/specs/airr-schema.yaml +++ b/lang/python/airr/specs/airr-schema.yaml @@ -371,7 +371,7 @@ DataFile: nullable: false RepertoireGroup: type: array - description: List of repertoire collections + description: List of repertoire groups items: $ref: '#/RepertoireGroup' x-airr: @@ -3196,7 +3196,8 @@ Repertoire: nullable: false adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3205,19 +3206,19 @@ RepertoireGroup: properties: repertoire_group_id: type: string - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: identifier: true repertoire_group_name: type: string - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string - description: Repertoire collection description + description: Repertoire group description repertoires: type: array description: > - List of repertoires in this collection with an associated description and time point designation + 
List of repertoires in this group with an associated description and time point designation items: type: object properties: diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml index bba3a45d8..52efed116 100644 --- a/specs/airr-schema-openapi3.yaml +++ b/specs/airr-schema-openapi3.yaml @@ -368,7 +368,7 @@ DataFile: RepertoireGroup: type: array nullable: false - description: List of repertoire collections + description: List of repertoire groups items: $ref: '#/RepertoireGroup' Rearrangement: @@ -3298,7 +3298,8 @@ Repertoire: x-airr: adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3308,22 +3309,22 @@ RepertoireGroup: repertoire_group_id: type: string nullable: true - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: identifier: true repertoire_group_name: type: string nullable: true - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string nullable: true - description: Repertoire collection description + description: Repertoire group description repertoires: type: array nullable: true description: > - List of repertoires in this collection with an associated description and time point designation + List of repertoires in this group with an associated description and time point designation items: type: object properties: diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index e4b6e34b8..fe21bbb75 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -372,7 +372,6 @@ DataFile: RepertoireGroup: type: array description: List of repertoire groups - items: $ref: '#/RepertoireGroup' x-airr: From 
b027e9f9d33092ec9a150dcd9dcac56967c75c07 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Wed, 21 Feb 2024 18:31:03 -0600 Subject: [PATCH 07/15] array of extensions --- specs/adc-api-openapi3.yaml | 6 ++++++ specs/adc-api.yaml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/specs/adc-api-openapi3.yaml b/specs/adc-api-openapi3.yaml index 38a0f2fb9..5a909f1c3 100644 --- a/specs/adc-api-openapi3.yaml +++ b/specs/adc-api-openapi3.yaml @@ -73,6 +73,12 @@ components: type: integer last_update: type: string + extensions: + type: array + items: + type: string + enum: + - async_api api: $ref: '#/components/schemas/info_object' schema: diff --git a/specs/adc-api.yaml b/specs/adc-api.yaml index db5a28372..24142bb2d 100644 --- a/specs/adc-api.yaml +++ b/specs/adc-api.yaml @@ -62,6 +62,12 @@ definitions: type: integer last_update: type: string + extensions: + type: array + items: + type: string + enum: + - async_api api: description: Provides information about the ADC API implemented by this repository service. $ref: '#/definitions/info_object' From 3f834c13929054e0e8c7097a21ddfeac20ca28b8 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Wed, 21 Feb 2024 18:39:17 -0600 Subject: [PATCH 08/15] update docs --- docs/api/adc_api_overview.rst | 2 ++ docs/api/adc_api_requests.rst | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/api/adc_api_overview.rst b/docs/api/adc_api_overview.rst index cffd157d5..31f520edb 100644 --- a/docs/api/adc_api_overview.rst +++ b/docs/api/adc_api_overview.rst @@ -60,6 +60,8 @@ to be followed. (timeout) should be used if the API does not complete an operation because of an internal time limit, and HTTP 413 (Content too large) should be returned when either max_size or max_query_size are exceeded. +* Extensions beyond the standard API, e.g., support for the Async API, should be specified + with the `extensions` property in the `/info` endpoint. 
Repository operation principles ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/api/adc_api_requests.rst b/docs/api/adc_api_requests.rst index 528139a6f..9b19a5995 100644 --- a/docs/api/adc_api_requests.rst +++ b/docs/api/adc_api_requests.rst @@ -110,7 +110,8 @@ of the queries sent to the repository. } }, "max_size": 1000, - "max_query_size": 2097152 + "max_query_size": 2097152, + "extensions": ["async_api"] } **Query Repertoire Example** From 6fe6633a62960e33698b2989b3481f08a09010e9 Mon Sep 17 00:00:00 2001 From: Christian Busse Date: Mon, 15 Jan 2024 02:51:46 +0100 Subject: [PATCH 09/15] Add Contributor record, adapt Study object accordingly --- specs/airr-schema-openapi3.yaml | 177 +++++++++++++++++++------------- 1 file changed, 103 insertions(+), 74 deletions(-) diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml index 52efed116..6d5148be3 100644 --- a/specs/airr-schema-openapi3.yaml +++ b/specs/airr-schema-openapi3.yaml @@ -486,34 +486,111 @@ TimePoint: # General objects # -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + description: unique identifier of this contributor within the file x-airr: identifier: true miairr: important nullable: true name: type: string + nullable: false + description: Full name of contributor + orcid_id: + $ref: '#/Ontology' nullable: true - description: Full name of individual - institution_name: - type: string + description: > + ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. 
+ title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' nullable: true - description: Individual's department and institution name - orcid_id: + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. + title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: type: string nullable: true - description: Individual's ORCID identifier + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. + example: Department for Psychoceramics + contributions: + type: array + nullable: true + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + +ContributorContribution: + type: object + required: + - role + properties: + role: + type: string + nullable: false + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + degree: + type: string + nullable: true + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. 
+ enum: + - lead + - equal + - supporting + # # Germline gene schema @@ -849,7 +926,7 @@ AlleleDescription: nullable: true description: List of individuals whose contribution to the gene description should be acknowledged items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' lab_address: type: string nullable: true @@ -1192,7 +1269,7 @@ GermlineSet: nullable: true description: List of individuals whose contribution to the germline set should be acknowledged items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number nullable: true @@ -1702,71 +1779,23 @@ Study: set: 1 subset: study name: Grant funding agency - study_contact: - type: string - nullable: true - description: > - Full contact information of the contact persons for this study This should include an e-mail address - and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 - x-airr: - adc-query-support: true - name: Contact information (study) - collected_by: - type: string - nullable: true - description: > - Full contact information of the data collector, i.e. the person who is legally responsible for data - collection and release. This should include an e-mail address and a persistent identifier such as an - ORCID ID. - title: Contact information (data collection) - example: Dr. P. 
Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 - x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) - lab_name: - type: string - nullable: true - description: Department of data collector - title: Lab name - example: Department for Planar Immunology - x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab name - lab_address: - type: string - nullable: true - description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World - x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab address - submitted_by: - type: string - nullable: true + contributors: + type: array + nullable: false description: > - Full contact information of the data depositor, i.e., the person submitting the data to a repository. - This should include an e-mail address and a persistent identifier such as an ORCID ID. This is - supposed to be a short-lived and technical role until the submission is relased. - title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 + List of individuals who contributed to the study. Note that these are not necessarily identical with the + authors on an associated manuscript or other scholarly communication. Further note that at least the + following three CRediT contributor roles "supervision", "investigation" and "data curation" should be + assigned. 
+ title: Contributors + items: + $ref: '#/Contributor' x-airr: - miairr: important + miairr: essential adc-query-support: true set: 1 subset: study - name: Contact information (data deposition) + name: Contributors pub_ids: type: array items: From 505a50f67677b61e8c5d16de03b572fd1a3458aa Mon Sep 17 00:00:00 2001 From: Christian Busse Date: Wed, 21 Feb 2024 02:15:43 +0100 Subject: [PATCH 10/15] Update v3 Schema, sync to v2 Schema and its copies --- lang/R/inst/extdata/airr-schema.yaml | 239 ++++++++++++++++-------- lang/python/airr/specs/airr-schema.yaml | 239 ++++++++++++++++-------- specs/airr-schema-openapi3.yaml | 124 +++++++----- specs/airr-schema.yaml | 239 ++++++++++++++++-------- 4 files changed, 552 insertions(+), 289 deletions(-) diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml index fe21bbb75..87a25b5bd 100644 --- a/lang/R/inst/extdata/airr-schema.yaml +++ b/lang/R/inst/extdata/airr-schema.yaml @@ -493,30 +493,115 @@ TimePoint: # TODO: link to global schema with JSON-LD? # -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + description: Unique identifier of this contributor within the file x-airr: + nullable: true identifier: true miairr: important name: type: string - description: Full name of individual - institution_name: - type: string - description: Individual's department and institution name + description: Full name of contributor + x-airr: + nullable: false orcid_id: + $ref: '#/Ontology' + description: > + ORCID identifier of the contributor. 
Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. + title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. + title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: + type: string + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. + example: Department for Psychoceramics + x-airr: + nullable: true + contributions: + type: array + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + x-airr: + nullable: true + +ContributorContribution: + type: object + required: + - role + properties: + role: type: string - description: Individual's ORCID identifier + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + x-airr: + nullable: false + degree: + type: string + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. 
+ enum: + - lead + - equal + - supporting + x-airr: + nullable: true # # Germline gene schema @@ -783,8 +868,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -810,21 +894,15 @@ AlleleDescription: example: OGRDB:Human_IGH:IGHV1-69*01.001 x-airr: miairr: important - maintainer: - type: string - description: Maintainer of this sequence record - x-airr: - miairr: defined acknowledgements: type: array - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. items: - $ref: '#/Acknowledgement' - lab_address: - type: string - description: Institution and full address of corresponding author - x-airr: - miairr: defined + $ref: '#/Contributor' release_version: type: integer description: Version number of this record, updated whenever a revised version is published or released @@ -1084,9 +1162,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. 
required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1104,26 +1180,15 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - description: Corresponding author - x-airr: - miairr: important - lab_name: - type: string - description: Department of corresponding author - x-airr: - miairr: important - lab_address: - type: string - description: Institutional address of corresponding author - x-airr: - miairr: important acknowledgements: type: array - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number description: Version number of this record, allocated automatically @@ -1593,71 +1658,81 @@ Study: set: 1 subset: study name: Grant funding agency + contributors: + type: array + description: > + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. 
+ title: Contributors + items: + $ref: '#/Contributor' + x-airr: + nullable: false + miairr: essential + adc-query-support: true + set: 1 + subset: study + name: Contributors study_contact: type: string description: > Full contact information of the contact persons for this study This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - nullable: true - adc-query-support: true - name: Contact information (study) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors collected_by: type: string description: > Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (data collection) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_name: type: string description: Department of data collector - title: Lab name - example: Department for Planar Immunology x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab name + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors lab_address: type: string description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab address + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors submitted_by: type: string description: > Full contact information of the data depositor, i.e., the person submitting the data to a repository. This should include an e-mail address and a persistent identifier such as an ORCID ID. This is supposed to be a short-lived and technical role until the submission is relased. - title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data deposition) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors pub_ids: type: array items: diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml index fe21bbb75..87a25b5bd 100644 --- a/lang/python/airr/specs/airr-schema.yaml +++ b/lang/python/airr/specs/airr-schema.yaml @@ -493,30 +493,115 @@ TimePoint: # TODO: link to global schema with JSON-LD? 
# -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + description: Unique identifier of this contributor within the file x-airr: + nullable: true identifier: true miairr: important name: type: string - description: Full name of individual - institution_name: - type: string - description: Individual's department and institution name + description: Full name of contributor + x-airr: + nullable: false orcid_id: + $ref: '#/Ontology' + description: > + ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. + title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. + title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: + type: string + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. 
+ example: Department for Psychoceramics + x-airr: + nullable: true + contributions: + type: array + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + x-airr: + nullable: true + +ContributorContribution: + type: object + required: + - role + properties: + role: type: string - description: Individual's ORCID identifier + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + x-airr: + nullable: false + degree: + type: string + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. + enum: + - lead + - equal + - supporting + x-airr: + nullable: true # # Germline gene schema @@ -783,8 +868,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -810,21 +894,15 @@ AlleleDescription: example: OGRDB:Human_IGH:IGHV1-69*01.001 x-airr: miairr: important - maintainer: - type: string - description: Maintainer of this sequence record - x-airr: - miairr: defined acknowledgements: type: array - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. 
items: - $ref: '#/Acknowledgement' - lab_address: - type: string - description: Institution and full address of corresponding author - x-airr: - miairr: defined + $ref: '#/Contributor' release_version: type: integer description: Version number of this record, updated whenever a revised version is published or released @@ -1084,9 +1162,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1104,26 +1180,15 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - description: Corresponding author - x-airr: - miairr: important - lab_name: - type: string - description: Department of corresponding author - x-airr: - miairr: important - lab_address: - type: string - description: Institutional address of corresponding author - x-airr: - miairr: important acknowledgements: type: array - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number description: Version number of this record, allocated automatically @@ -1593,71 +1658,81 @@ Study: set: 1 subset: study name: Grant funding agency + contributors: + type: array + description: > + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. 
Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. + title: Contributors + items: + $ref: '#/Contributor' + x-airr: + nullable: false + miairr: essential + adc-query-support: true + set: 1 + subset: study + name: Contributors study_contact: type: string description: > Full contact information of the contact persons for this study This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - nullable: true - adc-query-support: true - name: Contact information (study) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors collected_by: type: string description: > Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (data collection) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors lab_name: type: string description: Department of data collector - title: Lab name - example: Department for Planar Immunology x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab name + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_address: type: string description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab address + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors submitted_by: type: string description: > Full contact information of the data depositor, i.e., the person submitting the data to a repository. This should include an e-mail address and a persistent identifier such as an ORCID ID. This is supposed to be a short-lived and technical role until the submission is relased. - title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data deposition) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors pub_ids: type: array items: diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml index 6d5148be3..1ae5ad012 100644 --- a/specs/airr-schema-openapi3.yaml +++ b/specs/airr-schema-openapi3.yaml @@ -497,11 +497,11 @@ Contributor: properties: contributor_id: type: string - description: unique identifier of this contributor within the file + nullable: true + description: Unique identifier of this contributor within the file x-airr: identifier: true miairr: important - nullable: true name: type: string nullable: false @@ -886,8 +886,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -915,24 +914,16 @@ AlleleDescription: miairr: important description: Unique reference to the allele description, in standardized form (Repo:Label:Version) example: OGRDB:Human_IGH:IGHV1-69*01.001 - maintainer: - type: string - nullable: true - x-airr: - miairr: defined - description: Maintainer of this sequence record acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. items: $ref: '#/Contributor' - lab_address: - type: string - nullable: true - x-airr: - miairr: defined - description: Institution and full address of corresponding author release_version: type: integer nullable: true @@ -1225,9 +1216,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. 
required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1246,28 +1235,14 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - nullable: true - x-airr: - miairr: important - description: Corresponding author - lab_name: - type: string - nullable: true - x-airr: - miairr: important - description: Department of corresponding author - lab_address: - type: string - nullable: true - x-airr: - miairr: important - description: Institutional address of corresponding author acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. items: $ref: '#/Contributor' release_version: @@ -1783,10 +1758,10 @@ Study: type: array nullable: false description: > - List of individuals who contributed to the study. Note that these are not necessarily identical with the - authors on an associated manuscript or other scholarly communication. Further note that at least the - following three CRediT contributor roles "supervision", "investigation" and "data curation" should be - assigned. + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. 
title: Contributors items: $ref: '#/Contributor' @@ -1796,6 +1771,69 @@ Study: set: 1 subset: study name: Contributors + study_contact: + type: string + nullable: true + description: > + Full contact information of the contact persons for this study This should include an e-mail address + and a persistent identifier such as an ORCID ID. + x-airr: + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors + collected_by: + type: string + nullable: true + description: > + Full contact information of the data collector, i.e. the person who is legally responsible for data + collection and release. This should include an e-mail address and a persistent identifier such as an + ORCID ID. + x-airr: + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors + lab_name: + type: string + nullable: true + description: Department of data collector + x-airr: + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors + lab_address: + type: string + nullable: true + description: Institution and institutional address of data collector + x-airr: + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors + submitted_by: + type: string + nullable: true + description: > + Full contact information of the data depositor, i.e., the person submitting the data to a repository. 
+ This should include an e-mail address and a persistent identifier such as an ORCID ID. This is + supposed to be a short-lived and technical role until the submission is relased. + x-airr: + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors pub_ids: type: array items: diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index fe21bbb75..87a25b5bd 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -493,30 +493,115 @@ TimePoint: # TODO: link to global schema with JSON-LD? # -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + description: Unique identifier of this contributor within the file x-airr: + nullable: true identifier: true miairr: important name: type: string - description: Full name of individual - institution_name: - type: string - description: Individual's department and institution name + description: Full name of contributor + x-airr: + nullable: false orcid_id: + $ref: '#/Ontology' + description: > + ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. 
+ title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. + title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + nullable: true + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: + type: string + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. + example: Department for Psychoceramics + x-airr: + nullable: true + contributions: + type: array + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + x-airr: + nullable: true + +ContributorContribution: + type: object + required: + - role + properties: + role: type: string - description: Individual's ORCID identifier + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + x-airr: + nullable: false + degree: + type: string + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. 
+ enum: + - lead + - equal + - supporting + x-airr: + nullable: true # # Germline gene schema @@ -783,8 +868,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -810,21 +894,15 @@ AlleleDescription: example: OGRDB:Human_IGH:IGHV1-69*01.001 x-airr: miairr: important - maintainer: - type: string - description: Maintainer of this sequence record - x-airr: - miairr: defined acknowledgements: type: array - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. items: - $ref: '#/Acknowledgement' - lab_address: - type: string - description: Institution and full address of corresponding author - x-airr: - miairr: defined + $ref: '#/Contributor' release_version: type: integer description: Version number of this record, updated whenever a revised version is published or released @@ -1084,9 +1162,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. 
required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1104,26 +1180,15 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - description: Corresponding author - x-airr: - miairr: important - lab_name: - type: string - description: Department of corresponding author - x-airr: - miairr: important - lab_address: - type: string - description: Institutional address of corresponding author - x-airr: - miairr: important acknowledgements: type: array - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number description: Version number of this record, allocated automatically @@ -1593,71 +1658,81 @@ Study: set: 1 subset: study name: Grant funding agency + contributors: + type: array + description: > + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. 
+ title: Contributors + items: + $ref: '#/Contributor' + x-airr: + nullable: false + miairr: essential + adc-query-support: true + set: 1 + subset: study + name: Contributors study_contact: type: string description: > Full contact information of the contact persons for this study This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - nullable: true - adc-query-support: true - name: Contact information (study) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors collected_by: type: string description: > Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (data collection) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_name: type: string description: Department of data collector - title: Lab name - example: Department for Planar Immunology x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab name + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors lab_address: type: string description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Lab address + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors submitted_by: type: string description: > Full contact information of the data depositor, i.e., the person submitting the data to a repository. This should include an e-mail address and a persistent identifier such as an ORCID ID. This is supposed to be a short-lived and technical role until the submission is relased. - title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - nullable: true - adc-query-support: true - set: 1 - subset: study - name: Contact information (data deposition) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors pub_ids: type: array items: From e1dab4a04eb6bda6ba6927bb48abbe32e8650176 Mon Sep 17 00:00:00 2001 From: Christian Busse Date: Wed, 21 Feb 2024 03:18:22 +0100 Subject: [PATCH 11/15] Update R and Python Schema files --- lang/R/R/Schema.R | 2 +- lang/python/airr/schema.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lang/R/R/Schema.R b/lang/R/R/Schema.R index da7cb1056..c6925e67f 100644 --- a/lang/R/R/Schema.R +++ b/lang/R/R/Schema.R @@ -312,7 +312,7 @@ AIRRSchema <- list("Info"=load_schema("InfoObject"), "SequencingData"=load_schema("SequencingData"), "DataProcessing"=load_schema("DataProcessing"), "GermlineSet"=load_schema("GermlineSet"), - "Acknowledgement"=load_schema("Acknowledgement"), + "Contributor"=load_schema("Contributor"), "RearrangedSequence"=load_schema("RearrangedSequence"), "UnrearrangedSequence"=load_schema("UnrearrangedSequence"), "SequenceDelineationV"=load_schema("SequenceDelineationV"), diff --git a/lang/python/airr/schema.py b/lang/python/airr/schema.py index 28967d33c..28de0a859 100644 --- a/lang/python/airr/schema.py +++ b/lang/python/airr/schema.py @@ -544,7 +544,7 @@ def _default(spec): 'SequencingData': Schema('SequencingData'), 'DataProcessing': Schema('DataProcessing'), 'GermlineSet': Schema('GermlineSet'), - 'Acknowledgement': Schema('Acknowledgement'), + 'Contributor': Schema('Contributor'), 'RearrangedSequence': Schema('RearrangedSequence'), 'UnrearrangedSequence': Schema('UnrearrangedSequence'), 'SequenceDelineationV': Schema('SequenceDelineationV'), From 44b04655add076901b109be65e5f925198cd4d05 Mon Sep 17 00:00:00 2001 From: Christian Busse Date: Wed, 21 Feb 2024 03:11:27 +0100 Subject: [PATCH 12/15] Update example and test files for R and Python --- lang/R/inst/extdata/airr-schema.yaml | 5 +- lang/R/inst/extdata/germline-example.json | 182 ++-- lang/R/inst/extdata/repertoire-example.yaml | 182 +++- lang/R/tests/data-tests/bad_genotype_set.json | 86 +- 
lang/R/tests/data-tests/bad_germline_set.json | 102 ++- lang/R/tests/data-tests/bad_repertoire.yaml | 96 +- .../tests/data-tests/good_combined_airr.json | 449 +++++++-- .../tests/data-tests/good_combined_airr.yaml | 287 ++++-- .../R/tests/data-tests/good_genotype_set.json | 74 +- .../R/tests/data-tests/good_germline_set.json | 108 ++- lang/R/tests/data-tests/good_repertoire.yaml | 146 ++- lang/python/airr/specs/airr-schema.yaml | 5 +- lang/python/tests/data/bad_genotype_set.json | 2 +- lang/python/tests/data/bad_germline_set.json | 94 +- lang/python/tests/data/bad_repertoire.yaml | 90 +- .../python/tests/data/good_combined_airr.json | 349 +++++-- .../python/tests/data/good_combined_airr.yaml | 856 ++++++++++-------- lang/python/tests/data/good_genotype_set.json | 2 +- lang/python/tests/data/good_germline_set.json | 108 ++- lang/python/tests/data/good_repertoire.yaml | 128 ++- specs/airr-schema-openapi3.yaml | 5 +- specs/airr-schema.yaml | 5 +- 22 files changed, 2364 insertions(+), 997 deletions(-) diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml index 87a25b5bd..dd2c0c241 100644 --- a/lang/R/inst/extdata/airr-schema.yaml +++ b/lang/R/inst/extdata/airr-schema.yaml @@ -1571,10 +1571,7 @@ Study: - study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: diff --git a/lang/R/inst/extdata/germline-example.json b/lang/R/inst/extdata/germline-example.json index 926b6d428..9d41e5f38 100644 --- a/lang/R/inst/extdata/germline-example.json +++ b/lang/R/inst/extdata/germline-example.json @@ -1,17 +1,41 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": 
"ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", - "pub_ids": "", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "pub_ids": [""], + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -19,11 +43,33 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -36,7 +82,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": 
"NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -70,7 +119,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -187,11 +236,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -204,7 +275,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -238,7 +312,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -356,40 +430,40 @@ "curation": null }], - "GenotypeSet": 
[{ - "receptor_genotype_set_id": "1", - "genotype_class_list": [ - { - "receptor_genotype_id": "1", - "locus": "IGH", - "documented_alleles": [ - { - "label": "IGHV1-69*01", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 1 - }, - { - "label": "IGHV1-69*02", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 2 - } - ], - "undocumented_alleles": [ - { - "allele_name": "IGHD3-1*01_S1234", - "sequence": "agtagtagtagt", - "phasing": 1 - } - ], - "deleted_genes": [ - { - "label": "IGHV3-30-3", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 1 - } - ], - "inference_process": "repertoire_sequencing" - } - ] - }] + "GenotypeSet": [{ + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] } diff --git a/lang/R/inst/extdata/repertoire-example.yaml b/lang/R/inst/extdata/repertoire-example.yaml index 5d6808bcc..6adaa2361 100644 --- a/lang/R/inst/extdata/repertoire-example.yaml +++ b/lang/R/inst/extdata/repertoire-example.yaml @@ -11,31 +11,58 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" - pub_ids: "PMID:27005435" - collected_by: null + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null + pub_ids: ["PMID:27005435"] grants: null - keywords_study: + keywords_study: - "contains_ig" - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null - ancestry_population: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null ethnicity: null race: null strain_name: null @@ -58,15 +85,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000788" + id: "CL:0000788" label: "naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -77,7 +104,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905656 + sequencing_data_id: SRA:SRR2905656 file_type: fastq filename: SRR2905656_R1.fastq.gz read_direction: forward @@ -85,6 +112,8 @@ Repertoire: paired_filename: SRR2905656_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905656_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null @@ -92,6 +121,9 @@ Repertoire: id: null label: null collection_time_point_reference: null + collection_location: + id: null + label: null biomaterial_provider: null 
cell_number: null cells_per_reaction: null @@ -134,13 +166,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" - pub_ids: "PMID:27005435" - collected_by: null + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null + pub_ids: ["PMID:27005435"] grants: null keywords_study: - "contains_ig" @@ -149,16 +203,21 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null - ancestry_population: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null ethnicity: null race: null strain_name: null @@ -181,15 +240,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000787" + id: "CL:0000787" label: "memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -200,7 +259,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905655 + sequencing_data_id: SRA:SRR2905655 file_type: fastq filename: SRR2905655_R1.fastq.gz read_direction: forward @@ -208,6 +267,8 @@ Repertoire: paired_filename: SRR2905655_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905655_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null @@ -215,6 +276,9 @@ Repertoire: id: null label: null collection_time_point_reference: null + collection_location: + id: null + label: null biomaterial_provider: null cell_number: null 
cells_per_reaction: null @@ -257,13 +321,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" - pub_ids: "PMID:27005435" - collected_by: null + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null + pub_ids: ["PMID:27005435"] grants: null keywords_study: - "contains_ig" @@ -272,16 +358,21 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null - ancestry_population: null + ancestry_population: + id: null + label: null + location_birth: + id: null + label: null ethnicity: null race: null strain_name: null @@ -304,15 +395,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000895" + id: "CL:0000895" label: "naive thymus-derived CD4-positive, alpha-beta T cell" cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -323,7 +414,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905659 + sequencing_data_id: SRA:SRR2905659 file_type: fastq filename: SRR2905659_R1.fastq.gz read_direction: forward @@ -331,6 +422,8 @@ Repertoire: paired_filename: SRR2905659_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905659_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null @@ -338,6 +431,9 @@ Repertoire: id: null label: null collection_time_point_reference: null + collection_location: + id: null + label: 
null biomaterial_provider: null cell_number: null cells_per_reaction: null diff --git a/lang/R/tests/data-tests/bad_genotype_set.json b/lang/R/tests/data-tests/bad_genotype_set.json index 48825e1f8..01709d60a 100644 --- a/lang/R/tests/data-tests/bad_genotype_set.json +++ b/lang/R/tests/data-tests/bad_genotype_set.json @@ -1,44 +1,44 @@ { - "GenotypeSet": [{ - "receptor_genotype_set_id": "1", - "genotype_class_list": [ - { - "receptor_genotype_id": "1", - "locus": 1, - "documented_alleles": [ - { - "label": "IGHV1-69*01", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 1 - }, - { - "label": "IGHV1-69*02", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 2 - }, - { - "label": "IGHV1-69*02", - "name": "1234", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 2 - } - ], - "undocumented_alleles": [ - { - "allele_name": "IGHD3-1*01_S1234", - "sequence": "agtagtagtagt", - "phasing": 1 - } - ], - "deleted_genes": [ - { - "label": "IGHV3-30-3", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": "1" - } - ], - "inference_process": "repertoire_sequencing" - } - ] - }] -} \ No newline at end of file + "GenotypeSet": [{ + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + }, + { + "label": "IGHV1-69*02", + "name": "1234", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": "1" + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] +} diff --git 
a/lang/R/tests/data-tests/bad_germline_set.json b/lang/R/tests/data-tests/bad_germline_set.json index 0aeea9a2f..28531aabb 100644 --- a/lang/R/tests/data-tests/bad_germline_set.json +++ b/lang/R/tests/data-tests/bad_germline_set.json @@ -1,27 +1,71 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species_typo": ["Mouse"], + "species": "Mouse", "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", - "locus": 1, + "locus": "IGH", "allele_descriptions": [ { "allele_description_id": "OGRDB:A00301", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -29,7 +73,7 @@ "aliases": [ "watson_et_al:CAST_EiJ_IGHV5-3" ], - "locus": 1, + "locus": "IGH", "chromosome": null, "sequence_type": "V", "functional": true, @@ -66,7 +110,7 @@ "fwr3_start": 196, "fwr3_end": 312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -177,16 +221,38 @@ "unrearranged_support": [], "rearranged_support": [], "paralogs": [], - "notes": "Imported to OGRDB with the following notes:\r\nwatson_et_al: CAST_EiJ_IGHV5-3", + "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3", "curational_tags": null }, { "allele_description_id": "OGRDB:A00314", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -231,7 +297,7 @@ "fwr3_start": 196, 
"fwr3_end": 312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -342,7 +408,7 @@ "unrearranged_support": [], "rearranged_support": [], "paralogs": [], - "notes": "Imported to OGRDB with the following notes:\r\nwatson_et_al: CAST_EiJ_IGHV8-2", + "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2", "curational_tags": null } ], diff --git a/lang/R/tests/data-tests/bad_repertoire.yaml b/lang/R/tests/data-tests/bad_repertoire.yaml index 57b0b7312..f35355e98 100644 --- a/lang/R/tests/data-tests/bad_repertoire.yaml +++ b/lang/R/tests/data-tests/bad_repertoire.yaml @@ -8,21 +8,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. 
Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -32,7 +50,7 @@ Repertoire: cell_subset: "Naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -56,21 +74,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -80,7 +116,7 @@ Repertoire: cell_subset: "Memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -104,21 +140,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -128,7 +182,7 @@ Repertoire: cell_subset: "Naive CD4+ T cell" cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS diff --git a/lang/R/tests/data-tests/good_combined_airr.json b/lang/R/tests/data-tests/good_combined_airr.json index aa7d52ec1..0ef2106ae 100644 --- a/lang/R/tests/data-tests/good_combined_airr.json +++ b/lang/R/tests/data-tests/good_combined_airr.json @@ -10,13 +10,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. 
Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -27,25 +66,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, - "sex": "F", + "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -65,7 +104,65 @@ "intervention": null, "medical_history": null } - ] + ], + "genotype": { + "receptor_genotype_set": { + "receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }, + "mhc_genotype_set": { + "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66", + "mhc_genotype_list": [ + { + "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7", + 
"mhc_class": "MHC-I", + "mhc_genotyping_method": "pcr_low_resolution", + "mhc_alleles": [ + { + "allele_designation": "01:01", + "gene": { + "id": "MRO-0000046", + "label": "HLA-A" + }, + "reference_set_ref": null + } + ] + } + ] + } + } }, "sample": [ { @@ -73,17 +170,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000788", + "id": "CL:0000788", "label": "naive B cell" }, "cell_phenotype": "expression of CD20 and the absence of CD27", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -115,7 +212,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -169,13 +266,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -186,25 +322,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, - "sex": "F", + "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -232,17 +368,17 @@ "sample_processing_id": null, 
"sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000787", + "id": "CL:0000787", "label": "memory B cell" }, "cell_phenotype": "expression of CD20 and CD27", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -274,7 +410,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -328,13 +464,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. 
Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -345,25 +520,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, - "sex": "F", + "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -391,17 +566,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000895", + "id": "CL:0000895", "label": "naive thymus-derived CD4-positive, alpha-beta T cell" }, 
"cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -433,7 +608,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -479,19 +654,44 @@ } ], + "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -499,15 +699,37 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": 
"investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "aliases": [ "watson_et_al:CAST_EiJ_IGHV5-3" ], @@ -516,7 +738,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -537,18 +762,20 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", + "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", 
+ "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "fwr1_start": 1, - "fwr1_end": 78, - "cdr1_start": 79, - "cdr1_end": 114, - "fwr2_start": 115, - "fwr2_end": 165, - "cdr2_start": 166, - "cdr2_end": 195, - "fwr3_start": 196, - "fwr3_end": 312, - "cdr3_start": 313, - "alignment": [ + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment_labels": [ "1", "2", "3", @@ -665,15 +892,37 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", - "coding_sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", "aliases": [ "watson_et_al:CAST_EiJ_IGHV8-2" ], @@ -682,7 +931,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -703,18 +955,20 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", + "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", "fwr1_start": 1, - "fwr1_end": 78, - "cdr1_start": 79, - "cdr1_end": 114, - "fwr2_start": 115, - "fwr2_end": 165, - "cdr2_start": 166, - "cdr2_end": 195, - "fwr3_start": 196, - "fwr3_end": 312, - "cdr3_start": 313, - "alignment": 
[ + "fwr1_end": 75, + "cdr1_start": 76, + "cdr1_end": 110, + "fwr2_start": 111, + "fwr2_end": 150, + "cdr2_start": 151, + "cdr2_end": 160, + "fwr3_start": 161, + "fwr3_end": 294, + "cdr3_start": 295, + "alignment_labels": [ "1", "2", "3", @@ -831,7 +1085,6 @@ ], "curation": null }], - "GenotypeSet": [{ "receptor_genotype_set_id": "1", "genotype_class_list": [ diff --git a/lang/R/tests/data-tests/good_combined_airr.yaml b/lang/R/tests/data-tests/good_combined_airr.yaml index f4fdcb0ef..2c9ab547c 100644 --- a/lang/R/tests/data-tests/good_combined_airr.yaml +++ b/lang/R/tests/data-tests/good_combined_airr.yaml @@ -21,13 +21,35 @@ Repertoire: a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -36,13 +58,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -67,20 +89,54 @@ Repertoire: immunogen: intervention: medical_history: + genotype: + receptor_genotype_set: + receptor_genotype_set_id: "1" + genotype_class_list: + - receptor_genotype_id: "1" + locus: IGH + documented_alleles: + - label: IGHV1-69*01 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + - label: IGHV1-69*02 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 2 + undocumented_alleles: + - allele_name: IGHD3-1*01_S1234 + sequence: agtagtagtagt + phasing: 1 + deleted_genes: + - label: IGHV3-30-3 + germline_set_ref: IMGT:Homo sapiens:2022.1.31 + phasing: 1 + inference_process: repertoire_sequencing + mhc_genotype_set: + mhc_genotype_set_id: 01847298-d0c2-11ee-bc66 + mhc_genotype_list: + - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7 + mhc_class: MHC-I + mhc_genotyping_method: pcr_low_resolution + mhc_alleles: + - allele_designation: "01:01" + gene: + id: MRO-0000046 + label: HLA-A + reference_set_ref: sample: - sample_id: TW01A_B_naive sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000788 + id: CL:0000788 label: naive B cell cell_phenotype: expression of CD20 and the absence of CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens 
single_cell: false cell_isolation: FACS @@ -164,13 +220,35 @@ Repertoire: a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -179,13 +257,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -215,15 +293,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000787 + id: CL:0000787 label: memory B cell cell_phenotype: expression of CD20 and CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -307,13 +385,35 @@ Repertoire: a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -322,13 +422,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -358,15 +458,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000895 + id: CL:0000895 label: naive thymus-derived CD4-positive, alpha-beta T cell cell_phenotype: expression of CD8 and absence of CD4 and CD45RO cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -431,16 +531,27 @@ Repertoire: GermlineSet: - germline_set_id: OGRDB:G00007 - author: William Lees - 
lab_name: '' - lab_address: Birkbeck College, University of London, Malet Street, London - acknowledgements: [] + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: null + - role: data curation + degree: null release_version: 1 - release_description: '' - release_date: '2021-11-24' + release_description: "" + release_date: "2021-11-24" germline_set_name: CAST IGH germline_set_ref: OGRDB:G00007.1 - pub_ids: [''] + pub_ids: [""] species: id: NCBITAXON:10090 label: Mus musculus @@ -450,15 +561,27 @@ GermlineSet: allele_descriptions: - allele_description_id: OGRDB:A00301 allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF - maintainer: William Lees - acknowledgements: [] - lab_address: Birkbeck College, University of London, Malet Street, London + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: release_version: 1 - release_date: 24-Nov-2021 + release_date: "2021-11-24" release_description: First release label: IGHV-2DBF sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - coding_sequence: 
GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA aliases: - watson_et_al:CAST_EiJ_IGHV5-3 locus: IGH @@ -488,18 +611,20 @@ GermlineSet: v_gene_delineations: - sequence_delineation_id: '1' delineation_scheme: IMGT + unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA fwr1_start: 1 - fwr1_end: 78 - cdr1_start: 79 - cdr1_end: 114 - fwr2_start: 115 - fwr2_end: 165 - cdr2_start: 166 - cdr2_end: 195 - fwr3_start: 196 - fwr3_end: 312 - cdr3_start: 313 - alignment: + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: - '1' - '2' - '3' @@ -611,15 +736,27 @@ GermlineSet: curational_tags: - allele_description_id: OGRDB:A00314 allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO - maintainer: 
William Lees - acknowledgements: [] - lab_address: Birkbeck College, University of London, Malet Street, London + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: release_version: 1 - release_date: 24-Nov-2021 + release_date: "2021-11-24" release_description: First release label: IGHV-2ETO sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC - coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC aliases: - watson_et_al:CAST_EiJ_IGHV8-2 locus: IGH @@ -649,18 +786,20 @@ GermlineSet: v_gene_delineations: - sequence_delineation_id: '1' delineation_scheme: IMGT + unaligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + 
aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC fwr1_start: 1 - fwr1_end: 78 - cdr1_start: 79 - cdr1_end: 114 - fwr2_start: 115 - fwr2_end: 165 - cdr2_start: 166 - cdr2_end: 195 - fwr3_start: 196 - fwr3_end: 312 - cdr3_start: 313 - alignment: + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: - '1' - '2' - '3' @@ -773,9 +912,9 @@ GermlineSet: curation: GenotypeSet: - - receptor_genotype_set_id: '1' + - receptor_genotype_set_id: "1" genotype_class_list: - - receptor_genotype_id: '1' + - receptor_genotype_id: "1" locus: IGH documented_alleles: - label: IGHV1-69*01 diff --git a/lang/R/tests/data-tests/good_genotype_set.json b/lang/R/tests/data-tests/good_genotype_set.json index 4335b02e1..abd24646c 100644 --- a/lang/R/tests/data-tests/good_genotype_set.json +++ b/lang/R/tests/data-tests/good_genotype_set.json @@ -1,38 +1,38 @@ { - "GenotypeSet": [{ - "receptor_genotype_set_id": "1", - "genotype_class_list": [ - { - "receptor_genotype_id": "1", - "locus": "IGH", - "documented_alleles": [ - { - "label": "IGHV1-69*01", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 1 - }, - { - "label": "IGHV1-69*02", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 2 - } - ], - "undocumented_alleles": [ - { - "allele_name": "IGHD3-1*01_S1234", - "sequence": "agtagtagtagt", - "phasing": 1 - } - ], - "deleted_genes": [ - { - "label": "IGHV3-30-3", - "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", - "phasing": 1 - } - ], - "inference_process": "repertoire_sequencing" - } - ] - }] -} \ No newline at end of file + "GenotypeSet": [{ + 
"receptor_genotype_set_id": "1", + "genotype_class_list": [ + { + "receptor_genotype_id": "1", + "locus": "IGH", + "documented_alleles": [ + { + "label": "IGHV1-69*01", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + }, + { + "label": "IGHV1-69*02", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 2 + } + ], + "undocumented_alleles": [ + { + "allele_name": "IGHD3-1*01_S1234", + "sequence": "agtagtagtagt", + "phasing": 1 + } + ], + "deleted_genes": [ + { + "label": "IGHV3-30-3", + "germline_set_ref": "IMGT:Homo sapiens:2022.1.31", + "phasing": 1 + } + ], + "inference_process": "repertoire_sequencing" + } + ] + }] +} diff --git a/lang/R/tests/data-tests/good_germline_set.json b/lang/R/tests/data-tests/good_germline_set.json index 41ecf5f7d..e74c590dc 100644 --- a/lang/R/tests/data-tests/good_germline_set.json +++ b/lang/R/tests/data-tests/good_germline_set.json @@ -1,17 +1,41 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -19,11 +43,33 @@ { "allele_description_id": 
"OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -36,7 +82,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -70,7 +119,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -187,11 +236,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + 
"id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -204,7 +275,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -238,7 +312,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/lang/R/tests/data-tests/good_repertoire.yaml b/lang/R/tests/data-tests/good_repertoire.yaml index c935c9b67..6adaa2361 100644 --- a/lang/R/tests/data-tests/good_repertoire.yaml +++ b/lang/R/tests/data-tests/good_repertoire.yaml @@ -11,28 +11,50 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null - keywords_study: + keywords_study: - "contains_ig" - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -63,15 +85,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000788" + id: "CL:0000788" label: "naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -82,7 +104,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905656 + sequencing_data_id: SRA:SRR2905656 file_type: fastq filename: SRR2905656_R1.fastq.gz read_direction: forward @@ -90,6 +112,8 @@ Repertoire: paired_filename: SRR2905656_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905656_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null @@ -142,13 +166,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. 
However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -157,13 +203,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -194,15 +240,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000787" + id: "CL:0000787" label: "memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -213,7 +259,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905655 + sequencing_data_id: SRA:SRR2905655 file_type: fastq filename: SRR2905655_R1.fastq.gz read_direction: forward @@ -221,6 +267,8 @@ Repertoire: paired_filename: SRR2905655_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905655_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null @@ -273,13 +321,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. 
However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -288,13 +358,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" - sex: F + sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -325,15 +395,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000895" + id: "CL:0000895" label: "naive thymus-derived CD4-positive, alpha-beta T cell" cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -344,7 +414,7 @@ Repertoire: reverse_pcr_primer_target_location: null sequencing_platform: "Illumina MiSeq" sequencing_files: - sequencing_data_id: SRR2905659 + sequencing_data_id: SRA:SRR2905659 file_type: fastq filename: SRR2905659_R1.fastq.gz read_direction: forward @@ -352,6 +422,8 @@ Repertoire: paired_filename: SRR2905659_R2.fastq.gz paired_read_direction: reverse paired_read_length: 300 + index_filename: SRR2905659_R3.fastq.gz + index_length: 8 anatomic_site: null disease_state_sample: null collection_time_point_relative: null diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml index 87a25b5bd..dd2c0c241 100644 --- a/lang/python/airr/specs/airr-schema.yaml +++ b/lang/python/airr/specs/airr-schema.yaml @@ -1571,10 +1571,7 @@ Study: - 
study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: diff --git a/lang/python/tests/data/bad_genotype_set.json b/lang/python/tests/data/bad_genotype_set.json index c58a39027..01709d60a 100644 --- a/lang/python/tests/data/bad_genotype_set.json +++ b/lang/python/tests/data/bad_genotype_set.json @@ -41,4 +41,4 @@ } ] }] -} \ No newline at end of file +} diff --git a/lang/python/tests/data/bad_germline_set.json b/lang/python/tests/data/bad_germline_set.json index 168cc1fa5..28531aabb 100644 --- a/lang/python/tests/data/bad_germline_set.json +++ b/lang/python/tests/data/bad_germline_set.json @@ -1,27 +1,71 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": ["Mouse"], + "species": "Mouse", "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", "allele_descriptions": [ { "allele_description_id": "OGRDB:A00301", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + 
}, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -66,7 +110,7 @@ "fwr3_start": 196, "fwr3_end": 312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -182,11 +226,33 @@ }, { "allele_description_id": "OGRDB:A00314", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -231,7 +297,7 @@ "fwr3_start": 196, "fwr3_end": 
312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/lang/python/tests/data/bad_repertoire.yaml b/lang/python/tests/data/bad_repertoire.yaml index 2de377cb3..f35355e98 100644 --- a/lang/python/tests/data/bad_repertoire.yaml +++ b/lang/python/tests/data/bad_repertoire.yaml @@ -8,21 +8,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. 
Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -32,7 +50,7 @@ Repertoire: cell_subset: "Naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -56,21 +74,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -80,7 +116,7 @@ Repertoire: cell_subset: "Memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -104,21 +140,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -128,7 +182,7 @@ Repertoire: cell_subset: "Naive CD4+ T cell" cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS diff --git a/lang/python/tests/data/good_combined_airr.json b/lang/python/tests/data/good_combined_airr.json index 9101b24a9..0ef2106ae 100644 --- a/lang/python/tests/data/good_combined_airr.json +++ b/lang/python/tests/data/good_combined_airr.json @@ -10,13 +10,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. 
Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -27,25 +66,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -104,10 +143,10 @@ ] }, "mhc_genotype_set": { - "mhc_genotype_set_id": "this is a unique identifier", + "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66", "mhc_genotype_list": [ { - "mhc_genotype_id": "unique", + "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7", "mhc_class": "MHC-I", "mhc_genotyping_method": "pcr_low_resolution", "mhc_alleles": [ @@ -117,7 +156,7 @@ "id": "MRO-0000046", "label": "HLA-A" }, - "reference_set_ref": "blah" + "reference_set_ref": null } ] } @@ -131,17 +170,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000788", + "id": "CL:0000788", "label": "naive B cell" }, "cell_phenotype": "expression of CD20 and the absence of CD27", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, 
@@ -173,7 +212,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -227,13 +266,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. 
Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -244,25 +322,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -290,17 +368,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000787", + "id": "CL:0000787", "label": "memory B cell" }, "cell_phenotype": "expression of CD20 and CD27", 
"cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -332,7 +410,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -386,13 +464,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. 
Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -403,25 +520,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -449,17 +566,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000895", + "id": "CL:0000895", "label": "naive thymus-derived CD4-positive, alpha-beta T cell" }, "cell_phenotype": 
"expression of CD8 and absence of CD4 and CD45RO", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -491,7 +608,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -540,17 +657,41 @@ "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -558,15 +699,37 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + 
}, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", - "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "aliases": [ "watson_et_al:CAST_EiJ_IGHV5-3" ], @@ -575,7 +738,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -596,8 +762,8 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", + "unaligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "fwr1_start": 1, "fwr1_end": 75, "cdr1_start": 76, @@ -609,7 +775,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -726,11 +892,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -743,7 +931,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -764,8 +955,8 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", - "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aligned_sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", "fwr1_start": 1, "fwr1_end": 75, "cdr1_start": 76, @@ -777,7 +968,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/lang/python/tests/data/good_combined_airr.yaml b/lang/python/tests/data/good_combined_airr.yaml index 80d0fe3a2..2c9ab547c 100644 --- a/lang/python/tests/data/good_combined_airr.yaml +++ b/lang/python/tests/data/good_combined_airr.yaml @@ -21,13 +21,35 @@ Repertoire: a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -36,13 +58,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -69,9 +91,9 @@ Repertoire: medical_history: genotype: receptor_genotype_set: - receptor_genotype_set_id: '1' + receptor_genotype_set_id: "1" genotype_class_list: - - receptor_genotype_id: '1' + - receptor_genotype_id: "1" locus: IGH documented_alleles: - label: IGHV1-69*01 @@ -90,31 +112,31 @@ Repertoire: phasing: 1 inference_process: repertoire_sequencing mhc_genotype_set: - mhc_genotype_set_id: "this is a unique identifier" + mhc_genotype_set_id: 01847298-d0c2-11ee-bc66 mhc_genotype_list: - - mhc_genotype_id: unique + - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7 mhc_class: MHC-I mhc_genotyping_method: pcr_low_resolution mhc_alleles: - allele_designation: "01:01" gene: - id: "MRO-0000046" - label: "HLA-A" - reference_set_ref: blah + id: MRO-0000046 + label: HLA-A + reference_set_ref: sample: - sample_id: TW01A_B_naive sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000788 + id: CL:0000788 label: naive B cell cell_phenotype: expression of CD20 and the absence of CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -198,13 +220,35 @@ Repertoire: a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -213,13 +257,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -249,15 +293,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000787 + id: CL:0000787 label: memory B cell cell_phenotype: expression of CD20 and CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -341,13 +385,35 @@ Repertoire: a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -356,13 +422,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -392,15 +458,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000895 + id: CL:0000895 label: naive thymus-derived CD4-positive, alpha-beta T cell cell_phenotype: expression of CD8 and absence of CD4 and CD45RO cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -464,357 +530,391 @@ Repertoire: analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 GermlineSet: -- 
acknowledgements: [] - allele_descriptions: - - acknowledgements: [] - aliases: - - watson_et_al:CAST_EiJ_IGHV5-3 - allele_description_id: OGRDB:A00301 - allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF - allele_designation: null - chromosome: null - coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3' - curational_tags: null - functional: true - gene_designation: null - gene_end: null - gene_start: null - inference_type: rearranged_only - lab_address: Birkbeck College, University of London, Malet Street, London - label: IGHV-2DBF - leader_1_end: null - leader_1_start: null - leader_2_end: null - leader_2_start: null - locus: IGH - maintainer: William Lees - paralogs: [] - rearranged_support: [] - release_date: 24-Nov-2021 - release_description: First release + - germline_set_id: OGRDB:G00007 + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: null + - role: data curation + degree: null release_version: 1 - sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - sequence_type: V + release_description: "" + release_date: "2021-11-24" + germline_set_name: CAST IGH + germline_set_ref: OGRDB:G00007.1 + pub_ids: [""] species: id: 
NCBITAXON:10090 label: Mus musculus species_subgroup: CAST_EiJ species_subgroup_type: strain - status: active - subgroup_designation: null - unrearranged_support: [] - utr_5_prime_end: null - utr_5_prime_start: null - v_gene_delineations: - - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - alignment: - - '1' - - '2' - - '3' - - '4' - - '5' - - '6' - - '7' - - '8' - - '9' - - '10' - - '11' - - '12' - - '13' - - '14' - - '15' - - '16' - - '17' - - '18' - - '19' - - '20' - - '21' - - '22' - - '23' - - '24' - - '25' - - '26' - - '27' - - '28' - - '29' - - '30' - - '31' - - '32' - - '33' - - '34' - - '35' - - '36' - - '37' - - '38' - - '39' - - '40' - - '41' - - '42' - - '43' - - '44' - - '45' - - '46' - - '47' - - '48' - - '49' - - '50' - - '51' - - '52' - - '53' - - '54' - - '55' - - '56' - - '57' - - '58' - - '59' - - '60' - - '61' - - '62' - - '63' - - '64' - - '65' - - '66' - - '67' - - '68' - - '69' - - '70' - - '71' - - '72' - - '73' - - '74' - - '75' - - '76' - - '77' - - '78' - - '79' - - '80' - - '81' - - '82' - - '83' - - '84' - - '85' - - '86' - - '87' - - '88' - - '89' - - '90' - - '91' - - '92' - - '93' - - '94' - - '95' - - '96' - - '97' - - '98' - - '99' - - '100' - - '101' - - '102' - - '103' - - '104' - cdr1_end: 110 - cdr1_start: 76 - cdr2_end: 160 - cdr2_start: 151 - cdr3_start: 295 - delineation_scheme: IMGT - fwr1_end: 75 - fwr1_start: 1 - fwr2_end: 150 - fwr2_start: 111 - fwr3_end: 294 - fwr3_start: 161 - sequence_delineation_id: '1' - unaligned_sequence: 
GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - v_rs_end: null - v_rs_start: null - - acknowledgements: [] - aliases: - - watson_et_al:CAST_EiJ_IGHV8-2 - allele_description_id: OGRDB:A00314 - allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO - allele_designation: null - chromosome: null - coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC - curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2' - curational_tags: null - functional: true - gene_designation: null - gene_end: null - gene_start: null - inference_type: rearranged_only - lab_address: Birkbeck College, University of London, Malet Street, London - label: IGHV-2ETO - leader_1_end: null - leader_1_start: null - leader_2_end: null - leader_2_start: null locus: IGH - maintainer: William Lees - paralogs: [] - rearranged_support: [] - release_date: 24-Nov-2021 - release_description: First release - release_version: 1 - sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC - sequence_type: V - species: - id: NCBITAXON:10090 - label: Mus musculus - species_subgroup: CAST_EiJ - species_subgroup_type: strain - status: active - subgroup_designation: null - unrearranged_support: [] - utr_5_prime_end: null - utr_5_prime_start: null 
- v_gene_delineations: - - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - alignment: - - '1' - - '2' - - '3' - - '4' - - '5' - - '6' - - '7' - - '8' - - '9' - - '10' - - '11' - - '12' - - '13' - - '14' - - '15' - - '16' - - '17' - - '18' - - '19' - - '20' - - '21' - - '22' - - '23' - - '24' - - '25' - - '26' - - '27' - - '28' - - '29' - - '30' - - '31' - - '32' - - '33' - - '34' - - '35' - - '36' - - '37' - - '38' - - '39' - - '40' - - '41' - - '42' - - '43' - - '44' - - '45' - - '46' - - '47' - - '48' - - '49' - - '50' - - '51' - - '52' - - '53' - - '54' - - '55' - - '56' - - '57' - - '58' - - '59' - - '60' - - '61' - - '62' - - '63' - - '64' - - '65' - - '66' - - '67' - - '68' - - '69' - - '70' - - '71' - - '72' - - '73' - - '74' - - '75' - - '76' - - '77' - - '78' - - '79' - - '80' - - '81' - - '82' - - '83' - - '84' - - '85' - - '86' - - '87' - - '88' - - '89' - - '90' - - '91' - - '92' - - '93' - - '94' - - '95' - - '96' - - '97' - - '98' - - '99' - - '100' - - '101' - - '102' - - '103' - - '104' - cdr1_end: 110 - cdr1_start: 76 - cdr2_end: 160 - cdr2_start: 151 - cdr3_start: 295 - delineation_scheme: IMGT - fwr1_end: 75 - fwr1_start: 1 - fwr2_end: 150 - fwr2_start: 111 - fwr3_end: 294 - fwr3_start: 161 - sequence_delineation_id: '1' - unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - v_rs_end: null - v_rs_start: null - author: William Lees - curation: null - germline_set_id: OGRDB:G00007 - 
germline_set_name: CAST IGH - germline_set_ref: OGRDB:G00007.1 - lab_address: Birkbeck College, University of London, Malet Street, London - lab_name: '' - locus: IGH - pub_ids: [''] - release_date: '2021-11-24' - release_description: '' - release_version: 1 - species: - id: NCBITAXON:10090 - label: Mus musculus - species_subgroup: CAST_EiJ - species_subgroup_type: strain - + allele_descriptions: + - allele_description_id: OGRDB:A00301 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: + release_version: 1 + release_date: "2021-11-24" + release_description: First release + label: IGHV-2DBF + sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + aliases: + - watson_et_al:CAST_EiJ_IGHV5-3 + locus: IGH + chromosome: + sequence_type: V + functional: true + inference_type: rearranged_only + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + status: active + gene_designation: + subgroup_designation: + allele_designation: + gene_start: + gene_end: + utr_5_prime_start: + utr_5_prime_end: + leader_1_start: + leader_1_end: + leader_2_start: + 
leader_2_end: + v_rs_start: + v_rs_end: + v_gene_delineations: + - sequence_delineation_id: '1' + delineation_scheme: IMGT + unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + fwr1_start: 1 + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + unrearranged_support: [] + rearranged_support: [] + paralogs: [] + curation: 'Imported 
to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3' + curational_tags: + - allele_description_id: OGRDB:A00314 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: + release_version: 1 + release_date: "2021-11-24" + release_description: First release + label: IGHV-2ETO + sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + aliases: + - watson_et_al:CAST_EiJ_IGHV8-2 + locus: IGH + chromosome: + sequence_type: V + functional: true + inference_type: rearranged_only + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + status: active + gene_designation: + subgroup_designation: + allele_designation: + gene_start: + gene_end: + utr_5_prime_start: + utr_5_prime_end: + leader_1_start: + leader_1_end: + leader_2_start: + leader_2_end: + v_rs_start: + v_rs_end: + v_gene_delineations: + - sequence_delineation_id: '1' + delineation_scheme: IMGT + unaligned_sequence: 
CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + fwr1_start: 1 + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + unrearranged_support: [] + rearranged_support: [] + paralogs: [] + curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2' + curational_tags: + curation: GenotypeSet: - - 
receptor_genotype_set_id: '1' + - receptor_genotype_set_id: "1" genotype_class_list: - - receptor_genotype_id: '1' + - receptor_genotype_id: "1" locus: IGH documented_alleles: - label: IGHV1-69*01 diff --git a/lang/python/tests/data/good_genotype_set.json b/lang/python/tests/data/good_genotype_set.json index ba10f56e9..abd24646c 100644 --- a/lang/python/tests/data/good_genotype_set.json +++ b/lang/python/tests/data/good_genotype_set.json @@ -35,4 +35,4 @@ } ] }] -} \ No newline at end of file +} diff --git a/lang/python/tests/data/good_germline_set.json b/lang/python/tests/data/good_germline_set.json index 41ecf5f7d..e74c590dc 100644 --- a/lang/python/tests/data/good_germline_set.json +++ b/lang/python/tests/data/good_germline_set.json @@ -1,17 +1,41 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -19,11 +43,33 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, 
University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -36,7 +82,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -70,7 +119,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -187,11 +236,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + 
"degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -204,7 +275,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -238,7 +312,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/lang/python/tests/data/good_repertoire.yaml b/lang/python/tests/data/good_repertoire.yaml index 9bf3a4653..6adaa2361 100644 --- a/lang/python/tests/data/good_repertoire.yaml +++ b/lang/python/tests/data/good_repertoire.yaml @@ -11,28 +11,50 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. 
We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null - keywords_study: + keywords_study: - "contains_ig" - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -63,15 +85,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000788" + id: "CL:0000788" label: "naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -144,13 +166,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. 
We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -159,13 +203,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -196,15 +240,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000787" + id: "CL:0000787" label: "memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -277,13 +321,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. 
Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -292,13 +358,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -329,15 +395,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000895" + id: "CL:0000895" label: "naive thymus-derived CD4-positive, alpha-beta T cell" cell_phenotype: "expression of 
CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml index 1ae5ad012..d6c6d48e2 100644 --- a/specs/airr-schema-openapi3.yaml +++ b/specs/airr-schema-openapi3.yaml @@ -1667,10 +1667,7 @@ Study: - study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml index 87a25b5bd..dd2c0c241 100644 --- a/specs/airr-schema.yaml +++ b/specs/airr-schema.yaml @@ -1571,10 +1571,7 @@ Study: - study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: From 6a137b5b225fa47947090d2e37f678f78c488a88 Mon Sep 17 00:00:00 2001 From: Christian Busse Date: Thu, 22 Feb 2024 04:43:50 +0100 Subject: [PATCH 13/15] Add routine to validate class of an ontology object --- lang/R/R/Interface.R | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/lang/R/R/Interface.R b/lang/R/R/Interface.R index 2b88e3801..ed7756f3d 100644 --- a/lang/R/R/Interface.R +++ b/lang/R/R/Interface.R @@ -645,7 +645,7 @@ validate_airr <- function(data, model=TRUE, each=FALSE) { validate_entry <- function(entry, schema) { schema_name <- schema@definition valid <- TRUE - + # Check all required fields exist missing_fields <- setdiff(schema@required, names(entry)) @@ -664,8 +664,15 @@ validate_entry <- function(entry, schema) { # in this case the type on the 1st level is NULL if (is.na(schema[f][["type"]]) || is.null(schema[f][["type"]])) { if (!is.null(reference_schemes)) { - v <- validate_entry(entry[[f]], schema=reference_schemes) - if (!v) { valid <- FALSE } + # check whether an ontology is a list, before recursing into it. 
+ if (reference_schemes@definition == "Ontology" & class(entry[[f]]) != "list") { + valid <- FALSE + warning(paste("Warning: Property", paste(schema_name, ".", f, sep=""), + "should be an ontology but is of class", class(entry[[f]]), "\n")) + } else { + v <- validate_entry(entry[[f]], schema=reference_schemes) + if (!v) { valid <- FALSE } + } } # entry of array type with a list of on or several reference schemes } else if (schema[f][["type"]] == "array" & !is.null(reference_schemes)) { From ca4a564fef59853f39f2a7b5504bf7fa4ea35fe3 Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Mon, 26 Feb 2024 16:13:12 -0600 Subject: [PATCH 14/15] update openapi3 spec in lang directories --- lang/R/inst/extdata/airr-schema-openapi3.yaml | 253 +++++++++++------- .../airr/specs/airr-schema-openapi3.yaml | 253 +++++++++++------- 2 files changed, 318 insertions(+), 188 deletions(-) diff --git a/lang/R/inst/extdata/airr-schema-openapi3.yaml b/lang/R/inst/extdata/airr-schema-openapi3.yaml index bba3a45d8..d6c6d48e2 100644 --- a/lang/R/inst/extdata/airr-schema-openapi3.yaml +++ b/lang/R/inst/extdata/airr-schema-openapi3.yaml @@ -368,7 +368,7 @@ DataFile: RepertoireGroup: type: array nullable: false - description: List of repertoire collections + description: List of repertoire groups items: $ref: '#/RepertoireGroup' Rearrangement: @@ -486,34 +486,111 @@ TimePoint: # General objects # -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + nullable: true + description: Unique identifier of this contributor within the file x-airr: identifier: true miairr: important - nullable: true name: type: string 
+ nullable: false + description: Full name of contributor + orcid_id: + $ref: '#/Ontology' nullable: true - description: Full name of individual - institution_name: - type: string + description: > + ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. + title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' nullable: true - description: Individual's department and institution name - orcid_id: + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. + title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: type: string nullable: true - description: Individual's ORCID identifier + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. 
+ example: Department for Psychoceramics + contributions: + type: array + nullable: true + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + +ContributorContribution: + type: object + required: + - role + properties: + role: + type: string + nullable: false + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + degree: + type: string + nullable: true + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. + enum: + - lead + - equal + - supporting + # # Germline gene schema @@ -809,8 +886,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -838,24 +914,16 @@ AlleleDescription: miairr: important description: Unique reference to the allele description, in standardized form (Repo:Label:Version) example: OGRDB:Human_IGH:IGHV1-69*01.001 - maintainer: - type: string - nullable: true - x-airr: - miairr: defined - description: Maintainer of this sequence record acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. 
items: - $ref: '#/Acknowledgement' - lab_address: - type: string - nullable: true - x-airr: - miairr: defined - description: Institution and full address of corresponding author + $ref: '#/Contributor' release_version: type: integer nullable: true @@ -1148,9 +1216,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1169,30 +1235,16 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - nullable: true - x-airr: - miairr: important - description: Corresponding author - lab_name: - type: string - nullable: true - x-airr: - miairr: important - description: Department of corresponding author - lab_address: - type: string - nullable: true - x-airr: - miairr: important - description: Institutional address of corresponding author acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. 
items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number nullable: true @@ -1615,10 +1667,7 @@ Study: - study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: @@ -1702,17 +1751,36 @@ Study: set: 1 subset: study name: Grant funding agency + contributors: + type: array + nullable: false + description: > + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. + title: Contributors + items: + $ref: '#/Contributor' + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: study + name: Contributors study_contact: type: string nullable: true description: > Full contact information of the contact persons for this study This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - adc-query-support: true - name: Contact information (study) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors collected_by: type: string nullable: true @@ -1720,38 +1788,35 @@ Study: Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (data collection) - example: Dr. P. 
Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_name: type: string nullable: true description: Department of data collector - title: Lab name - example: Department for Planar Immunology x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab name + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_address: type: string nullable: true description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab address + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors submitted_by: type: string nullable: true @@ -1759,14 +1824,13 @@ Study: Full contact information of the data depositor, i.e., the person submitting the data to a repository. This should include an e-mail address and a persistent identifier such as an ORCID ID. This is supposed to be a short-lived and technical role until the submission is relased. 
- title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Contact information (data deposition) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors pub_ids: type: array items: @@ -3298,7 +3362,8 @@ Repertoire: x-airr: adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3308,22 +3373,22 @@ RepertoireGroup: repertoire_group_id: type: string nullable: true - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: identifier: true repertoire_group_name: type: string nullable: true - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string nullable: true - description: Repertoire collection description + description: Repertoire group description repertoires: type: array nullable: true description: > - List of repertoires in this collection with an associated description and time point designation + List of repertoires in this group with an associated description and time point designation items: type: object properties: diff --git a/lang/python/airr/specs/airr-schema-openapi3.yaml b/lang/python/airr/specs/airr-schema-openapi3.yaml index bba3a45d8..d6c6d48e2 100644 --- a/lang/python/airr/specs/airr-schema-openapi3.yaml +++ b/lang/python/airr/specs/airr-schema-openapi3.yaml @@ -368,7 +368,7 @@ DataFile: RepertoireGroup: 
type: array nullable: false - description: List of repertoire collections + description: List of repertoire groups items: $ref: '#/RepertoireGroup' Rearrangement: @@ -486,34 +486,111 @@ TimePoint: # General objects # -# An individual -Acknowledgement: +# Contributor record to describe invididuals and their contribution to a data set +# +Contributor: description: Individual whose contribution to this work should be acknowledged type: object required: - - acknowledgement_id + - contributor_id - name - - institution_name properties: - acknowledgement_id: + contributor_id: type: string - description: unique identifier of this Acknowledgement within the file + nullable: true + description: Unique identifier of this contributor within the file x-airr: identifier: true miairr: important - nullable: true name: type: string + nullable: false + description: Full name of contributor + orcid_id: + $ref: '#/Ontology' nullable: true - description: Full name of individual - institution_name: - type: string + description: > + ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take + precedence over the name reported in the `name` property. + title: ORCID iD + example: + id: ORCID:0000-0002-1825-0097 + label: Josiah Carberry + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation: + $ref: '#/Ontology' nullable: true - description: Individual's department and institution name - orcid_id: + description: > + ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not + from individuals institutes, divisions or departments. 
+ title: ROR + example: + id: ROR:05h7xva58 + label: Wesleyan University + x-airr: + adc-query-support: true + format: ontology + ontology: + draft: false + top_node: + id: null + label: null + affiliation_department: type: string nullable: true - description: Individual's ORCID identifier + description: > + Additional information regarding the contributor's primary affiliation. Can be used to specify + individual institutes, divisions or departments. + example: Department for Psychoceramics + contributions: + type: array + nullable: true + description: List of all roles the contributor had in a project + items: + $ref: '#/ContributorContribution' + +ContributorContribution: + type: object + required: + - role + properties: + role: + type: string + nullable: false + description: Role according to CRediT taxonomy + enum: + - conceptualization + - data curation + - formal analysis + - funding acquisition + - investigation + - methodology + - project administration + - resources + - software + - supervision + - validation + - visualization + - writing - original draft + - writing - review & editing + degree: + type: string + nullable: true + description: > + Optional specification of the degree of contribution, should be used if multiple individuals serve + the same role. 
+ enum: + - lead + - equal + - supporting + # # Germline gene schema @@ -809,8 +886,7 @@ AlleleDescription: type: object required: - allele_description_id - - maintainer - - lab_address + - acknowledgements - release_version - release_date - release_description @@ -838,24 +914,16 @@ AlleleDescription: miairr: important description: Unique reference to the allele description, in standardized form (Repo:Label:Version) example: OGRDB:Human_IGH:IGHV1-69*01.001 - maintainer: - type: string - nullable: true - x-airr: - miairr: defined - description: Maintainer of this sequence record acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the gene description should be acknowledged + description: > + List of individuals whose contribution to the gene description should be acknowledged. Note that these + are not necessarily identical with the authors on an associated manuscript or other scholarly + communication. Further note that typically at least the three CRediT contributor roles "supervision", + "investigation" and "data curation" should be assigned. The current maintainer should be listed first. items: - $ref: '#/Acknowledgement' - lab_address: - type: string - nullable: true - x-airr: - miairr: defined - description: Institution and full address of corresponding author + $ref: '#/Contributor' release_version: type: integer nullable: true @@ -1148,9 +1216,7 @@ GermlineSet: All genes in a GermlineSet should be from a single locus. 
required: - germline_set_id - - author - - lab_name - - lab_address + - acknowledgements - release_version - release_description - release_date @@ -1169,30 +1235,16 @@ GermlineSet: x-airr: identifier: true miairr: important - author: - type: string - nullable: true - x-airr: - miairr: important - description: Corresponding author - lab_name: - type: string - nullable: true - x-airr: - miairr: important - description: Department of corresponding author - lab_address: - type: string - nullable: true - x-airr: - miairr: important - description: Institutional address of corresponding author acknowledgements: type: array nullable: true - description: List of individuals whose contribution to the germline set should be acknowledged + description: > + List of individuals whose contribution to the germline set should be acknowledged. Note that these are + not necessarily identical with the authors on an associated manuscript or other scholarly communication. + Further note that typically at least the three CRediT contributor roles "supervision", "investigation" + and "data curation" should be assigned. The coresponding author should be listed last. items: - $ref: '#/Acknowledgement' + $ref: '#/Contributor' release_version: type: number nullable: true @@ -1615,10 +1667,7 @@ Study: - study_type - inclusion_exclusion_criteria - grants - - collected_by - - lab_name - - lab_address - - submitted_by + - contributors - pub_ids - keywords_study properties: @@ -1702,17 +1751,36 @@ Study: set: 1 subset: study name: Grant funding agency + contributors: + type: array + nullable: false + description: > + List of individuals who contributed to the study. Note that these are not necessarily identical with + the authors on an associated manuscript or other scholarly communication. Further note that typically + at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should + be assigned. The coresponding author should be listed last. 
+ title: Contributors + items: + $ref: '#/Contributor' + x-airr: + miairr: essential + adc-query-support: true + set: 1 + subset: study + name: Contributors study_contact: type: string nullable: true description: > Full contact information of the contact persons for this study This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (study) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - adc-query-support: true - name: Contact information (study) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors collected_by: type: string nullable: true @@ -1720,38 +1788,35 @@ Study: Full contact information of the data collector, i.e. the person who is legally responsible for data collection and release. This should include an e-mail address and a persistent identifier such as an ORCID ID. - title: Contact information (data collection) - example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Contact information (data collection) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors lab_name: type: string nullable: true description: Department of data collector - title: Lab name - example: Department for Planar Immunology x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab name + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors lab_address: type: string nullable: true description: Institution and institutional address of data collector - title: Lab address - example: School of Medicine, Unseen University, Ankh-Morpork, Disk World x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Lab address + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. + deprecated-replaced-by: + - contributors submitted_by: type: string nullable: true @@ -1759,14 +1824,13 @@ Study: Full contact information of the data depositor, i.e., the person submitting the data to a repository. This should include an e-mail address and a persistent identifier such as an ORCID ID. This is supposed to be a short-lived and technical role until the submission is relased. - title: Contact information (data deposition) - example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097 x-airr: - miairr: important - adc-query-support: true - set: 1 - subset: study - name: Contact information (data deposition) + deprecated: true + deprecated-description: > + Acknowledgements and contact information was re-organized into the contributors property, which + is an array of Contributor objects. 
+ deprecated-replaced-by: + - contributors pub_ids: type: array items: @@ -3298,7 +3362,8 @@ Repertoire: x-airr: adc-query-support: true -# A collection of repertoires for analysis purposes, includes optional time course +# An ordered group of repertoires for analysis purposes, includes optional time course +# Can be treated as a set if all repertoire_group_id are unique RepertoireGroup: type: object required: @@ -3308,22 +3373,22 @@ RepertoireGroup: repertoire_group_id: type: string nullable: true - description: Identifier for this repertoire collection + description: Identifier for this repertoire group x-airr: identifier: true repertoire_group_name: type: string nullable: true - description: Short display name for this repertoire collection + description: Short display name for this repertoire group repertoire_group_description: type: string nullable: true - description: Repertoire collection description + description: Repertoire group description repertoires: type: array nullable: true description: > - List of repertoires in this collection with an associated description and time point designation + List of repertoires in this group with an associated description and time point designation items: type: object properties: From caff1123914a0e68d8c4a179e85c3434105e8c2b Mon Sep 17 00:00:00 2001 From: Scott Christley Date: Mon, 26 Feb 2024 16:17:37 -0600 Subject: [PATCH 15/15] update tests --- Makefile | 13 +- tests/data/bad_genotype_set.json | 2 +- tests/data/bad_germline_set.json | 94 +++- tests/data/bad_repertoire.yaml | 90 ++- tests/data/good_combined_airr.json | 349 +++++++++--- tests/data/good_combined_airr.yaml | 856 ++++++++++++++++------------- tests/data/good_genotype_set.json | 2 +- tests/data/good_germline_set.json | 108 +++- tests/data/good_repertoire.yaml | 128 +++-- 9 files changed, 1099 insertions(+), 543 deletions(-) diff --git a/Makefile b/Makefile index 151bf2dd4..a18207a95 100644 --- a/Makefile +++ b/Makefile @@ -7,8 +7,8 @@ help: @echo "Helper 
commands for AIRR Standards repository" @echo "" @echo "make gen-v2 -- Generate OpenAPI V2 spec from the V3 spec" - @echo "make docs -- Build documentation" - @echo "make lang-copy -- Copy spec files to language directories" + @echo "make build-docs -- Build documentation" + @echo "make spec-copy -- Copy spec files to language directories" @echo "make data-copy -- Copy test data files to language directories" @echo "make checks -- Run consistency checks on spec files" @echo "make tests -- Run all language test suites" @@ -20,7 +20,10 @@ help: gen-v2: @echo "Not implemented" -lang-copy: +build-docs: + sphinx-build -a -E -b html docs docs/_build/html + +spec-copy: @echo "Copying specs to language directories" cp specs/airr-schema.yaml lang/python/airr/specs cp specs/airr-schema-openapi3.yaml lang/python/airr/specs @@ -30,7 +33,9 @@ lang-copy: # cp specs/airr-schema-openapi3.yaml lang/js/ data-copy: - @echo "Not implemented" + @echo "Copying test data to language directories" + cp tests/data/* lang/python/tests/data + cp tests/data/* lang/R/tests/data-tests checks: @echo "Running consistency checks on spec files" diff --git a/tests/data/bad_genotype_set.json b/tests/data/bad_genotype_set.json index c58a39027..01709d60a 100644 --- a/tests/data/bad_genotype_set.json +++ b/tests/data/bad_genotype_set.json @@ -41,4 +41,4 @@ } ] }] -} \ No newline at end of file +} diff --git a/tests/data/bad_germline_set.json b/tests/data/bad_germline_set.json index 168cc1fa5..28531aabb 100644 --- a/tests/data/bad_germline_set.json +++ b/tests/data/bad_germline_set.json @@ -1,27 +1,71 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": 
"Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": ["Mouse"], + "species": "Mouse", "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", "allele_descriptions": [ { "allele_description_id": "OGRDB:A00301", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -66,7 +110,7 @@ "fwr3_start": 196, "fwr3_end": 312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -182,11 +226,33 @@ }, { "allele_description_id": "OGRDB:A00314", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": 
"William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -231,7 +297,7 @@ "fwr3_start": 196, "fwr3_end": 312, "cdr3_start": 313, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/tests/data/bad_repertoire.yaml b/tests/data/bad_repertoire.yaml index 2de377cb3..f35355e98 100644 --- a/tests/data/bad_repertoire.yaml +++ b/tests/data/bad_repertoire.yaml @@ -8,21 +8,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. 
We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -32,7 +50,7 @@ Repertoire: cell_subset: "Naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -56,21 +74,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. 
However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -80,7 +116,7 @@ Repertoire: cell_subset: "Memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -104,21 +140,39 @@ Repertoire: study_id: PRJNA300878 study_title: "Homo sapiens B and T cell repertoire - MZ twins" study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + - role: "data curation" + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" pub_ids: ["PMID:27005435"] subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 value: year linked_subjects: TW01B link_type: twin @@ -128,7 +182,7 @@ Repertoire: cell_subset: "Naive CD4+ T cell" cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" value: "Homo sapiens" single_cell: false cell_isolation: FACS diff --git a/tests/data/good_combined_airr.json b/tests/data/good_combined_airr.json index 9101b24a9..0ef2106ae 100644 --- a/tests/data/good_combined_airr.json +++ b/tests/data/good_combined_airr.json @@ -10,13 +10,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. 
By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. 
Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -27,25 +66,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -104,10 +143,10 @@ ] }, "mhc_genotype_set": { - "mhc_genotype_set_id": "this is a unique identifier", + "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66", "mhc_genotype_list": [ { - "mhc_genotype_id": "unique", + "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7", "mhc_class": "MHC-I", "mhc_genotyping_method": "pcr_low_resolution", "mhc_alleles": [ @@ -117,7 +156,7 @@ "id": "MRO-0000046", "label": "HLA-A" }, - "reference_set_ref": "blah" + "reference_set_ref": null } ] } @@ -131,17 +170,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000788", + "id": "CL:0000788", "label": "naive B cell" }, "cell_phenotype": "expression of CD20 and the absence of CD27", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, 
@@ -173,7 +212,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -227,13 +266,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. 
Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -244,25 +322,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -290,17 +368,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000787", + "id": "CL:0000787", "label": "memory B cell" }, "cell_phenotype": "expression of CD20 and CD27", 
"cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -332,7 +410,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -386,13 +464,52 @@ "label": null }, "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.", - "study_contact": "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X", "inclusion_exclusion_criteria": null, - "lab_name": "Mark M. 
Davis", - "lab_address": "Stanford University", - "submitted_by": "Florian Rubelt", + "contributors": [ + { + "contributor_id": "1", + "name": "Florian Rubelt", + "orcid_id": { + "id": null, + "label": null + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + }, + { + "contributor_id": "2", + "name": "Mark M. Davis", + "orcid_id": { + "id": "ORCID:0000-0001-6868-657X", + "label": "Mark Davis" + }, + "affiliation": { + "id": "ROR:00f54p054", + "label": "Stanford University" + }, + "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine", + "contributions": [ + { + "role": "supervision", + "degree": null + } + ] + } + ], "pub_ids": ["PMID:27005435"], - "collected_by": null, "grants": null, "keywords_study": [ "contains_ig", @@ -403,25 +520,25 @@ "subject_id": "TW01A", "synthetic": false, "species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "sex": "female", "age_min": 27, "age_max": 27, "age_unit": { - "id": "UO_0000036", + "id": "UO:0000036", "label": "year" }, "age_event": null, "ancestry_population": { - "id": null, - "label": null - }, - "location_birth": { - "id": null, - "label": null - }, + "id": null, + "label": null + }, + "location_birth": { + "id": null, + "label": null + }, "ethnicity": null, "race": null, "strain_name": null, @@ -449,17 +566,17 @@ "sample_processing_id": null, "sample_type": "peripheral venous puncture", "tissue": { - "id": "UBERON_0000178", + "id": "UBERON:0000178", "label": "blood" }, "tissue_processing": "Ficoll gradient", "cell_subset": { - "id": "CL_0000895", + "id": "CL:0000895", "label": "naive thymus-derived CD4-positive, alpha-beta T cell" }, "cell_phenotype": 
"expression of CD8 and absence of CD4 and CD45RO", "cell_species": { - "id": "NCBITaxon_9606", + "id": "NCBITAXON:9606", "label": "Homo sapiens" }, "single_cell": false, @@ -491,7 +608,7 @@ "label": null }, "collection_time_point_reference": null, - "collection_location": { + "collection_location": { "id": null, "label": null }, @@ -540,17 +657,41 @@ "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -558,15 +699,37 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + 
}, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", - "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "aliases": [ "watson_et_al:CAST_EiJ_IGHV5-3" ], @@ -575,7 +738,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -596,8 +762,8 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", + "unaligned_sequence": 
"GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", "fwr1_start": 1, "fwr1_end": 75, "cdr1_start": 76, @@ -609,7 +775,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -726,11 +892,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "3", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department": null, + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -743,7 +931,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -764,8 +955,8 @@ { "sequence_delineation_id": "1", "delineation_scheme": "IMGT", - "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", - "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", + "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", + "aligned_sequence": 
"CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", "fwr1_start": 1, "fwr1_end": 75, "cdr1_start": 76, @@ -777,7 +968,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/tests/data/good_combined_airr.yaml b/tests/data/good_combined_airr.yaml index 80d0fe3a2..2c9ab547c 100644 --- a/tests/data/good_combined_airr.yaml +++ b/tests/data/good_combined_airr.yaml @@ -21,13 +21,35 @@ Repertoire: a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -36,13 +58,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -69,9 +91,9 @@ Repertoire: medical_history: genotype: receptor_genotype_set: - receptor_genotype_set_id: '1' + receptor_genotype_set_id: "1" genotype_class_list: - - receptor_genotype_id: '1' + - receptor_genotype_id: "1" locus: IGH documented_alleles: - label: IGHV1-69*01 @@ -90,31 +112,31 @@ Repertoire: phasing: 1 inference_process: repertoire_sequencing mhc_genotype_set: - mhc_genotype_set_id: "this is a unique identifier" + mhc_genotype_set_id: 01847298-d0c2-11ee-bc66 mhc_genotype_list: - - mhc_genotype_id: unique + - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7 mhc_class: MHC-I mhc_genotyping_method: pcr_low_resolution mhc_alleles: - allele_designation: "01:01" gene: - id: "MRO-0000046" - label: "HLA-A" - reference_set_ref: blah + id: MRO-0000046 + label: HLA-A + reference_set_ref: sample: - sample_id: TW01A_B_naive sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000788 + id: CL:0000788 label: naive B cell cell_phenotype: expression of CD20 and the absence of CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -198,13 +220,35 @@ Repertoire: a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -213,13 +257,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -249,15 +293,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000787 + id: CL:0000787 label: memory B cell cell_phenotype: expression of CD20 and CD27 cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -341,13 +385,35 @@ Repertoire: a single chromosome. 
These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level. - study_contact: Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X inclusion_exclusion_criteria: - lab_name: Mark M. Davis - lab_address: Stanford University - submitted_by: Florian Rubelt + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: + label: + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: + - role: "data curation" + degree: + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: pub_ids: ["PMID:27005435"] - collected_by: grants: keywords_study: - contains_ig @@ -356,13 +422,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: ancestry_population: @@ -392,15 +458,15 @@ Repertoire: sample_processing_id: sample_type: peripheral venous puncture tissue: - id: UBERON_0000178 + id: UBERON:0000178 label: blood tissue_processing: Ficoll gradient cell_subset: - id: CL_0000895 + id: CL:0000895 label: naive thymus-derived CD4-positive, alpha-beta T cell cell_phenotype: expression of CD8 and absence of CD4 and CD45RO cell_species: - id: NCBITaxon_9606 + id: NCBITAXON:9606 label: Homo sapiens single_cell: false cell_isolation: FACS @@ -464,357 +530,391 @@ Repertoire: analysis_provenance_id: 4625424004665971176-242ac11c-0001-012 GermlineSet: -- 
acknowledgements: [] - allele_descriptions: - - acknowledgements: [] - aliases: - - watson_et_al:CAST_EiJ_IGHV5-3 - allele_description_id: OGRDB:A00301 - allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF - allele_designation: null - chromosome: null - coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3' - curational_tags: null - functional: true - gene_designation: null - gene_end: null - gene_start: null - inference_type: rearranged_only - lab_address: Birkbeck College, University of London, Malet Street, London - label: IGHV-2DBF - leader_1_end: null - leader_1_start: null - leader_2_end: null - leader_2_start: null - locus: IGH - maintainer: William Lees - paralogs: [] - rearranged_support: [] - release_date: 24-Nov-2021 - release_description: First release + - germline_set_id: OGRDB:G00007 + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: null + - role: data curation + degree: null release_version: 1 - sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - sequence_type: V + release_description: "" + release_date: "2021-11-24" + germline_set_name: CAST IGH + germline_set_ref: OGRDB:G00007.1 + pub_ids: [""] species: id: 
NCBITAXON:10090 label: Mus musculus species_subgroup: CAST_EiJ species_subgroup_type: strain - status: active - subgroup_designation: null - unrearranged_support: [] - utr_5_prime_end: null - utr_5_prime_start: null - v_gene_delineations: - - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - alignment: - - '1' - - '2' - - '3' - - '4' - - '5' - - '6' - - '7' - - '8' - - '9' - - '10' - - '11' - - '12' - - '13' - - '14' - - '15' - - '16' - - '17' - - '18' - - '19' - - '20' - - '21' - - '22' - - '23' - - '24' - - '25' - - '26' - - '27' - - '28' - - '29' - - '30' - - '31' - - '32' - - '33' - - '34' - - '35' - - '36' - - '37' - - '38' - - '39' - - '40' - - '41' - - '42' - - '43' - - '44' - - '45' - - '46' - - '47' - - '48' - - '49' - - '50' - - '51' - - '52' - - '53' - - '54' - - '55' - - '56' - - '57' - - '58' - - '59' - - '60' - - '61' - - '62' - - '63' - - '64' - - '65' - - '66' - - '67' - - '68' - - '69' - - '70' - - '71' - - '72' - - '73' - - '74' - - '75' - - '76' - - '77' - - '78' - - '79' - - '80' - - '81' - - '82' - - '83' - - '84' - - '85' - - '86' - - '87' - - '88' - - '89' - - '90' - - '91' - - '92' - - '93' - - '94' - - '95' - - '96' - - '97' - - '98' - - '99' - - '100' - - '101' - - '102' - - '103' - - '104' - cdr1_end: 110 - cdr1_start: 76 - cdr2_end: 160 - cdr2_start: 151 - cdr3_start: 295 - delineation_scheme: IMGT - fwr1_end: 75 - fwr1_start: 1 - fwr2_end: 150 - fwr2_start: 111 - fwr3_end: 294 - fwr3_start: 161 - sequence_delineation_id: '1' - unaligned_sequence: 
GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - v_rs_end: null - v_rs_start: null - - acknowledgements: [] - aliases: - - watson_et_al:CAST_EiJ_IGHV8-2 - allele_description_id: OGRDB:A00314 - allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO - allele_designation: null - chromosome: null - coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC - curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2' - curational_tags: null - functional: true - gene_designation: null - gene_end: null - gene_start: null - inference_type: rearranged_only - lab_address: Birkbeck College, University of London, Malet Street, London - label: IGHV-2ETO - leader_1_end: null - leader_1_start: null - leader_2_end: null - leader_2_start: null locus: IGH - maintainer: William Lees - paralogs: [] - rearranged_support: [] - release_date: 24-Nov-2021 - release_description: First release - release_version: 1 - sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC - sequence_type: V - species: - id: NCBITAXON:10090 - label: Mus musculus - species_subgroup: CAST_EiJ - species_subgroup_type: strain - status: active - subgroup_designation: null - unrearranged_support: [] - utr_5_prime_end: null - utr_5_prime_start: null 
- v_gene_delineations: - - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - alignment: - - '1' - - '2' - - '3' - - '4' - - '5' - - '6' - - '7' - - '8' - - '9' - - '10' - - '11' - - '12' - - '13' - - '14' - - '15' - - '16' - - '17' - - '18' - - '19' - - '20' - - '21' - - '22' - - '23' - - '24' - - '25' - - '26' - - '27' - - '28' - - '29' - - '30' - - '31' - - '32' - - '33' - - '34' - - '35' - - '36' - - '37' - - '38' - - '39' - - '40' - - '41' - - '42' - - '43' - - '44' - - '45' - - '46' - - '47' - - '48' - - '49' - - '50' - - '51' - - '52' - - '53' - - '54' - - '55' - - '56' - - '57' - - '58' - - '59' - - '60' - - '61' - - '62' - - '63' - - '64' - - '65' - - '66' - - '67' - - '68' - - '69' - - '70' - - '71' - - '72' - - '73' - - '74' - - '75' - - '76' - - '77' - - '78' - - '79' - - '80' - - '81' - - '82' - - '83' - - '84' - - '85' - - '86' - - '87' - - '88' - - '89' - - '90' - - '91' - - '92' - - '93' - - '94' - - '95' - - '96' - - '97' - - '98' - - '99' - - '100' - - '101' - - '102' - - '103' - - '104' - cdr1_end: 110 - cdr1_start: 76 - cdr2_end: 160 - cdr2_start: 151 - cdr3_start: 295 - delineation_scheme: IMGT - fwr1_end: 75 - fwr1_start: 1 - fwr2_end: 150 - fwr2_start: 111 - fwr3_end: 294 - fwr3_start: 161 - sequence_delineation_id: '1' - unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA - v_rs_end: null - v_rs_start: null - author: William Lees - curation: null - germline_set_id: OGRDB:G00007 - 
germline_set_name: CAST IGH - germline_set_ref: OGRDB:G00007.1 - lab_address: Birkbeck College, University of London, Malet Street, London - lab_name: '' - locus: IGH - pub_ids: [''] - release_date: '2021-11-24' - release_description: '' - release_version: 1 - species: - id: NCBITAXON:10090 - label: Mus musculus - species_subgroup: CAST_EiJ - species_subgroup_type: strain - + allele_descriptions: + - allele_description_id: OGRDB:A00301 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: + release_version: 1 + release_date: "2021-11-24" + release_description: First release + label: IGHV-2DBF + sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + aliases: + - watson_et_al:CAST_EiJ_IGHV5-3 + locus: IGH + chromosome: + sequence_type: V + functional: true + inference_type: rearranged_only + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + status: active + gene_designation: + subgroup_designation: + allele_designation: + gene_start: + gene_end: + utr_5_prime_start: + utr_5_prime_end: + leader_1_start: + leader_1_end: + leader_2_start: + 
leader_2_end: + v_rs_start: + v_rs_end: + v_gene_delineations: + - sequence_delineation_id: '1' + delineation_scheme: IMGT + unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA + fwr1_start: 1 + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + unrearranged_support: [] + rearranged_support: [] + paralogs: [] + curation: 'Imported 
to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3' + curational_tags: + - allele_description_id: OGRDB:A00314 + allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO + acknowledgements: + - contributor_id: "3" + name: William Lees + orcid_id: + id: ORCID:0000-0001-9834-6840 + label: William Lees + affiliation: + id: ROR:02mb95055 + label: Birkbeck, University of London + affiliation_department: + contributions: + - role: investigation + degree: + - role: data curation + degree: + release_version: 1 + release_date: "2021-11-24" + release_description: First release + label: IGHV-2ETO + sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + aliases: + - watson_et_al:CAST_EiJ_IGHV8-2 + locus: IGH + chromosome: + sequence_type: V + functional: true + inference_type: rearranged_only + species: + id: NCBITAXON:10090 + label: Mus musculus + species_subgroup: CAST_EiJ + species_subgroup_type: strain + status: active + gene_designation: + subgroup_designation: + allele_designation: + gene_start: + gene_end: + utr_5_prime_start: + utr_5_prime_end: + leader_1_start: + leader_1_end: + leader_2_start: + leader_2_end: + v_rs_start: + v_rs_end: + v_gene_delineations: + - sequence_delineation_id: '1' + delineation_scheme: IMGT + unaligned_sequence: 
CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC + fwr1_start: 1 + fwr1_end: 75 + cdr1_start: 76 + cdr1_end: 110 + fwr2_start: 111 + fwr2_end: 150 + cdr2_start: 151 + cdr2_end: 160 + fwr3_start: 161 + fwr3_end: 294 + cdr3_start: 295 + alignment_labels: + - '1' + - '2' + - '3' + - '4' + - '5' + - '6' + - '7' + - '8' + - '9' + - '10' + - '11' + - '12' + - '13' + - '14' + - '15' + - '16' + - '17' + - '18' + - '19' + - '20' + - '21' + - '22' + - '23' + - '24' + - '25' + - '26' + - '27' + - '28' + - '29' + - '30' + - '31' + - '32' + - '33' + - '34' + - '35' + - '36' + - '37' + - '38' + - '39' + - '40' + - '41' + - '42' + - '43' + - '44' + - '45' + - '46' + - '47' + - '48' + - '49' + - '50' + - '51' + - '52' + - '53' + - '54' + - '55' + - '56' + - '57' + - '58' + - '59' + - '60' + - '61' + - '62' + - '63' + - '64' + - '65' + - '66' + - '67' + - '68' + - '69' + - '70' + - '71' + - '72' + - '73' + - '74' + - '75' + - '76' + - '77' + - '78' + - '79' + - '80' + - '81' + - '82' + - '83' + - '84' + - '85' + - '86' + - '87' + - '88' + - '89' + - '90' + - '91' + - '92' + - '93' + - '94' + - '95' + - '96' + - '97' + - '98' + - '99' + - '100' + - '101' + - '102' + - '103' + - '104' + unrearranged_support: [] + rearranged_support: [] + paralogs: [] + curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2' + curational_tags: + curation: GenotypeSet: - - 
receptor_genotype_set_id: '1' + - receptor_genotype_set_id: "1" genotype_class_list: - - receptor_genotype_id: '1' + - receptor_genotype_id: "1" locus: IGH documented_alleles: - label: IGHV1-69*01 diff --git a/tests/data/good_genotype_set.json b/tests/data/good_genotype_set.json index ba10f56e9..abd24646c 100644 --- a/tests/data/good_genotype_set.json +++ b/tests/data/good_genotype_set.json @@ -35,4 +35,4 @@ } ] }] -} \ No newline at end of file +} diff --git a/tests/data/good_germline_set.json b/tests/data/good_germline_set.json index 41ecf5f7d..e74c590dc 100644 --- a/tests/data/good_germline_set.json +++ b/tests/data/good_germline_set.json @@ -1,17 +1,41 @@ { "GermlineSet": [{ "germline_set_id": "OGRDB:G00007", - "author": "William Lees", - "lab_name": "", - "lab_address": "Birkbeck College, University of London, Malet Street, London", - "acknowledgements": [], + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, "release_description": "", "release_date": "2021-11-24", "germline_set_name": "CAST IGH", "germline_set_ref": "OGRDB:G00007.1", "pub_ids": [""], - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "locus": "IGH", @@ -19,11 +43,33 @@ { "allele_description_id": "OGRDB:A00301", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": 
"1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], "release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2DBF", "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA", @@ -36,7 +82,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -70,7 +119,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", @@ -187,11 +236,33 @@ { "allele_description_id": "OGRDB:A00314", "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO", - "maintainer": "William Lees", - "acknowledgements": [], - "lab_address": "Birkbeck College, University of London, Malet Street, London", + "acknowledgements": [ + { + "contributor_id": "1", + "name": "William Lees", + "orcid_id": { + "id": "ORCID:0000-0001-9834-6840", + "label": "William Lees" + }, + "affiliation": { + "id": "ROR:02mb95055", + "label": "Birkbeck, University of London" + }, + "affiliation_department":"", + "contributions": [ + { + "role": "investigation", + "degree": null + }, + { + "role": "data curation", + "degree": null + } + ] + } + ], 
"release_version": 1, - "release_date": "24-Nov-2021", + "release_date": "2021-11-24", "release_description": "First release", "label": "IGHV-2ETO", "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC", @@ -204,7 +275,10 @@ "sequence_type": "V", "functional": true, "inference_type": "rearranged_only", - "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" }, + "species": { + "id": "NCBITAXON:10090", + "label": "Mus musculus" + }, "species_subgroup": "CAST_EiJ", "species_subgroup_type": "strain", "status": "active", @@ -238,7 +312,7 @@ "fwr3_start": 161, "fwr3_end": 294, "cdr3_start": 295, - "alignment": [ + "alignment_labels": [ "1", "2", "3", diff --git a/tests/data/good_repertoire.yaml b/tests/data/good_repertoire.yaml index 9bf3a4653..6adaa2361 100644 --- a/tests/data/good_repertoire.yaml +++ b/tests/data/good_repertoire.yaml @@ -11,28 +11,50 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. 
Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null - keywords_study: + keywords_study: - "contains_ig" - "contains_tr" subject: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -63,15 +85,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000788" + id: "CL:0000788" label: "naive B cell" cell_phenotype: "expression of CD20 and the absence of CD27" 
cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -144,13 +166,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. 
Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -159,13 +203,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -196,15 +240,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000787" + id: "CL:0000787" label: "memory B cell" cell_phenotype: "expression of CD20 and CD27" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS @@ -277,13 +321,35 @@ Repertoire: id: null label: null study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. 
Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level." - study_contact: "Mark M. Davis, mmdavis@stanford.edu, ORCID:0000-0001-6868-657X" inclusion_exclusion_criteria: null - lab_name: "Mark M. Davis" - lab_address: "Stanford University" - submitted_by: "Florian Rubelt" + contributors: + - contributor_id: "1" + name: "Florian Rubelt" + orcid_id: + id: null + label: null + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "investigation" + degree: null + - role: "data curation" + degree: null + - contributor_id: "2" + name: "Mark M. Davis" + orcid_id: + id: "ORCID:0000-0001-6868-657X" + label: "Mark Davis" + affiliation: + id: "ROR:00f54p054" + label: "Stanford University" + affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine" + contributions: + - role: "supervision" + degree: null pub_ids: ["PMID:27005435"] - collected_by: null grants: null keywords_study: - "contains_ig" @@ -292,13 +358,13 @@ Repertoire: subject_id: TW01A synthetic: false species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" sex: female age_min: 27 age_max: 27 age_unit: - id: UO_0000036 + id: UO:0000036 label: year age_event: null ancestry_population: @@ -329,15 +395,15 @@ Repertoire: sample_processing_id: null sample_type: "peripheral venous puncture" tissue: - id: "UBERON_0000178" + id: "UBERON:0000178" label: "blood" tissue_processing: "Ficoll gradient" cell_subset: - id: "CL_0000895" + id: "CL:0000895" label: "naive thymus-derived CD4-positive, alpha-beta T cell" cell_phenotype: "expression of 
CD8 and absence of CD4 and CD45RO" cell_species: - id: "NCBITaxon_9606" + id: "NCBITAXON:9606" label: "Homo sapiens" single_cell: false cell_isolation: FACS