From 5012dd22ab08a7ea0c258d9e0b144eb843f97cd8 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Wed, 21 Feb 2024 17:35:34 -0600
Subject: [PATCH 01/15] good ole make, great for simple tasks

---
 Makefile | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 Makefile

diff --git a/Makefile b/Makefile
new file mode 100644
index 000000000..151bf2dd4
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,66 @@
+# helper commands for keeping the language directories in sync
+
+# All targets are command names rather than files; declare them phony so
+# that same-named paths (e.g. the tests/ directory) never mask a target.
+.PHONY: help gen-v2 docs lang-copy data-copy checks tests python-tests r-tests js-tests
+
+# note: "help" MUST be the first target in the file,
+# when the user types "make" they should get help info
+help:
+	@echo ""
+	@echo "Helper commands for AIRR Standards repository"
+	@echo ""
+	@echo "make gen-v2       -- Generate OpenAPI V2 spec from the V3 spec"
+	@echo "make docs         -- Build documentation"
+	@echo "make lang-copy    -- Copy spec files to language directories"
+	@echo "make data-copy    -- Copy test data files to language directories"
+	@echo "make checks       -- Run consistency checks on spec files"
+	@echo "make tests        -- Run all language test suites"
+	@echo "make python-tests -- Run Python test suite"
+	@echo "make r-tests      -- Run R test suite"
+	@echo "make js-tests     -- Run Javascript test suite"
+	@echo ""
+
+# placeholder until V2 spec generation is implemented
+gen-v2:
+	@echo "Not implemented"
+
+# placeholder so the "make docs" entry in the help text resolves to a target
+docs:
+	@echo "Not implemented"
+
+# copy the spec files from specs/ into the Python and R package directories
+lang-copy:
+	@echo "Copying specs to language directories"
+	cp specs/airr-schema.yaml lang/python/airr/specs
+	cp specs/airr-schema-openapi3.yaml lang/python/airr/specs
+	cp specs/airr-schema.yaml lang/R/inst/extdata
+	cp specs/airr-schema-openapi3.yaml lang/R/inst/extdata
+#	cp specs/airr-schema.yaml lang/js/
+#	cp specs/airr-schema-openapi3.yaml lang/js/
+
+# placeholder until test data copying is implemented
+data-copy:
+	@echo "Not implemented"
+
+# run the spec consistency checker script
+checks:
+	@echo "Running consistency checks on spec files"
+	python3 tests/check-consistency-formats.py
+
+# aggregate target: runs every language test suite
+tests: python-tests r-tests js-tests
+
+# "&&" (not ";") so the tests only run if the cd succeeded
+python-tests:
+	@echo "Running Python test suite"
+	cd lang/python && python3 -m unittest discover
+
+r-tests:
+	@echo "Running R test suite"
+	cd lang/R && R -e "library(devtools); test()"
+
+# no Javascript test command is wired up yet
+js-tests:
+	@echo "Running Javascript test suite"
+	@echo "Not implemented"

From 8dd05cb6cb954028131bc73998d65eb2e221be9f Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Wed, 21 Feb 2024 18:02:24 -0600
Subject: [PATCH 02/15] openapi v3 spec in the lang directories, add
 consistency checks

---
 lang/R/inst/extdata/airr-schema-openapi3.yaml | 5091 +++++++++++++++++
 .../airr/specs/airr-schema-openapi3.yaml      | 5091 +++++++++++++++++
 tests/check-consistency-formats.py            |   41 +-
 3 files changed, 10221 insertions(+), 2 deletions(-)
 create mode 100644 lang/R/inst/extdata/airr-schema-openapi3.yaml
 create mode 100644 lang/python/airr/specs/airr-schema-openapi3.yaml

diff --git a/lang/R/inst/extdata/airr-schema-openapi3.yaml b/lang/R/inst/extdata/airr-schema-openapi3.yaml
new file mode 100644
index 000000000..bba3a45d8
--- /dev/null
+++ b/lang/R/inst/extdata/airr-schema-openapi3.yaml
@@ -0,0 +1,5091 @@
+#
+# Schema definitions for AIRR standards objects
+#
+Info:
+    title: AIRR Schema
+    description: Schema definitions for AIRR standards objects
+    version: "1.4"  # quoted so it loads as a string (unquoted 1.4 is a float)
+    contact:
+        name: AIRR Community
+        url: https://github.com/airr-community
+    license:
+        name: Creative Commons Attribution 4.0 International
+        url: https://creativecommons.org/licenses/by/4.0/
+
+
+# Standard schema definition used by properties that are based upon an
+# ontology: the CURIE `id` of a concept plus its `label` in that ontology.
+Ontology:
+    type: object
+    properties:
+        id:
+            type: string
+            nullable: true
+            description: CURIE of the concept, encoding the ontology and the local ID
+        label:
+            type: string
+            nullable: true
+            description: Label of the concept in the respective ontology
+
+# Map to expand CURIE prefixes to full IRIs; each default.map names a key of that entry's `map`
+CURIEMap:
+    ABREG:
+        type: identifier
+        default:
+            map: ABREG
+        map:
+            ABREG:
+                iri_prefix: "http://antibodyregistry.org/AB_"
+    CHEBI:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/CHEBI_"
+    CL:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/CL_"
+    DOI:
+        type: identifier
+        default:
+            map: DOI
+        map:
+            DOI:
+                iri_prefix: "https://doi.org/"
+    DOID:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/DOID_"
+    ENA:
+        type: identifier
+        default:
+            map: ENA
+        map:
+            ENA:
+                iri_prefix: "https://www.ebi.ac.uk/ena/browser/view/"
+    ENSG:
+        type: identifier
+        default:
+            map: ENSG
+        map:
+            ENSG:
+                iri_prefix: "https://www.ensembl.org/Multi/Search/Results?q="
+    GAZ:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/GAZ_"
+    IEDB_RECEPTOR:
+        type: identifier
+        default:
+            map: IEDB
+            provider: IEDB
+        map:
+            IEDB:
+                iri_prefix: "https://www.iedb.org/receptor/"
+    MRO:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/MRO_"
+    NCBITAXON:
+        type: taxonomy
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/NCBITaxon_"
+            BioPortal:
+                iri_prefix: "http://purl.bioontology.org/ontology/NCBITAXON/"
+    NCIT:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/NCIT_"
+    ORCID:
+        type: catalog
+        default:
+            map: ORCID
+            provider: ORCID
+        map:
+            ORCID:
+                iri_prefix: "https://orcid.org/"
+    ROR:
+        type: catalog
+        default:
+            map: ROR
+            provider: ROR
+        map:
+            ROR:
+                iri_prefix: "https://ror.org/"
+    SRA:
+        type: identifier
+        default:
+            map: SRA
+        map:
+            SRA:
+                iri_prefix: "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run="
+    UBERON:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/UBERON_"
+    UNIPROT:
+        type: identifier
+        default:
+            map: UNIPROT
+        map:
+            UNIPROT:
+                iri_prefix: "http://purl.uniprot.org/uniprot/"
+    UO:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/UO_"
+
+InformationProvider:  # request templates per provider, plus per-prefix values for their placeholders
+    provider:  # one entry per provider: request URL template and response media type
+        ENA:
+            request:
+                url: "{iri}"
+                response: text/html
+        IEDB:
+            request:
+                url: "https://query-api.iedb.org/tcr_search?receptor_group_id=eq.{local_id}"
+                response: application/json
+        OLS:
+            request:
+                url: "https://www.ebi.ac.uk/ols/api/ontologies/{ontology_id}/terms?iri={iri}"
+                response: application/json
+        Ontobee:
+            request:
+                url: "http://www.ontobee.org/ontology/rdf/{ontology_id}?iri={iri}"
+                response: application/rdf+xml
+        ORCID:
+            request:
+                url: "https://pub.orcid.org/v2.1/{local_id}"
+                header:
+                    Accept: application/json
+                response: application/json
+        ROR:
+            request:
+                url: "https://api.ror.org/organizations/{iri}"
+                response: application/json
+        SRA:
+            request:
+                url: "{iri}"
+                response: text/html
+    parameter:  # keyed by CURIE prefix, then provider name; supplies the {ontology_id} value
+        CHEBI:
+            Ontobee:
+                ontology_id: CHEBI
+            OLS:
+                ontology_id: chebi
+        CL:
+            Ontobee:
+                ontology_id: CL
+            OLS:
+                ontology_id: cl
+        DOID:
+            Ontobee:
+                ontology_id: DOID
+            OLS:
+                ontology_id: doid
+        GAZ:
+            Ontobee:
+                ontology_id: GAZ
+            OLS:
+                ontology_id: gaz
+        MRO:
+            Ontobee:
+                ontology_id: MRO
+            OLS:
+                ontology_id: mro
+        NCBITAXON:
+            Ontobee:
+                ontology_id: NCBITaxon
+            OLS:
+                ontology_id: ncbitaxon
+            BioPortal:  # NOTE(review): no BioPortal entry exists under `provider` above — confirm
+                ontology_id: NCBITAXON
+        NCIT:
+            Ontobee:
+                ontology_id: NCIT
+            OLS:
+                ontology_id: ncit
+        UBERON:
+            Ontobee:
+                ontology_id: UBERON
+            OLS:
+                ontology_id: uberon
+        UO:
+            Ontobee:
+                ontology_id: UO
+            OLS:
+                ontology_id: uo
+
+# AIRR specification extensions
+#
+# The schema definitions for AIRR standards objects are extended to
+# provide a number of AIRR-specific attributes. This schema definition
+# specifies the structure, property names and data types. These
+# attributes are attached to an AIRR field with the x-airr property.
+
+Attributes:
+    type: object
+    properties:
+        miairr:
+            type: string
+            description: MiAIRR requirement level.
+            enum:
+                - essential
+                - important
+                - defined
+            default: defined
+        identifier:
+            type: boolean
+            description: >
+                True if the field is an identifier required to link metadata and/or individual
+                sequence records across objects in the complete AIRR Data Model and ADC API.
+            default: false
+        adc-query-support:
+            type: boolean
+            description: >
+                True if an ADC API implementation must support queries on the field.
+                If false, query support for the field in ADC API implementations is optional.
+            default: false
+        adc-api-optional:
+            type: boolean
+            description: >
+                If false, repositories must implement these fields both for queries and query response.
+                Only applies to fields in the ADC API spec that are extensions to the AIRR Standard,
+                targeted at "convenience query fields" that make queries against repositories more
+                efficient than if queries were limited to AIRR fields only.
+                If true, repositories can choose to support the field or not.
+            default: false
+        deprecated:
+            type: boolean
+            description: True if the field has been deprecated from the schema.
+            default: false
+        deprecated-description:
+            type: string
+            description: Information regarding the deprecation of the field.
+        deprecated-replaced-by:
+            type: array
+            items:
+                type: string
+            description: The deprecated field is replaced by this list of fields.
+        set:
+            type: integer
+            description: MiAIRR set
+        subset:
+            type: string
+            description: MiAIRR subset
+        name:
+            type: string
+            description: MiAIRR name
+        format:
+            type: string
+            description: Field format. If null then assume the full range of the field data type
+            enum:
+                - ontology
+                - controlled_vocabulary
+                - physical_quantity
+                - CURIE
+        ontology:
+            type: object
+            description: Ontology definition for field
+            properties:
+                draft:
+                    type: boolean
+                    description: Indicates if ontology definition is a draft
+                top_node:
+                    type: object
+                    description: >
+                        Concept to use as top node for ontology. Note that this must have the same CURIE namespace
+                        as the actually annotated concept.
+                    properties:
+                        id:
+                            type: string
+                            description: CURIE for the top node term
+                        label:
+                            type: string
+                            description: Ontology name for the top node term
+
+# AIRR Data File
+#
+# A JSON data file that holds Repertoire metadata, data processing
+# analysis objects, or any other object in the AIRR Data Model.
+#
+# It is presumed that the objects gathered together in an AIRR Data File are related
+# or relevant to each other, e.g. part of the same study; thus, the ID fields can be
+# internally resolved unless the ID contains an external PID. This implies that AIRR
+# Data Files cannot be merged simply by concatenating arrays; any merge program
+# would need to manage duplicate or conflicting ID values.
+#
+# None of the properties in an AIRR Data File is required, but any property that
+# is provided should not be null (every property below sets nullable: false).
+
+DataFile:
+    type: object
+    properties:
+        Info:
+            nullable: false
+            $ref: '#/InfoObject'
+        Repertoire:
+            type: array
+            nullable: false
+            description: List of repertoires
+            items:
+                $ref: '#/Repertoire'
+        RepertoireGroup:
+            type: array
+            nullable: false
+            description: List of repertoire collections
+            items:
+                $ref: '#/RepertoireGroup'
+        Rearrangement:
+            type: array
+            nullable: false
+            description: List of rearrangement records
+            items:
+                $ref: '#/Rearrangement'
+        Cell:
+            type: array
+            nullable: false
+            description: List of cells
+            items:
+                $ref: '#/Cell'
+        Clone:
+            type: array
+            nullable: false
+            description: List of clones
+            items:
+                $ref: '#/Clone'
+        GermlineSet:
+            type: array
+            nullable: false
+            description: List of germline sets
+            items:
+                $ref: '#/GermlineSet'
+        GenotypeSet:
+            type: array
+            nullable: false
+            description: List of genotype sets
+            items:
+                $ref: '#/GenotypeSet'
+
+# AIRR Info object; intended to be similar to the OpenAPI Info object.
+# TODO: should we point to an OpenAPI schema instead of redefining it here?
+InfoObject:
+    type: object
+    description: Provides information about data and API responses.
+    required:
+        - title
+        - version
+    properties:
+        title:
+            type: string
+            nullable: false
+        version:
+            type: string
+            nullable: false
+        description:
+            type: string
+            nullable: true
+        contact:
+            type: object
+            nullable: true
+            properties:
+                name:
+                    type: string
+                    nullable: true
+                url:
+                    type: string
+                    nullable: true
+                email:
+                    type: string
+                    nullable: true
+        license:
+            type: object
+            nullable: true
+            required:
+                - name
+            properties:
+                name:
+                    type: string
+                    nullable: false
+                url:
+                    type: string
+                    nullable: true
+
+# A time point: a label, a numeric value, and a unit given as an ontology term (top node UO:0000003)
+TimePoint:
+    description: Time point at which an observation or other action was performed.
+    type: object
+    properties:
+        label:
+            type: string
+            nullable: true
+            description: Informative label for the time point
+            example: Pre-operative sampling of cancer tissue
+            x-airr:
+                adc-query-support: true
+        value:
+            type: number
+            nullable: true
+            description: Value of the time point
+            example: -5.0
+            x-airr:
+                adc-query-support: true
+        unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of the time point
+            title: Unit of immunization schedule  # NOTE(review): TimePoint looks generic; title may be a copy-paste — confirm
+            example:
+                id: UO:0000033
+                label: day
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+
+#
+# General objects
+#
+
+# An individual to be acknowledged; acknowledgement_id, name and institution_name are required keys
+Acknowledgement:
+    description: Individual whose contribution to this work should be acknowledged
+    type: object
+    required:
+        - acknowledgement_id
+        - name
+        - institution_name
+    properties:
+        acknowledgement_id:
+            type: string
+            description: unique identifier of this Acknowledgement within the file
+            x-airr:
+                identifier: true
+                miairr: important
+            nullable: true
+        name:
+            type: string
+            nullable: true
+            description: Full name of individual
+        institution_name:
+            type: string
+            nullable: true
+            description: Individual's department and institution name
+        orcid_id:
+            type: string
+            nullable: true
+            description: Individual's ORCID identifier
+
+#
+# Germline gene schema
+#
+
+# Rearranged and genomic germline sequences
+RearrangedSequence:
+    type: object
+    description: >
+        Details of a directly observed rearranged sequence or an inference from rearranged sequences
+        contributing support for a gene or allele.
+    required:  # each name listed here must match a key under `properties`
+        - sequence_id
+        - sequence
+        - derivation
+        - observation_type
+        - repository_name
+        - repository_ref
+        - deposited_version
+        - sequence_start
+        - sequence_end
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of this RearrangedSequence within the file, typically generated by the repository
+                hosting the schema, for example from the underlying ID of the database record.
+            x-airr:
+                identifier: true
+                miairr: important
+        sequence:
+            type: string
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: nucleotide sequence
+        derivation:
+            type: string
+            nullable: true
+            enum:
+                - DNA
+                - RNA
+                - null
+            description: The class of nucleic acid that was used as primary starting material
+            x-airr:
+                miairr: important
+        observation_type:
+            type: string
+            nullable: false
+            enum:
+                - direct_sequencing
+                - inference_from_repertoire
+            description: >
+                The type of observation from which this sequence was drawn, such as direct sequencing or
+                inference from repertoire sequencing data.
+            x-airr:
+                miairr: essential
+        curation:
+            type: string
+            nullable: true
+            description: Curational notes on the sequence
+        repository_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Name of the repository in which the sequence has been deposited
+        repository_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Queryable id or accession number of the sequence published by the repository
+        deposited_version:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Version number of the sequence within the repository
+        sequence_start:
+            type: integer
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited
+        sequence_end:
+            type: integer
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: End co-ordinate of the sequence detailed in this record, within the sequence deposited
+
+UnrearrangedSequence:
+    description: Details of an unrearranged sequence contributing support for a gene or allele
+    type: object
+    required:
+        - sequence_id
+        - sequence
+        - repository_name
+        - assembly_id  # NOTE(review): no assembly_id key is defined under `properties` — confirm intended field
+        - gff_seqid
+        - gff_start
+        - gff_end
+        - strand
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: unique identifier of this UnrearrangedSequence within the file
+        sequence:
+            type: string
+            nullable: false
+            description: >
+                Sequence of interest described in this record. Typically, this will include gene and promoter region.
+            x-airr:
+                miairr: essential
+        curation:
+            type: string
+            nullable: true
+            description: Curational notes on the sequence
+        repository_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Name of the repository in which the assembly or contig is deposited
+        repository_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Queryable id or accession number of the sequence published by the repository
+        patch_no:
+            type: string
+            nullable: true
+            description: Genome assembly patch number in which this gene was determined
+        gff_seqid:
+            type: string
+            nullable: true
+            description: >
+                Sequence (from the assembly) of a window including the gene and preferably also the promoter region.
+        gff_start:
+            type: integer
+            nullable: true
+            description: >
+                Genomic co-ordinates of the start of the sequence of interest described in this record in
+                Ensembl GFF version 3.
+        gff_end:
+            type: integer
+            nullable: true
+            description: >
+                Genomic co-ordinates of the end of the sequence of interest described in this record in
+                Ensembl GFF version 3.
+        strand:
+            type: string
+            nullable: true
+            enum:
+                - "+"
+                - "-"
+                - null
+            description: sense (+ or -)
+
+# V gene delineation: region co-ordinates are positions in the unaligned_sequence property
+SequenceDelineationV:
+    description: Delineation of a V-gene in a particular system
+    type: object
+    required:
+        - sequence_delineation_id
+        - delineation_scheme
+        - fwr1_start
+        - fwr1_end
+        - cdr1_start
+        - cdr1_end
+        - fwr2_start
+        - fwr2_end
+        - cdr2_start
+        - cdr2_end
+        - fwr3_start
+        - fwr3_end
+        - cdr3_start
+    properties:
+        sequence_delineation_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of this SequenceDelineationV within the file. Typically, generated by the
+                repository hosting the record.
+            x-airr:
+                identifier: true
+                miairr: important
+
+        delineation_scheme:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Name of the delineation scheme
+            example: Chothia
+        unaligned_sequence:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: entire V-sequence covered by this delineation
+        aligned_sequence:
+            type: string
+            nullable: true
+            description: >
+                Aligned sequence if this delineation provides an alignment. An aligned sequence should always be
+                provided for IMGT delineations.
+        fwr1_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR1 start co-ordinate in the 'unaligned_sequence' field
+        fwr1_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR1 end co-ordinate in the 'unaligned_sequence' field
+        cdr1_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR1 start co-ordinate in the 'unaligned_sequence' field
+        cdr1_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR1 end co-ordinate in the 'unaligned_sequence' field
+        fwr2_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR2 start co-ordinate in the 'unaligned_sequence' field
+        fwr2_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR2 end co-ordinate in the 'unaligned_sequence' field
+        cdr2_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR2 start co-ordinate in the 'unaligned_sequence' field
+        cdr2_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR2 end co-ordinate in the 'unaligned_sequence' field
+        fwr3_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR3 start co-ordinate in the 'unaligned_sequence' field
+        fwr3_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR3 end co-ordinate in the 'unaligned_sequence' field
+        cdr3_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR3 start co-ordinate in the 'unaligned_sequence' field
+        alignment_labels:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: >
+                One string for each codon in the aligned_sequence indicating the label of that codon according to
+                the numbering of the delineation scheme if it provides one.
+
+# Description of a putative or confirmed Ig receptor gene/allele
+AlleleDescription:
+    description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations
+    type: object
+    required:
+        - allele_description_id
+        - maintainer
+        - lab_address
+        - release_version
+        - release_date
+        - release_description
+        - sequence
+        - coding_sequence
+        - locus
+        - sequence_type
+        - functional
+        - inference_type
+        - species
+    properties:
+        allele_description_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                Unique identifier of this AlleleDescription within the file. Typically, generated by the 
+                repository hosting the record.
+        allele_description_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Unique reference to the allele description, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:IGHV1-69*01.001
+        maintainer:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Maintainer of this sequence record
+        acknowledgements:
+            type: array
+            nullable: true
+            description: List of individuals whose contribution to the gene description should be acknowledged
+            items:
+                $ref: '#/Acknowledgement'
+        lab_address:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Institution and full address of corresponding author
+        release_version:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Version number of this record, updated whenever a revised version is published or released
+        release_date:
+            type: string
+            nullable: true
+            format: date-time
+            x-airr:
+                miairr: important
+            description: Date of this release
+            title: Release Date
+            example: "2021-02-02"
+        release_description:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Brief descriptive notes of the reason for this release and the changes embodied
+        label:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: >
+                The accepted name for this gene or allele following the relevant nomenclature.
+                The value in this field should correspond to values in acceptable name fields of other schemas, 
+                such as v_call, d_call, and j_call fields.
+            example: IGHV1-69*01
+        sequence:
+            type: string
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: >
+                Nucleotide sequence of the gene. This should cover the full length that is available,
+                including where possible RSS, and 5' UTR and lead-in for V-gene sequences.
+        coding_sequence:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: >
+                Nucleotide sequence of the core coding region, such as the coding region of a D-, J- or C- gene
+                or the coding region of a V-gene excluding the leader.
+        aliases:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: Alternative names for this sequence
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRG
+                - TRD
+            description: Gene locus
+            x-airr:
+                miairr: essential
+        chromosome:
+            type: integer
+            nullable: true
+            description: chromosome on which the gene is located
+        sequence_type:
+            type: string
+            nullable: false
+            enum:
+                - V
+                - D
+                - J
+                - C
+            description: Sequence type (V, D, J, C)
+            x-airr:
+                miairr: essential
+        functional:
+            type: boolean
+            nullable: true
+            x-airr:
+                miairr: important
+            description: True if the gene is functional, false if it is a pseudogene
+        inference_type:
+            type: string
+            nullable: true
+            enum:
+                - genomic_and_rearranged
+                - genomic_only
+                - rearranged_only
+                - null
+            description: Type of inference(s) from which this gene sequence was inferred
+            x-airr:
+                miairr: important
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: essential
+        species_subgroup:
+            type: string
+            nullable: true
+            description: Race, strain or other species subgroup to which this subject belongs
+            example: BALB/c
+        species_subgroup_type:
+            type: string
+            nullable: true
+            enum:
+                - breed
+                - strain
+                - inbred
+                - outbred
+                - locational
+                - null
+        status:
+            type: string
+            nullable: true
+            enum:
+                - active
+                - draft
+                - retired
+                - withdrawn
+                - null
+            description: Status of record, assumed active if the field is not present
+        subgroup_designation:
+            type: string
+            nullable: true
+            description: Identifier of the gene subgroup or clade, as (and if) defined
+        gene_designation:
+            type: string
+            nullable: true
+            description: Gene number or other identifier, as (and if) defined
+        allele_designation:
+            type: string
+            nullable: true
+            description: Allele number or other identifier, as (and if) defined
+        allele_similarity_cluster_designation:
+            type: string
+            nullable: true
+            description: ID of the similarity cluster used in this germline set, if designated
+        allele_similarity_cluster_member_id:
+            type: string
+            nullable: true
+            description: Membership ID of the allele within the similarity cluster, if a cluster is designated
+        j_codon_frame:
+            type: integer
+            nullable: true
+            enum:
+                - 1
+                - 2
+                - 3
+                - null
+            description: >
+                Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes.
+                Not used for V or D genes. '1' means the sequence is in-frame, '2' means that the first bp is
+                missing from the first codon, and '3' means that the first 2 bp are missing.
+        gene_start:
+            type: integer
+            nullable: true
+            description: >
+                Co-ordinate in the sequence field of the first nucleotide in the coding_sequence field.
+            x-airr:
+                miairr: important
+        gene_end:
+            type: integer
+            nullable: true
+            description: >
+                Co-ordinate in the sequence field of the last gene-coding nucleotide in the coding_sequence field.
+            x-airr:
+                miairr: important
+        utr_5_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 5 prime UTR (V-genes only).
+        utr_5_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 5 prime UTR (V-genes only).
+        leader_1_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of L-PART1 (V-genes only).
+        leader_1_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of L-PART1 (V-genes only).
+        leader_2_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of L-PART2 (V-genes only).
+        leader_2_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of L-PART2 (V-genes only).
+        v_rs_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the V recombination site (V-genes only).
+        v_rs_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the V recombination site (V-genes only).
+        d_rs_3_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only).
+        d_rs_3_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only).
+        d_rs_5_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only).
+        d_rs_5_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only).
+        j_cdr3_end:
+            type: integer
+            nullable: true
+            description: >
+                In the case of a J-gene, the co-ordinate in the sequence field of the first nucleotide of the
+                conserved PHE or TRP (IMGT codon position 118).
+        j_rs_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of J recombination site (J-genes only).
+        j_rs_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of J recombination site (J-genes only).
+        j_donor_splice:
+            type: integer
+            nullable: true
+            description: Co-ordinate in the sequence field of the final 3' nucleotide of the J-REGION (J-genes only).
+        v_gene_delineations:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/SequenceDelineationV'
+        unrearranged_support:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/UnrearrangedSequence'
+        rearranged_support:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/RearrangedSequence'
+        paralogs:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: Gene symbols of any paralogs
+        curation:
+            type: string
+            nullable: true
+            description: >
+                Curational notes on the AlleleDescription. This can be used to give more extensive notes on the
+                decisions taken than are provided in the release_description.
+        curational_tags:
+            type: array
+            nullable: true
+            items:
+                type: string
+                enum:
+                    - likely_truncated
+                    - likely_full_length
+            description: Controlled-vocabulary tags applied to this description
+
+# Collection of gene descriptions (AlleleDescriptions) into a germline set for a single locus
+GermlineSet:
+    type: object
+    description: >
+        A germline object set bringing together multiple AlleleDescriptions from the same strain or species.
+        All genes in a GermlineSet should be from a single locus.
+    required:  # every key below must be present; only species and locus are non-nullable
+        - germline_set_id
+        - author
+        - lab_name
+        - lab_address
+        - release_version
+        - release_description
+        - release_date
+        - germline_set_name
+        - germline_set_ref
+        - species
+        - locus
+        - allele_descriptions
+    properties:
+        germline_set_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of the GermlineSet within this file. Typically, generated by the
+                repository hosting the record.
+            x-airr:
+                identifier: true
+                miairr: important
+        author:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Corresponding author
+        lab_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Department of corresponding author
+        lab_address:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Institutional address of corresponding author
+        acknowledgements:
+            type: array
+            nullable: true
+            description: List of individuals whose contribution to the germline set should be acknowledged
+            items:
+                $ref: '#/Acknowledgement'
+        release_version:
+            type: number
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Version number of this record, allocated automatically
+        release_description:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Brief descriptive notes of the reason for this release and the changes embodied
+        release_date:
+            type: string
+            nullable: true
+            format: date-time  # NOTE(review): the example below is a bare date, not a full date-time; confirm intent
+            x-airr:
+                miairr: important
+            description: Date of this release
+            title: Release Date
+            example: "2021-02-02"
+        germline_set_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: descriptive name of this germline set
+        germline_set_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+        pub_ids:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: Publications describing the germline set
+            example: ["PMID:35720344"]
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+        species_subgroup:
+            type: string
+            nullable: true
+            description: Race, strain or other species subgroup to which this subject belongs
+            example: BALB/c
+        species_subgroup_type:
+            type: string
+            nullable: true
+            enum:
+                - breed
+                - strain
+                - inbred
+                - outbred
+                - locational
+                - null
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRG
+                - TRD
+            description: Gene locus
+            x-airr:
+                miairr: essential
+        allele_descriptions:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/AlleleDescription'
+            description: list of allele_descriptions in the germline set
+            x-airr:
+                miairr: important
+        curation:
+            type: string
+            nullable: true
+            description: >
+                Curational notes on the GermlineSet. This can be used to give more extensive notes on the
+                decisions taken than are provided in the release_description.
+
+#
+# Genotype schema
+#
+
+# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject
+
+GenotypeSet:
+    type: object
+    required:  # the key must be present, although its value is nullable
+        - receptor_genotype_set_id
+    properties:
+        receptor_genotype_set_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                A unique identifier for this Receptor Genotype Set, typically generated by the repository
+                hosting the schema, for example from the underlying ID of the database record.
+        genotype_class_list:
+            description: List of Genotypes included in this Receptor Genotype Set.
+            type: array
+            nullable: true
+            items:
+                $ref: '#/Genotype'
+
+# Genotype of adaptive immune receptors
+# This enumerates the alleles and gene deletions inferred in a single subject.
+# Included alleles may either be listed by reference to a GermlineSet, or
+# listed as 'undocumented', in which case the inferred sequence is provided
+
+Genotype:
+    type: object
+    required:  # both keys must be present; only locus is non-nullable
+        - receptor_genotype_id
+        - locus
+    properties:
+        receptor_genotype_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                A unique identifier within the file for this Receptor Genotype, typically generated by the
+                repository hosting the schema, for example from the underlying ID of the database record.
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+            description: Gene locus
+            example: IGH
+            x-airr:
+                adc-query-support: true
+                format: controlled_vocabulary
+                miairr: essential
+        documented_alleles:
+            type: array
+            nullable: true
+            description: List of alleles documented in reference set(s)
+            items:
+                $ref: '#/DocumentedAllele'
+            x-airr:
+                miairr: important
+        undocumented_alleles:
+            type: array
+            nullable: true
+            description: List of alleles inferred to be present and not documented in an identified GermlineSet
+            items:
+                $ref: '#/UndocumentedAllele'
+            x-airr:
+                adc-query-support: true
+        deleted_genes:
+            type: array
+            nullable: true
+            description: Array of genes identified as being deleted in this genotype
+            items:
+                $ref: '#/DeletedGene'
+            x-airr:
+                adc-query-support: true
+        inference_process:
+            type: string
+            nullable: true
+            enum:
+                - genomic_sequencing
+                - repertoire_sequencing
+                - null
+            description: Information on how the genotype was acquired. Controlled vocabulary.
+            title: Genotype acquisition process
+            example: repertoire_sequencing
+            x-airr:
+                adc-query-support: true
+                format: controlled_vocabulary
+
+# Documented Allele
+# This describes a 'known' allele found in a genotype
+# It is 'known' in the sense that it is documented in a reference set
+
+DocumentedAllele:
+    type: object
+    required:  # both keys must be present, although their values are nullable
+        - label
+        - germline_set_ref
+    properties:
+        label:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: The accepted name for this allele, taken from the GermlineSet
+        germline_set_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+# Undocumented Allele
+# This describes an 'undocumented' allele found in a genotype
+# It is 'undocumented' in the sense that it was not found in reference sets consulted for the analysis
+
+UndocumentedAllele:
+    required:  # both keys must be present; only sequence is non-nullable
+        - allele_name
+        - sequence
+    type: object
+    properties:
+        allele_name:
+            type: string
+            nullable: true
+            description: Allele name as allocated by the inference pipeline
+            x-airr:
+                miairr: important
+        sequence:
+            type: string
+            nullable: false
+            description: nt sequence of the allele, as provided by the inference pipeline
+            x-airr:
+                miairr: essential
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+# Deleted Gene
+# It is regarded as 'deleted' in the sense that it was not identified during inference of the genotype
+
+DeletedGene:
+    required:  # both keys must be present; only label is non-nullable
+        - label
+        - germline_set_ref
+    type: object
+    properties:
+        label:
+            type: string
+            nullable: false
+            description: The accepted name for this gene, taken from the GermlineSet
+            x-airr:
+                miairr: essential
+        germline_set_ref:
+            type: string
+            nullable: true
+            description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version)
+            x-airr:
+                miairr: important
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+
+# Set of MHCGenotype records that together describe the MHC genotype of one subject
+MHCGenotypeSet:
+    type: object
+    required:
+        - mhc_genotype_set_id
+        - mhc_genotype_list
+    properties:
+        mhc_genotype_set_id:
+            description: A unique identifier for this MHCGenotypeSet
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                identifier: true
+        mhc_genotype_list:
+            type: array
+            items:
+                $ref: '#/MHCGenotype'
+            nullable: true
+            description: List of MHCGenotypes included in this set
+            x-airr:
+                miairr: important
+
+# MHC genotype of an individual for one class of loci: class I, class II or non-classical
+MHCGenotype:
+    type: object
+    required:
+        - mhc_genotype_id
+        - mhc_class
+        - mhc_alleles
+    properties:
+        mhc_genotype_id:
+            description: A unique identifier for this MHCGenotype, assumed to be unique in the context of the study
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                identifier: true
+        mhc_class:
+            description: Class of MHC alleles described by the MHCGenotype
+            example: MHC-I
+            type: string
+            nullable: false
+            enum:
+                - MHC-I
+                - MHC-II
+                - MHC-nonclassical
+            x-airr:
+                format: controlled_vocabulary
+                adc-query-support: true
+                miairr: essential
+        mhc_alleles:
+            description: List of MHC alleles of the indicated mhc_class identified in an individual
+            type: array
+            items:
+                $ref: '#/MHCAllele'
+            nullable: true
+            x-airr:
+                adc-query-support: true
+                miairr: important
+        mhc_genotyping_method:
+            title: MHC genotyping method
+            description: >
+                Information on how the genotype was determined. The content of this field should come from a list of
+                recommended terms provided in the AIRR Schema documentation.
+            example: pcr_low_resolution
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                adc-query-support: true
+
+
+# A single allele of an MHC gene
+MHCAllele:
+    type: object
+    properties:
+        allele_designation:
+            description: >
+                The accepted designation of an allele, usually its gene symbol plus allele/sub-allele/etc
+                identifiers, if provided by the mhc_typing method
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+        gene:
+            title: MHC gene
+            description: The MHC gene to which the described allele belongs
+            $ref: '#/Ontology'
+            nullable: true
+            example:
+                id: MRO:0000046
+                label: HLA-A
+            x-airr:
+                miairr: important
+                format: ontology
+                adc-query-support: false
+                ontology:
+                    top_node:
+                        id: MRO:0000004
+                        label: MHC gene
+                    draft: true
+        reference_set_ref:
+            description: Repository and list from which it was taken (issuer/name/version)
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+
+
+SubjectGenotype:  # groups a subject's receptor genotype set and MHC genotype set
+    type: object
+    properties:
+        receptor_genotype_set:
+            description: Immune receptor genotype set for this subject.
+            $ref: '#/GenotypeSet'
+            nullable: true
+        mhc_genotype_set:
+            description: MHC genotype set for this subject.
+            $ref: '#/MHCGenotypeSet'
+            nullable: true
+
+#
+# Repertoire metadata schema
+#
+
+# The overall study with a globally unique study_id
+Study:
+    type: object
+    required:
+        - study_id
+        - study_title
+        - study_type
+        - inclusion_exclusion_criteria
+        - grants
+        - collected_by
+        - lab_name
+        - lab_address
+        - submitted_by
+        - pub_ids
+        - keywords_study
+    properties:
+        study_id:
+            type: string
+            nullable: true
+            description: >
+                Unique ID assigned by study registry such as one of the International Nucleotide Sequence Database
+                Collaboration (INSDC) repositories.
+            title: Study ID
+            example: PRJNA001
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study ID
+        study_title:
+            type: string
+            nullable: true
+            description: Descriptive study title
+            title: Study title
+            example: Effects of sun light exposure of the Treg repertoire
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study title
+        study_type:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Type of study design
+            title: Study type
+            example:
+                id: NCIT:C15197
+                label: Case-Control Study
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study type
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCIT:C63536
+                        label: Study
+        study_description:
+            type: string
+            nullable: true
+            description: Generic study description
+            title: Study description
+            example: Longer description
+            x-airr:
+                name: Study description
+                adc-query-support: true
+        inclusion_exclusion_criteria:
+            type: string
+            nullable: true
+            description: List of criteria for inclusion/exclusion for the study
+            title: Study inclusion/exclusion criteria
+            example: "Include: Clinical P. falciparum infection; Exclude: Seropositive for HIV"
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study inclusion/exclusion criteria
+        grants:
+            type: string
+            nullable: true
+            description: Funding agencies and grant numbers
+            title: Grant funding agency
+            example: NIH, award number R01GM987654
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Grant funding agency
+        study_contact:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the contact persons for this study. This should include an e-mail address
+                and a persistent identifier such as an ORCID ID.
+            title: Contact information (study)
+            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                adc-query-support: true
+                name: Contact information (study)
+        collected_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data collector, i.e. the person who is legally responsible for data
+                collection and release. This should include an e-mail address and a persistent identifier such as an
+                ORCID ID.
+            title: Contact information (data collection)
+            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contact information (data collection)
+        lab_name:
+            type: string
+            nullable: true
+            description: Department of data collector
+            title: Lab name
+            example: Department for Planar Immunology
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Lab name
+        lab_address:
+            type: string
+            nullable: true
+            description: Institution and institutional address of data collector
+            title: Lab address
+            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Lab address
+        submitted_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data depositor, i.e., the person submitting the data to a repository.
+                This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
+                supposed to be a short-lived and technical role until the submission is released.
+            title: Contact information (data deposition)
+            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contact information (data deposition)
+        pub_ids:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: >
+                Array of publications describing the rationale and/or outcome of the study as an array of CURIE objects such as
+                a DOI or Pubmed ID. Where more than one publication is given, if there is a primary publication for the study it
+                should come first.
+            title: Relevant publications
+            example: ["PMID:29144493", "DOI:10.1038/ni.3873"]
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Relevant publications
+        keywords_study:
+            type: array
+            items:
+                type: string
+                enum:
+                    - contains_ig
+                    - contains_tr
+                    - contains_paired_chain
+                    - contains_schema_rearrangement
+                    - contains_schema_clone
+                    - contains_schema_cell
+                    - contains_schema_receptor
+                    - contains_schema_cellexpression
+                    - contains_schema_receptorreactivity
+            nullable: true
+            description: >
+                Keywords describing properties of one or more data sets in a study. "contains_schema" keywords indicate that
+                the study contains data objects from the AIRR Schema of that type (Rearrangement, Clone, Cell, Receptor) while
+                the other keywords indicate that the study design considers the type of data indicated (e.g. it is possible to have
+                a study that "contains_paired_chain" but does not "contains_schema_cell").
+            title: Keywords for study
+            example:
+                - contains_ig
+                - contains_schema_rearrangement
+                - contains_schema_clone
+                - contains_schema_cell
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Keywords for study
+                format: controlled_vocabulary
+        adc_publish_date:
+            type: string
+            format: date-time
+            nullable: true
+            description: >
+                Date the study was first published in the AIRR Data Commons.
+            title: ADC Publish Date
+            example: "2021-02-02"
+            x-airr:
+                adc-query-support: true
+                name: ADC Publish Date
+        adc_update_date:
+            type: string
+            format: date-time
+            nullable: true
+            description: >
+                Date the study data was updated in the AIRR Data Commons.
+            title: ADC Update Date
+            example: "2021-02-02"
+            x-airr:
+                adc-query-support: true
+                name: ADC Update Date
+
+# 1-to-n relationship between a study and its subjects
+# subject_id is unique within a study
+Subject:
+    type: object
+    required:
+        - subject_id
+        - synthetic
+        - species
+        - sex
+        - age_min
+        - age_max
+        - age_unit
+        - age_event
+        - ancestry_population
+        - ethnicity
+        - race
+        - strain_name
+        - linked_subjects
+        - link_type
+    properties:
+        subject_id:
+            type: string
+            nullable: true
+            description: >
+                Subject ID assigned by submitter, unique within study. If possible, a persistent subject ID linked to
+                an INSDC or similar repository study should be used.
+            title: Subject ID
+            example: SUB856413
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Subject ID
+        synthetic:
+            type: boolean
+            nullable: false
+            description: TRUE for libraries in which the diversity has been synthetically generated (e.g. phage display)
+            title: Synthetic library
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Synthetic library
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        organism:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Binomial designation of subject's species
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was renamed to species for clarity.
+                deprecated-replaced-by:
+                    - species
+        sex:
+            type: string
+            enum:
+                - male
+                - female
+                - pooled
+                - hermaphrodite
+                - intersex
+                - null
+            nullable: true
+            description: Biological sex of subject
+            title: Sex
+            example: female
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Sex
+                format: controlled_vocabulary
+        age_min:
+            type: number
+            nullable: true
+            description: Specific age or lower boundary of age range.
+            title: Age minimum
+            example: 60
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age minimum
+        age_max:
+            type: number
+            nullable: true
+            description: >
+                Upper boundary of age range or equal to age_min for specific age.
+                This field should only be null if age_min is null.
+            title: Age maximum
+            example: 80
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age maximum
+        age_unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of age range
+            title: Age unit
+            example:
+                id: UO:0000036
+                label: year
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+        age_event:
+            type: string
+            nullable: true
+            description: >
+                Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other
+                implementations submitters need to be aware that there is currently no mechanism to encode the potential
+                delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.
+            title: Age event
+            example: enrollment
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age event
+        age:
+            type: string
+            nullable: true
+            x-airr:
+                deprecated: true
+                deprecated-description: Split into two fields to specify as an age range.
+                deprecated-replaced-by:
+                    - age_min
+                    - age_max
+                    - age_unit
+        ancestry_population:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Broad geographic origin of ancestry (continent)
+            title: Ancestry population
+            example:
+                id: GAZ:00000459
+                label: South America
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Ancestry population
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        location_birth:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Self-reported location of birth of the subject, preferred granularity is country-level
+            example:
+                id: GAZ:00002939
+                label: Poland
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Location of birth
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        ethnicity:
+            type: string
+            nullable: true
+            description: Ethnic group of subject (defined as cultural/language-based membership)
+            title: Ethnicity
+            example: English, Kurds, Manchu, Yakuts (and other fields from Wikipedia)
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Ethnicity
+        race:
+            type: string
+            nullable: true
+            description: Racial group of subject (as defined by NIH)
+            title: Race
+            example: White, American Indian or Alaska Native, Black, Asian, Native Hawaiian or Other Pacific Islander, Other
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Race
+        strain_name:
+            type: string
+            nullable: true
+            description: Non-human designation of the strain or breed of animal used
+            title: Strain name
+            example: C57BL/6J
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Strain name
+        linked_subjects:
+            type: string
+            nullable: true
+            description: Subject ID to which `Relation type` refers
+            title: Relation to other subjects
+            example: SUB1355648
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Relation to other subjects
+        link_type:
+            type: string
+            nullable: true
+            description: Relation between subject and `linked_subjects`, can be genetic or environmental (e.g. exposure)
+            title: Relation type
+            example: father, daughter, household
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Relation type
+        diagnosis:
+            type: array
+            nullable: false
+            description: Diagnosis information for subject
+            items:
+                $ref: '#/Diagnosis'
+            x-airr:
+                adc-query-support: true
+        genotype:
+            nullable: true
+            $ref: '#/SubjectGenotype'
+            title: SubjectGenotype
+
+# 1-to-n relationship between a subject and its diagnoses
+Diagnosis:
+    type: object
+    required:
+        - study_group_description
+        - disease_diagnosis
+        - disease_length
+        - disease_stage
+        - prior_therapies
+        - immunogen
+        - intervention
+        - medical_history
+    properties:
+        study_group_description:
+            type: string
+            nullable: true
+            description: Designation of study arm to which the subject is assigned
+            title: Study group description
+            example: control
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Study group description
+        disease_diagnosis:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Diagnosis of subject
+            title: Diagnosis
+            example:
+                id: DOID:9538
+                label: multiple myeloma
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Diagnosis
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: DOID:4
+                        label: disease
+        disease_length:
+            type: string
+            nullable: true
+            description: Time duration between initial diagnosis and current intervention
+            title: Length of disease
+            example: 23 months
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Length of disease
+                format: physical_quantity
+        disease_stage:
+            type: string
+            nullable: true
+            description: Stage of disease at current intervention
+            title: Disease stage
+            example: Stage II
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Disease stage
+        prior_therapies:
+            type: string
+            nullable: true
+            description: List of all relevant previous therapies applied to subject for treatment of `Diagnosis`
+            title: Prior therapies for primary disease under study
+            example: melphalan/prednisone
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Prior therapies for primary disease under study
+        immunogen:
+            type: string
+            nullable: true
+            description: Antigen, vaccine or drug applied to subject at this intervention
+            title: Immunogen/agent
+            example: bortezomib
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Immunogen/agent
+        intervention:
+            type: string
+            nullable: true
+            description: Description of intervention
+            title: Intervention definition
+            example: systemic chemotherapy, 6 cycles, 1.25 mg/m2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Intervention definition
+        medical_history:
+            type: string
+            nullable: true
+            description: Medical history of subject that is relevant to assess the course of disease and/or treatment
+            title: Other relevant medical history
+            example: MGUS, first diagnosed 5 years prior
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Other relevant medical history
+
+# 1-to-n relationship between a subject and its samples
+# sample_id is unique within a study
+Sample:
+    type: object
+    required:
+        - sample_id
+        - sample_type
+        - tissue
+        - anatomic_site
+        - disease_state_sample
+        - collection_time_point_relative
+        - collection_time_point_relative_unit
+        - collection_time_point_reference
+        - biomaterial_provider
+    properties:
+        sample_id:
+            type: string
+            nullable: true
+            description: >
+                Sample ID assigned by submitter, unique within study. If possible, a persistent sample ID linked to
+                INSDC or similar repository study should be used.
+            title: Biological sample ID
+            example: SUP52415
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Biological sample ID
+        sample_type:
+            type: string
+            nullable: true
+            description: The way the sample was obtained, e.g. fine-needle aspirate, organ harvest, peripheral venous puncture
+            title: Sample type
+            example: Biopsy
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample type
+        tissue:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The actual tissue sampled, e.g. lymph node, liver, peripheral blood
+            title: Tissue
+            example:
+                id: UBERON:0002371
+                label: bone marrow
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Tissue
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UBERON:0010000
+                        label: multicellular anatomical structure
+        anatomic_site:
+            type: string
+            nullable: true
+            description: The anatomic location of the tissue, e.g. Inguinal, femur
+            title: Anatomic site
+            example: Iliac crest
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Anatomic site
+        disease_state_sample:
+            type: string
+            nullable: true
+            description: Histopathologic evaluation of the sample
+            title: Disease state of sample
+            example: Tumor infiltration
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Disease state of sample
+        collection_time_point_relative:
+            type: number
+            nullable: true
+            description: Time point at which sample was taken, relative to `Collection time event`
+            title: Sample collection time
+            example: 14
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample collection time
+        collection_time_point_relative_unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of Sample collection time
+            title: Sample collection time unit
+            example:
+                id: UO:0000033
+                label: day
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample collection time unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+        collection_time_point_reference:
+            type: string
+            nullable: true
+            description: Event in the study schedule to which `Sample collection time` relates
+            title: Collection time event
+            example: Primary vaccination
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Collection time event
+        collection_location:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Location where the sample was taken, preferred granularity is country-level
+            title: Sample collection location
+            example:
+                id: GAZ:00002939
+                label: Poland
+            x-airr:
+                miairr: important
+                set: 2
+                subset: sample
+                name: Sample collection location
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        biomaterial_provider:
+            type: string
+            nullable: true
+            description: Name and address of the entity providing the sample
+            title: Biomaterial provider
+            example: Tissues-R-Us, Tampa, FL, USA
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Biomaterial provider
+
+# 1-to-n relationship between a sample and processing of its cells
+CellProcessing:
+    type: object
+    required:
+        - tissue_processing
+        - cell_subset
+        - cell_phenotype
+        - single_cell
+        - cell_number
+        - cells_per_reaction
+        - cell_storage
+        - cell_quality
+        - cell_isolation
+        - cell_processing_protocol
+    properties:
+        tissue_processing:
+            type: string
+            nullable: true
+            description: Enzymatic digestion and/or physical methods used to isolate cells from sample
+            title: Tissue processing
+            example: Collagenase A/Dnase I digested, followed by Percoll gradient
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Tissue processing
+        cell_subset:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Commonly-used designation of isolated cell population
+            title: Cell subset
+            example:
+                id: CL:0000972
+                label: class switched memory B cell
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell subset
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: CL:0000542
+                        label: lymphocyte
+        cell_phenotype:
+            type: string
+            nullable: true
+            description: List of cellular markers and their expression levels used to isolate the cell population
+            title: Cell subset phenotype
+            example: CD19+ CD38+ CD27+ IgM- IgD-
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell subset phenotype
+        cell_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: >
+                Binomial designation of the species from which the analyzed cells originate. Typically, this value
+                should be identical to `species`, in which case it SHOULD NOT be set explicitly. However, there are
+                valid experimental setups in which the two might differ, e.g., chimeric animal models. If set, this
+                key will overwrite the `species` information for all lower layers of the schema.
+            title: Cell species
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        single_cell:
+            type: boolean
+            nullable: true
+            description: TRUE if single cells were isolated into separate compartments
+            title: Single-cell sort
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Single-cell sort
+        cell_number:
+            type: integer
+            nullable: true
+            description: Total number of cells that went into the experiment
+            title: Number of cells in experiment
+            example: 1000000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Number of cells in experiment
+        cells_per_reaction:
+            type: integer
+            nullable: true
+            description: Number of cells for each biological replicate
+            title: Number of cells per sequencing reaction
+            example: 50000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Number of cells per sequencing reaction
+        cell_storage:
+            type: boolean
+            nullable: true
+            description: TRUE if cells were cryo-preserved between isolation and further processing
+            title: Cell storage
+            example: TRUE
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell storage
+        cell_quality:
+            type: string
+            nullable: true
+            description: Relative amount of viable cells after preparation and (if applicable) thawing
+            title: Cell quality
+            example: 90% viability as determined by 7-AAD
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell quality
+        cell_isolation:
+            type: string
+            nullable: true
+            description: Description of the procedure used for marker-based isolation or enrichment of cells
+            title: Cell isolation / enrichment procedure
+            example: >
+                Cells were stained with fluorochrome labeled antibodies and then sorted on a FlowMerlin (CE) cytometer.
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell isolation / enrichment procedure
+        cell_processing_protocol:
+            type: string
+            nullable: true
+            description: >
+                Description of the methods applied to the sample including cell preparation/isolation/enrichment and
+                nucleic acid extraction. This should closely mirror the Materials and methods section in the manuscript.
+            title: Processing protocol
+            example: Stimulated with anti-CD3/anti-CD28
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Processing protocol
+
+# object for PCR primer targets
+PCRTarget:
+    type: object
+    required:
+        - pcr_target_locus
+        - forward_pcr_primer_target_location
+        - reverse_pcr_primer_target_location
+    properties:
+        pcr_target_locus:
+            type: string
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+                - null
+            nullable: true
+            description: >
+                Designation of the target locus. Note that this field uses a controlled vocabulary that is meant to
+                provide a generic classification of the locus, not necessarily the correct designation according to
+                a specific nomenclature.
+            title: Target locus for PCR
+            example: IGK
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Target locus for PCR
+                format: controlled_vocabulary
+        forward_pcr_primer_target_location:
+            type: string
+            nullable: true
+            description: Position of the most distal nucleotide templated by the forward primer or primer mix
+            title: Forward PCR primer target location
+            example: IGHV, +23
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Forward PCR primer target location
+        reverse_pcr_primer_target_location:
+            type: string
+            nullable: true
+            description: Position of the most proximal nucleotide templated by the reverse primer or primer mix
+            title: Reverse PCR primer target location
+            example: IGHG, +57
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Reverse PCR primer target location
+
+# generally, a 1-to-1 relationship between a CellProcessing and processing of its nucleic acid
+# but may be 1-to-n for technical replicates.
+NucleicAcidProcessing:
+    type: object
+    required:  # keys that must be present; each property's `nullable` decides whether null is accepted
+        - template_class
+        - template_quality
+        - template_amount
+        - template_amount_unit
+        - library_generation_method
+        - library_generation_protocol
+        - library_generation_kit_version
+        - complete_sequences
+        - physical_linkage
+    properties:
+        template_class:
+            type: string
+            enum:
+                - DNA
+                - RNA
+            nullable: false
+            description: >
+                The class of nucleic acid that was used as primary starting material for the following procedures
+            title: Target substrate
+            example: RNA
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Target substrate
+                format: controlled_vocabulary
+        template_quality:
+            type: string
+            nullable: true
+            description: Description and results of the quality control performed on the template material
+            title: Target substrate quality
+            example: RIN 9.2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Target substrate quality
+        template_amount:
+            type: number
+            nullable: true
+            description: Amount of template that went into the process
+            title: Template amount
+            example: 1000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Template amount
+        template_amount_unit:  # unit for the value in template_amount
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of template amount
+            title: Template amount unit
+            example:
+                id: UO:0000024
+                label: nanogram
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Template amount unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:  # label must be the ontology's own label for this id
+                        id: UO:0000002
+                        label: mass unit
+        library_generation_method:
+            type: string
+            enum:
+                - "PCR"
+                - "RT(RHP)+PCR"
+                - "RT(oligo-dT)+PCR"
+                - "RT(oligo-dT)+TS+PCR"
+                - "RT(oligo-dT)+TS(UMI)+PCR"
+                - "RT(specific)+PCR"
+                - "RT(specific)+TS+PCR"
+                - "RT(specific)+TS(UMI)+PCR"
+                - "RT(specific+UMI)+PCR"
+                - "RT(specific+UMI)+TS+PCR"
+                - "RT(specific)+TS"
+                - "other"
+            nullable: false
+            description: Generic type of library generation
+            title: Library generation method
+            example: RT(oligo-dT)+TS(UMI)+PCR
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Library generation method
+                format: controlled_vocabulary
+        library_generation_protocol:
+            type: string
+            nullable: true
+            description: Description of processes applied to substrate to obtain a library that is ready for sequencing
+            title: Library generation protocol
+            example: cDNA was generated using
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Library generation protocol
+        library_generation_kit_version:
+            type: string
+            nullable: true
+            description: When using a library generation protocol from a commercial provider, provide the protocol version number
+            title: Protocol IDs
+            example: v2.1 (2016-09-15)
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Protocol IDs
+        pcr_target:  # not listed under `required`; one PCRTarget entry per amplified locus
+            type: array
+            nullable: false
+            description: >
+                If a PCR step was performed that specifically targets the IG/TR loci, the target and primer locations
+                need to be provided here. This field holds an array of PCRTarget objects, so that multiplex PCR setups
+                amplifying multiple loci at the same time can be annotated using one record per locus. PCR setups not
+                targeting any specific locus must not annotate this field but select the appropriate
+                library_generation_method instead.
+            items:
+                $ref: '#/PCRTarget'
+            x-airr:
+                adc-query-support: true
+        complete_sequences:
+            type: string
+            enum:
+                - partial
+                - complete
+                - "complete+untemplated"
+                - mixed
+            nullable: false
+            description: >
+                To be considered `complete`, the procedure used for library construction MUST generate sequences that
+                1) include the first V gene codon that encodes the mature polypeptide chain (i.e. after the
+                leader sequence) and 2) include the last complete codon of the J gene (i.e. 1 bp 5' of the J->C
+                splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered
+                `complete+untemplated`, the sections of the sequences defined in points 1) to 3) of the previous
+                sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation.
+                `mixed` should only be used if the procedure used for library construction will likely produce multiple
+                categories of sequences in the given experiment. It SHOULD NOT be used as a replacement of a NULL value.
+            title: Complete sequences
+            example: partial
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Complete sequences
+                format: controlled_vocabulary
+        physical_linkage:
+            type: string
+            enum:
+                - none
+                - "hetero_head-head"
+                - "hetero_tail-head"
+                - "hetero_prelinked"
+            nullable: false
+            description: >
+                In case an experimental setup is used that physically links nucleic acids derived from distinct
+                `Rearrangements` before library preparation, this field describes the mode of that linkage. All
+                `hetero_*` terms indicate that in case of paired-read sequencing, the two reads should be expected
+                to map to distinct IG/TR loci. `*_head-head` refers to techniques that link the 5' ends of transcripts
+                in a single-cell context. `*_tail-head` refers to techniques that link the 3' end of one transcript to
+                the 5' end of another one in a single-cell context. This term does not provide any information whether
+                a continuous reading-frame between the two is generated. `*_prelinked` refers to constructs in which
+                the linkage was already present on the DNA level (e.g. scFv).
+            title: Physical linkage of different rearrangements
+            example: hetero_head-head
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Physical linkage of different rearrangements
+                format: controlled_vocabulary
+
+# 1-to-n relationship between a NucleicAcidProcessing and SequencingRun with resultant raw sequence file(s)
+SequencingRun:
+    type: object
+    required:  # every required field is nullable: the key must be present, the value may be null
+        - sequencing_run_id
+        - total_reads_passing_qc_filter
+        - sequencing_platform
+        - sequencing_facility
+        - sequencing_run_date
+        - sequencing_kit
+    properties:
+        sequencing_run_id:
+            type: string
+            nullable: true
+            description: ID of sequencing run assigned by the sequencing facility
+            title: Batch number
+            example: 160101_M01234
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Batch number
+        total_reads_passing_qc_filter:
+            type: integer
+            nullable: true
+            description: Number of usable reads for analysis
+            title: Total reads passing QC filter
+            example: 10365118
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Total reads passing QC filter
+        sequencing_platform:
+            type: string
+            nullable: true
+            description: Designation of sequencing instrument used
+            title: Sequencing platform
+            example: Alumina LoSeq 1000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing platform
+        sequencing_facility:
+            type: string
+            nullable: true
+            description: Name and address of sequencing facility
+            title: Sequencing facility
+            example: Seqs-R-Us, Vancouver, BC, Canada
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing facility
+        sequencing_run_date:
+            type: string
+            nullable: true
+            description: Date of sequencing run
+            title: Date of sequencing run
+            format: date
+            example: "2016-12-16"  # quoted so loaders keep it a string instead of resolving a date object
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Date of sequencing run
+        sequencing_kit:
+            type: string
+            nullable: true
+            description: Name, manufacturer, order and lot numbers of sequencing kit
+            title: Sequencing kit
+            example: "FullSeq 600, Alumina, #M123456C0, 789G1HK"
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing kit
+        sequencing_files:  # not listed under `required`; holds a single SequencingData object
+            $ref: '#/SequencingData'
+            nullable: false
+            description: Set of sequencing files produced by the sequencing run
+            x-airr:
+                adc-query-support: true
+
+# Resultant raw sequencing files from a SequencingRun
+SequencingData:
+    type: object
+    required:  # every required field is nullable: the key must be present, the value may be null
+        - sequencing_data_id
+        - file_type
+        - filename
+        - read_direction
+        - read_length
+        - paired_filename
+        - paired_read_direction
+        - paired_read_length
+    properties:
+        sequencing_data_id:
+            type: string
+            nullable: true
+            description: >
+                Persistent identifier of raw data stored in an archive (e.g. INSDC run ID). Data archive should
+                be identified in the CURIE prefix.
+            title: Raw sequencing data persistent identifier
+            example: "SRA:SRR11610494"
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                format: CURIE
+        file_type:
+            type: string
+            nullable: true
+            description: File format for the raw reads or sequences
+            title: Raw sequencing data file type
+            enum:  # the null entry is a real null, listed alongside `nullable: true`
+                - fasta
+                - fastq
+                - null
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Raw sequencing data file type
+                format: controlled_vocabulary
+        filename:
+            type: string
+            nullable: true
+            description: File name for the raw reads or sequences. The first file in paired-read sequencing.
+            title: Raw sequencing data file name
+            example: MS10R-NMonson-C7JR9_S1_R1_001.fastq
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Raw sequencing data file name
+        read_direction:
+            type: string
+            nullable: true
+            description: Read direction for the raw reads or sequences. The first file in paired-read sequencing.
+            title: Read direction
+            example: forward
+            enum:
+                - forward
+                - reverse
+                - mixed
+                - null
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Read direction
+                format: controlled_vocabulary
+        read_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the first file in paired-read sequencing
+            title: Forward read length
+            example: 300
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Forward read length
+        paired_filename:
+            type: string
+            nullable: true
+            description: File name for the second file in paired-read sequencing
+            title: Paired raw sequencing data file name
+            example: MS10R-NMonson-C7JR9_S1_R2_001.fastq
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired raw sequencing data file name
+        paired_read_direction:
+            type: string
+            nullable: true
+            description: Read direction for the second file in paired-read sequencing
+            title: Paired read direction
+            example: reverse
+            enum:
+                - forward
+                - reverse
+                - mixed
+                - null
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired read direction
+                format: controlled_vocabulary
+        paired_read_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the second file in paired-read sequencing
+            title: Paired read length
+            example: 300
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired read length
+        index_filename:  # index_* fields are not listed under `required`
+            type: string
+            nullable: true
+            description: File name for the index file
+            title: Sequencing index file name
+            example: MS10R-NMonson-C7JR9_S1_R3_001.fastq
+            x-airr:
+                adc-query-support: true
+        index_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the index file
+            title: Index read length
+            example: 8
+            x-airr:
+                adc-query-support: true
+
+# 1-to-n relationship between a Repertoire and its DataProcessing entries
+#
+# Describes the data processing applied to the raw sequence data of a
+# repertoire to produce a set of annotated rearrangement sequences.
+DataProcessing:
+    type: object
+    required:  # every required field is nullable: the key must be present, the value may be null
+        - software_versions
+        - paired_reads_assembly
+        - quality_thresholds
+        - primer_match_cutoffs
+        - collapsing_method
+        - data_processing_protocols
+        - germline_database
+    properties:
+        data_processing_id:  # not listed under `required`
+            type: string
+            nullable: true
+            description: Identifier for the data processing object.
+            title: Data processing ID
+            x-airr:
+                name: Data processing ID
+                adc-query-support: true
+                identifier: true
+        primary_annotation:  # not listed under `required`; non-nullable with a default of false
+            type: boolean
+            default: false
+            nullable: false
+            description: >
+                If true, indicates this is the primary or default data processing for
+                the repertoire and its rearrangements. If false, indicates this is a secondary
+                or additional data processing.
+            title: Primary annotation
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        software_versions:
+            type: string
+            nullable: true
+            description: Version number and / or date, include company pipelines
+            title: Software tools and version numbers
+            example: IgBLAST 1.6
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Software tools and version numbers
+        paired_reads_assembly:
+            type: string
+            nullable: true
+            description: How paired end reads were assembled into a single receptor sequence
+            title: Paired read assembly
+            example: PandaSeq (minimal overlap 50, threshold 0.8)
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Paired read assembly
+        quality_thresholds:
+            type: string
+            nullable: true
+            description: How/if sequences were removed from (4) based on base quality scores
+            title: Quality thresholds
+            example: Average Phred score >=20
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Quality thresholds
+        primer_match_cutoffs:
+            type: string
+            nullable: true
+            description: How primers were identified in the sequences, were they removed/masked/etc?
+            title: Primer match cutoffs
+            example: Hamming distance <= 2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Primer match cutoffs
+        collapsing_method:
+            type: string
+            nullable: true
+            description: The method used for combining multiple sequences from (4) into a single sequence in (5)
+            title: Collapsing method
+            example: MUSCLE 3.8.31
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Collapsing method
+        data_processing_protocols:
+            type: string
+            nullable: true
+            description: General description of how QC is performed
+            title: Data processing protocols
+            example: Data was processed using [...]
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Data processing protocols
+        data_processing_files:  # not listed under `required`; array of plain file-name strings
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: Array of file names for data produced by this data processing.
+            title: Processed data file names
+            example:
+                - 'ERR1278153_aa.txz'
+                - 'ERR1278153_ab.txz'
+                - 'ERR1278153_ac.txz'
+            x-airr:
+                adc-query-support: true
+                name: Processed data file names
+        germline_database:
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            title: V(D)J germline reference database
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: data (processed sequence)
+                name: V(D)J germline reference database
+        germline_set_ref:  # not listed under `required`
+            type: string
+            nullable: true
+            description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+            x-airr:
+                adc-query-support: true
+        analysis_provenance_id:  # not listed under `required`
+            type: string
+            nullable: true
+            description: Identifier for machine-readable PROV model of analysis provenance
+            title: Analysis provenance ID
+            x-airr:
+                adc-query-support: true
+
+SampleProcessing:  # composite: its own ID plus the Sample, CellProcessing, NucleicAcidProcessing and SequencingRun schemas
+    allOf:
+        - type: object  # inline schema that contributes only sample_processing_id
+          properties:
+              sample_processing_id:
+                  type: string
+                  nullable: true
+                  description: >
+                      Identifier for the sample processing object. This field should be unique within the repertoire.
+                      This field can be used to uniquely identify the combination of sample, cell processing,
+                      nucleic acid processing and sequencing run information for the repertoire.
+                  title: Sample processing ID
+                  x-airr:
+                      name: Sample processing ID
+                      adc-query-support: true
+                      identifier: true
+        - $ref: '#/Sample'  # the remaining allOf members are referenced schemas
+        - $ref: '#/CellProcessing'
+        - $ref: '#/NucleicAcidProcessing'
+        - $ref: '#/SequencingRun'
+
+
+# Composite schema for the Repertoire object
+#
+# A Repertoire represents a sample repertoire as defined by the study and
+# experimentally observed by raw sequence data. A repertoire belongs to
+# exactly one subject but may include multiple samples.
+Repertoire:
+    type: object
+    required:  # the four nested metadata sections; the repertoire_* fields are not required
+        - study
+        - subject
+        - sample
+        - data_processing
+    properties:
+        repertoire_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for the repertoire object. This identifier should be globally unique so that repertoires
+                from multiple studies can be combined together without conflict. The repertoire_id is used to link
+                other AIRR data to a Repertoire. Specifically, the Rearrangements Schema includes repertoire_id for
+                referencing the specific Repertoire for that Rearrangement.
+            title: Repertoire ID
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        repertoire_name:
+            type: string
+            nullable: true
+            description: Short generic display name for the repertoire
+            title: Repertoire name
+            x-airr:
+                name: Repertoire name
+                adc-query-support: true
+        repertoire_description:
+            type: string
+            nullable: true
+            description: Generic repertoire description
+            title: Repertoire description
+            x-airr:
+                name: Repertoire description
+                adc-query-support: true
+        study:  # single Study object, non-nullable
+            $ref: '#/Study'
+            nullable: false
+            description: Study object
+            x-airr:
+                adc-query-support: true
+        subject:  # single Subject object, non-nullable
+            $ref: '#/Subject'
+            nullable: false
+            description: Subject object
+            x-airr:
+                adc-query-support: true
+        sample:  # array of SampleProcessing composites, not bare Sample objects
+            type: array
+            nullable: false
+            description: List of Sample Processing objects
+            items:
+                $ref: '#/SampleProcessing'
+            x-airr:
+                adc-query-support: true
+        data_processing:  # array of DataProcessing objects
+            type: array
+            nullable: false
+            description: List of Data Processing objects
+            items:
+                $ref: '#/DataProcessing'
+            x-airr:
+                adc-query-support: true
+
+# A collection of repertoires grouped for analysis purposes; each member may carry an optional time point
+RepertoireGroup:
+    type: object
+    required:  # both required fields are declared nullable
+        - repertoire_group_id
+        - repertoires
+    properties:
+        repertoire_group_id:
+            type: string
+            nullable: true
+            description: Identifier for this repertoire collection
+            x-airr:
+                identifier: true
+        repertoire_group_name:
+            type: string
+            nullable: true
+            description: Short display name for this repertoire collection
+        repertoire_group_description:
+            type: string
+            nullable: true
+            description: Repertoire collection description
+        repertoires:
+            type: array
+            nullable: true
+            description: >
+                List of repertoires in this collection with an associated description and time point designation
+            items:
+                type: object  # one entry per member repertoire; no `required` list is declared for entries
+                properties:
+                    repertoire_id:  # the only non-nullable field of an entry
+                        type: string
+                        nullable: false
+                        description: Identifier to the repertoire
+                        x-airr:
+                            adc-query-support: true
+                    repertoire_description:
+                        type: string
+                        nullable: true
+                        description: Description of this repertoire within the group
+                        x-airr:
+                            adc-query-support: true
+                    time_point:  # references the TimePoint schema; nullable
+                        $ref: '#/TimePoint'
+                        nullable: true
+                        description: Time point designation for this repertoire within the group
+                        x-airr:
+                            adc-query-support: true
+
+Alignment:  # a single alignment of a query sequence to one gene segment
+    type: object
+    required:  # every required field is nullable: the key must be present, the value may be null
+        - sequence_id
+        - segment
+        - call
+        - score
+        - cigar
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Unique query sequence identifier within the file. Most often this will be the input sequence
+                header or a substring thereof, but may also be a custom identifier defined by the tool in
+                cases where query sequences have been combined in some fashion prior to alignment.
+            x-airr:
+                identifier: true
+        segment:
+            type: string
+            nullable: true
+            description: >
+                The segment for this alignment. One of V, D, J or C.
+        rev_comp:
+            type: boolean
+            nullable: true
+            description: >
+                Alignment result is from the reverse complement of the query sequence.
+        call:
+            type: string
+            nullable: true
+            description: >
+                Gene assignment with allele.
+        score:
+            type: number
+            nullable: true
+            description: >
+                Alignment score.
+        identity:
+            type: number
+            nullable: true
+            description: >
+                Alignment fractional identity.
+        support:
+            type: number
+            nullable: true
+            description: >
+                Alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the gene assignment as defined by the alignment tool.
+        cigar:
+            type: string
+            nullable: true
+            description: >
+                Alignment CIGAR string.
+        sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the segment in the query sequence (1-based closed interval).
+        sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the segment in the query sequence (1-based closed interval).
+        germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the reference sequence (1-based closed interval).
+        germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the reference sequence (1-based closed interval).
+        rank:
+            type: integer
+            nullable: true
+            description: >
+                Alignment rank.
+        rearrangement_id:  # deprecated in favour of sequence_id (see x-airr below)
+            type: string
+            nullable: true
+            description: >
+                Identifier for the Rearrangement object. May be identical to sequence_id,
+                but will usually be a universally unique record locator for database applications.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been merged with sequence_id to avoid confusion.
+                deprecated-replaced-by:
+                    - sequence_id
+        data_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the data processing object in the repertoire metadata
+                for this rearrangement. If this field is empty then the primary data processing object is assumed.
+        germline_database:  # deprecated in favour of DataProcessing:germline_database (see x-airr below)
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication.
+                deprecated-replaced-by:
+                    - "DataProcessing:germline_database"
+
+
+# The extended rearrangement object
+Rearrangement:
+    type: object
+    required:
+        - sequence_id
+        - sequence
+        - rev_comp
+        - productive
+        - v_call
+        - d_call
+        - j_call
+        - sequence_alignment
+        - germline_alignment
+        - junction
+        - junction_aa
+        - v_cigar
+        - d_cigar
+        - j_cigar
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Unique query sequence identifier for the Rearrangement. Most often this will be the input sequence
+                header or a substring thereof, but may also be a custom identifier defined by the tool in
+                cases where query sequences have been combined in some fashion prior to alignment. When
+                downloaded from an AIRR Data Commons repository, this will usually be a universally unique
+                record locator for linking with other objects in the AIRR Data Model.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        sequence:
+            type: string
+            nullable: true
+            description: >
+                The query nucleotide sequence. Usually, this is the unmodified input sequence, which may be
+                reverse complemented if necessary. In some cases, this field may contain consensus sequences or
+                other types of collapsed input sequences if these steps are performed prior to alignment.
+        quality:
+            type: string
+            nullable: true
+            description: >
+                The Sanger/Phred quality scores for assessment of sequence quality.
+                Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (used by Illumina from v1.8).
+        sequence_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the query nucleotide sequence.
+        rev_comp:
+            type: boolean
+            nullable: true
+            description: >
+                True if the alignment is on the opposite strand (reverse complemented) with respect to the
+                query sequence. If True then all output data, such as alignment coordinates and sequences,
+                are based on the reverse complement of 'sequence'.
+        productive:
+            type: boolean
+            nullable: true
+            description: >
+                True if the V(D)J sequence is predicted to be productive.
+            x-airr:
+                adc-query-support: true
+        vj_in_frame:
+            type: boolean
+            nullable: true
+            description: True if the V and J gene alignments are in-frame.
+        stop_codon:
+            type: boolean
+            nullable: true
+            description: True if the aligned sequence contains a stop codon.
+        complete_vdj:
+            type: boolean
+            nullable: true
+            description: >
+                True if the sequence alignment spans the entire V(D)J region. Meaning,
+                sequence_alignment includes both the first V gene codon that encodes the
+                mature polypeptide chain (i.e., after the leader sequence) and the last
+                complete codon of the J gene (i.e., before the J-C splice site).
+                This does not require an absence of deletions within the internal
+                FWR and CDR regions of the alignment.
+        locus:
+            type: string
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+                - null
+            nullable: true
+            description: >
+                Gene locus (chain type). Note that this field uses a controlled vocabulary that is meant to provide a
+                generic classification of the locus, not necessarily the correct designation according to a specific
+                nomenclature.
+            title: Gene locus
+            example: IGH
+            x-airr:
+                adc-query-support: true
+                name: Gene locus
+                format: controlled_vocabulary
+        locus_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: >
+                Binomial designation of the species from which the locus originates. Typically, this value should be
+                identical to `organism`, in which case it SHOULD NOT be set explicitly. However, there are valid
+                experimental setups in which the two might differ, e.g. transgenic animal models. If set, this key
+                will overwrite the `organism` information for all lower layers of the schema.
+            title: Locus species
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Locus species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        v_call:
+            type: string
+            nullable: true
+            description: >
+                V gene with allele. If referring to a known reference sequence in a database
+                the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB).
+            title: V gene with allele
+            example: IGHV4-59*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: V gene with allele
+        d_call:
+            type: string
+            nullable: true
+            description: >
+                First or only D gene with allele. If referring to a known reference sequence in a database
+                the relevant gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB).
+            title: D gene with allele
+            example: IGHD3-10*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: D gene with allele
+        d2_call:
+            type: string
+            nullable: true
+            description: >
+                Second D gene with allele. If referring to a known reference sequence in a database the relevant
+                gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB).
+            example: IGHD3-10*01
+        j_call:
+            type: string
+            nullable: true
+            description: >
+                J gene with allele. If referring to a known reference sequence in a database the relevant
+                gene/allele nomenclature should be followed (e.g., IGHJ4*02 if using IMGT/GENE-DB).
+            title: J gene with allele
+            example: IGHJ4*02
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: J gene with allele
+        c_call:
+            type: string
+            nullable: true
+            description: >
+                Constant region gene with allele. If referring to a known reference sequence in a database the
+                relevant gene/allele nomenclature should be followed (e.g., IGHG1*01 if using IMGT/GENE-DB).
+            title: C region
+            example: IGHG1*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: C region
+        sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence, including any indel corrections or numbering spacers,
+                such as IMGT-gaps. Typically, this will include only the V(D)J region, but that is not
+                a requirement.
+        quality_alignment:
+            type: string
+            nullable: true
+            description: >
+                Sanger/Phred quality scores for assessment of sequence_alignment quality.
+                Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (used by Illumina from v1.8).
+        sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the aligned query sequence.
+        germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Assembled, aligned, full-length inferred germline sequence spanning the same region
+                as the sequence_alignment field (typically the V(D)J region) and including the same set
+                of corrections and spacers (if any).
+        germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the assembled germline sequence.
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Junction region nucleotide sequence, where the junction is defined as
+                the CDR3 plus the two flanking conserved codons.
+            title: IMGT-JUNCTION nucleotide sequence
+            example: TGTGCAAGAGCGGGAGTTTACGACGGATATACTATGGACTACTGG
+            x-airr:
+                miairr: important
+                set: 6
+                subset: data (processed sequence)
+                name: IMGT-JUNCTION nucleotide sequence
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+            title: IMGT-JUNCTION amino acid sequence
+            example: CARAGVYDGYTMDYW
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: IMGT-JUNCTION amino acid sequence
+        np1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between the V gene and
+                first D gene alignment or between the V gene and J gene alignments.
+        np1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np1 field.
+        np2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between either the first D gene and J gene
+                alignments or the first D gene and second D gene alignments.
+        np2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np2 field.
+        np3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between the second D gene
+                and J gene alignments.
+        np3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np3 field.
+        cdr1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR1 region.
+        cdr1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr1 field.
+        cdr2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR2 region.
+        cdr2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr2 field.
+        cdr3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR3 region.
+        cdr3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr3 field.
+        fwr1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR1 region.
+        fwr1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr1 field.
+        fwr2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR2 region.
+        fwr2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr2 field.
+        fwr3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR3 region.
+        fwr3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr3 field.
+        fwr4:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR4 region.
+        fwr4_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr4 field.
+        v_score:
+            type: number
+            nullable: true
+            description: Alignment score for the V gene.
+        v_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the V gene alignment.
+        v_support:
+            type: number
+            nullable: true
+            description: >
+                V gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the V gene assignment as defined by the alignment tool.
+        v_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the V gene alignment.
+        d_score:
+            type: number
+            nullable: true
+            description: Alignment score for the first or only D gene alignment.
+        d_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the first or only D gene alignment.
+        d_support:
+            type: number
+            nullable: true
+            description: >
+                D gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the first or only D gene as defined by the alignment tool.
+        d_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the first or only D gene alignment.
+        d2_score:
+            type: number
+            nullable: true
+            description: Alignment score for the second D gene alignment.
+        d2_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the second D gene alignment.
+        d2_support:
+            type: number
+            nullable: true
+            description: >
+                D gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the second D gene as defined by the alignment tool.
+        d2_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the second D gene alignment.
+        j_score:
+            type: number
+            nullable: true
+            description: Alignment score for the J gene alignment.
+        j_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the J gene alignment.
+        j_support:
+            type: number
+            nullable: true
+            description: >
+                J gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the J gene assignment as defined by the alignment tool.
+        j_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the J gene alignment.
+        c_score:
+            type: number
+            nullable: true
+            description: Alignment score for the C gene alignment.
+        c_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the C gene alignment.
+        c_support:
+            type: number
+            nullable: true
+            description: >
+                C gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the C gene assignment as defined by the alignment tool.
+        c_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the C gene alignment.
+        v_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the V gene in the query sequence (1-based closed interval).
+        v_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the V gene in the query sequence (1-based closed interval).
+        v_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the V gene reference sequence (1-based closed interval).
+        v_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the V gene reference sequence (1-based closed interval).
+        v_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        v_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the first or only D gene in the query sequence
+                (1-based closed interval).
+        d_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the first or only D gene in the query sequence
+                (1-based closed interval).
+        d_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the D gene reference sequence for the first or only
+                D gene (1-based closed interval).
+        d_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the D gene reference sequence for the first or only
+                D gene (1-based closed interval).
+        d_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the first or only D gene in both the sequence_alignment
+                and germline_alignment fields (1-based closed interval).
+        d_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the first or only D gene in both the sequence_alignment
+                and germline_alignment fields (1-based closed interval).
+        d2_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the second D gene in the query sequence (1-based closed interval).
+        d2_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the second D gene in the query sequence (1-based closed interval).
+        d2_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the second D gene reference sequence (1-based closed interval).
+        d2_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the second D gene reference sequence (1-based closed interval).
+        d2_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the second D gene alignment in both the sequence_alignment and
+                germline_alignment fields (1-based closed interval).
+        d2_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the second D gene alignment in both the sequence_alignment and
+                germline_alignment fields (1-based closed interval).
+        j_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene in the query sequence (1-based closed interval).
+        j_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene in the query sequence (1-based closed interval).
+        j_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the J gene reference sequence (1-based closed interval).
+        j_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the J gene reference sequence (1-based closed interval).
+        j_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        c_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the C gene in the query sequence (1-based closed interval).
+        c_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the C gene in the query sequence (1-based closed interval).
+        c_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the C gene reference sequence (1-based closed interval).
+        c_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the C gene reference sequence (1-based closed interval).
+        c_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the C gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        c_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the C gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        cdr1_start:
+            type: integer
+            nullable: true
+            description: CDR1 start position in the query sequence (1-based closed interval).
+        cdr1_end:
+            type: integer
+            nullable: true
+            description: CDR1 end position in the query sequence (1-based closed interval).
+        cdr2_start:
+            type: integer
+            nullable: true
+            description: CDR2 start position in the query sequence (1-based closed interval).
+        cdr2_end:
+            type: integer
+            nullable: true
+            description: CDR2 end position in the query sequence (1-based closed interval).
+        cdr3_start:
+            type: integer
+            nullable: true
+            description: CDR3 start position in the query sequence (1-based closed interval).
+        cdr3_end:
+            type: integer
+            nullable: true
+            description: CDR3 end position in the query sequence (1-based closed interval).
+        fwr1_start:
+            type: integer
+            nullable: true
+            description: FWR1 start position in the query sequence (1-based closed interval).
+        fwr1_end:
+            type: integer
+            nullable: true
+            description: FWR1 end position in the query sequence (1-based closed interval).
+        fwr2_start:
+            type: integer
+            nullable: true
+            description: FWR2 start position in the query sequence (1-based closed interval).
+        fwr2_end:
+            type: integer
+            nullable: true
+            description: FWR2 end position in the query sequence (1-based closed interval).
+        fwr3_start:
+            type: integer
+            nullable: true
+            description: FWR3 start position in the query sequence (1-based closed interval).
+        fwr3_end:
+            type: integer
+            nullable: true
+            description: FWR3 end position in the query sequence (1-based closed interval).
+        fwr4_start:
+            type: integer
+            nullable: true
+            description: FWR4 start position in the query sequence (1-based closed interval).
+        fwr4_end:
+            type: integer
+            nullable: true
+            description: FWR4 end position in the query sequence (1-based closed interval).
+        v_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the V gene, including any
+                indel corrections or numbering spacers.
+        v_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the v_sequence_alignment field.
+        d_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the first or only D gene, including any
+                indel corrections or numbering spacers.
+        d_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d_sequence_alignment field.
+        d2_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the second D gene, including any
+                indel corrections or numbering spacers.
+        d2_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d2_sequence_alignment field.
+        j_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the J gene, including any
+                indel corrections or numbering spacers.
+        j_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the j_sequence_alignment field.
+        c_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the constant region, including
+                any indel corrections or numbering spacers.
+        c_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the c_sequence_alignment field.
+        v_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned V gene germline sequence spanning the same region
+                as the v_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        v_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the v_germline_alignment field.
+        d_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned D gene germline sequence spanning the same region
+                as the d_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        d_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d_germline_alignment field.
+        d2_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned second D gene germline sequence spanning the same region
+                as the d2_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        d2_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d2_germline_alignment field.
+        j_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned J gene germline sequence spanning the same region
+                as the j_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        j_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the j_germline_alignment field.
+        c_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned constant region germline sequence spanning the same region
+                as the c_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        c_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the c_germline_alignment field.
+        junction_length:
+            type: integer
+            nullable: true
+            description: Number of nucleotides in the junction sequence.
+        junction_aa_length:
+            type: integer
+            nullable: true
+            description: Number of amino acids in the junction sequence.
+            x-airr:
+                adc-query-support: true
+        np1_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between the V gene and first D gene alignments or
+                between the V gene and J gene alignments.
+        np2_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between either the first D gene and J gene alignments
+                or the first D gene and second D gene alignments.
+        np3_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between the second D gene and J gene alignments.
+        n1_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 5' of the first or only D gene alignment.
+        n2_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 3' of the first or only D gene alignment.
+        n3_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 3' of the second D gene alignment.
+        p3v_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the V gene alignment.
+        p5d_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the first or only D gene alignment.
+        p3d_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the first or only D gene alignment.
+        p5d2_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the second D gene alignment.
+        p3d2_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the second D gene alignment.
+        p5j_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the J gene alignment.
+        v_frameshift:
+            type: boolean
+            nullable: true
+            description: >
+                True if the V gene in the query nucleotide sequence contains a translational
+                frameshift relative to the frame of the V gene reference sequence.
+        j_frameshift:
+            type: boolean
+            nullable: true
+            description: >
+                True if the J gene in the query nucleotide sequence contains a translational
+                frameshift relative to the frame of the J gene reference sequence.
+        d_frame:
+            type: integer
+            nullable: true
+            description: >
+                Numerical reading frame (1, 2, 3) of the first or only D gene in the query nucleotide sequence,
+                where frame 1 is relative to the first codon of D gene reference sequence.
+        d2_frame:
+            type: integer
+            nullable: true
+            description: >
+                Numerical reading frame (1, 2, 3) of the second D gene in the query nucleotide sequence,
+                where frame 1 is relative to the first codon of D gene reference sequence.
+        consensus_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of reads contributing to the UMI consensus or contig assembly for this sequence.
+                For example, the sum of the number of reads for all UMIs that contribute to
+                the query sequence.
+        duplicate_count:
+            type: integer
+            nullable: true
+            description: >
+                Copy number or number of duplicate observations for the query sequence.
+                For example, the number of identical reads observed for this sequence.
+            title: Read count
+            example: 123
+            x-airr:
+                miairr: important
+                set: 6
+                subset: data (processed sequence)
+                name: Read count
+        umi_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of distinct UMIs represented by this sequence.
+                For example, the total number of UMIs that contribute to
+                the contig assembly for the query sequence.
+        cell_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier defining the cell of origin for the query sequence.
+            title: Cell index
+            example: W06_046_091
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: Cell index
+        clone_id:
+            type: string
+            nullable: true
+            description: Clonal cluster assignment for the query sequence.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        sample_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the sample processing object in the repertoire metadata
+                for this rearrangement. If the repertoire has a single sample then
+                this field may be empty or missing. If the repertoire has multiple samples then
+                this field may be empty or missing if the sample cannot be differentiated or
+                the relationship is not maintained by the data processing.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        data_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the data processing object in the repertoire metadata
+                for this rearrangement. If this field is empty then the primary data processing object is assumed.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        rearrangement_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for the Rearrangement object. May be identical to sequence_id,
+                but will usually be a universally unique record locator for database applications.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been merged with sequence_id to avoid confusion.
+                deprecated-replaced-by:
+                    - sequence_id
+        rearrangement_set_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for grouping Rearrangement objects.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been replaced by other specialized identifiers.
+                deprecated-replaced-by:
+                    - repertoire_id
+                    - sample_processing_id
+                    - data_processing_id
+        germline_database:
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication.
+                deprecated-replaced-by:
+                    - "DataProcessing:germline_database"
+
+# A unique inferred clone object that has been constructed within a single data processing
+# for a single repertoire and a subset of its sequences and/or rearrangements.
+Clone:
+    type: object
+    required:  # keys that must be present; both are still declared nullable below
+        - clone_id
+        - germline_alignment
+    properties:
+        clone_id:
+            type: string
+            nullable: true
+            description: Identifier for the clone.
+            x-airr:
+                identifier: true
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            x-airr:
+                adc-query-support: true
+        data_processing_id:
+            type: string
+            nullable: true
+            description: Identifier of the data processing object in the repertoire metadata for this clone.
+            x-airr:
+                adc-query-support: true
+        sequences:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: >
+                List of sequence_id strings that act as keys to the Rearrangement records for members of the clone.
+        v_call:
+            type: string
+            nullable: true
+            description: >
+                V gene with allele of the inferred ancestor of the clone. For example, IGHV4-59*01.
+            example: IGHV4-59*01
+        d_call:
+            type: string
+            nullable: true
+            description: >
+                D gene with allele of the inferred ancestor of the clone. For example, IGHD3-10*01.
+            example: IGHD3-10*01
+        j_call:
+            type: string
+            nullable: true
+            description: >
+                J gene with allele of the inferred ancestor of the clone. For example, IGHJ4*02.
+            example: IGHJ4*02
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence for the junction region of the inferred ancestor of the clone,
+                where the junction is defined as the CDR3 plus the two flanking conserved codons.
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+        junction_length:
+            type: integer
+            nullable: true
+            description: Number of nucleotides in the junction.
+        junction_aa_length:
+            type: integer
+            nullable: true
+            description: Number of amino acids in junction_aa.
+        germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Assembled, aligned, full-length inferred ancestor of the clone spanning the same region
+                as the sequence_alignment field of nodes (typically the V(D)J region) and including the
+                same set of corrections and spacers (if any).
+        germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of germline_alignment.
+        v_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        v_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the D gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the D gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        junction_start:
+            type: integer
+            nullable: true
+            description: Junction region start position in the alignment (1-based closed interval).
+        junction_end:
+            type: integer
+            nullable: true
+            description: Junction region end position in the alignment (1-based closed interval).
+        umi_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of distinct UMIs observed across all sequences (Rearrangement records) in this clone.
+        clone_count:
+            type: integer
+            nullable: true
+            description: >
+                Absolute count of the size (number of members) of this clone in the repertoire.
+                This could simply be the number of sequences (Rearrangement records) observed in this clone,
+                the number of distinct cell barcodes (unique cell_id values),
+                or a more sophisticated calculation appropriate to the experimental protocol.
+                Absolute count is provided versus a frequency so that downstream analysis tools can perform their own normalization.
+        seed_id:
+            type: string
+            nullable: true
+            description: sequence_id of the seed sequence. Empty string (or null) if there is no seed sequence.
+
+# 1-to-n relationship for a clone to its trees.
+Tree:
+    type: object
+    required:  # keys that must be present; all three are still declared nullable below
+        - tree_id
+        - clone_id
+        - newick
+    properties:
+        tree_id:
+            type: string
+            nullable: true
+            description: Identifier for the tree.
+            x-airr:
+                identifier: true
+        clone_id:
+            type: string
+            nullable: true
+            description: Identifier for the clone.
+        newick:
+            type: string
+            nullable: true
+            description: Newick string of the tree edges.
+        nodes:  # not listed under "required" above
+            type: object
+            nullable: true
+            description: Dictionary of nodes in the tree, keyed by sequence_id string
+            additionalProperties:  # every value in the dictionary is a Node object
+                $ref: '#/Node'
+
+# 1-to-n relationship between a tree and its nodes
+Node:
+    type: object
+    required:  # only the identifier is mandatory; the sequence fields may be omitted
+        - sequence_id
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for this node that matches the identifier in the newick string and, where possible,
+                the sequence_id in the source repertoire.
+            x-airr:
+                identifier: true
+        sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the node, aligned to the germline_alignment for this clone,
+                including any indel corrections or spacers.
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Junction region nucleotide sequence for the node, where the junction is defined as
+                the CDR3 plus the two flanking conserved codons.
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+
+# The cell object acts as point of reference for all data that can be related
+# to an individual cell, either by direct observation or inference.
+Cell:
+    type: object
+    required:  # only cell_id is non-nullable; the other required keys may hold null
+        - cell_id
+        - rearrangements
+        - repertoire_id
+        - virtual_pairing
+    properties:
+        cell_id:
+            type: string
+            nullable: false
+            description: >
+                Identifier defining the cell of origin for the query sequence.
+            title: Cell index
+            example: W06_046_091
+            x-airr:
+                identifier: true
+                miairr: defined
+                adc-query-support: true
+                name: Cell index
+        rearrangements:
+            type: array
+            nullable: true
+            description: >
+                Array of sequence identifiers defined for the Rearrangement object
+            title: Cell-associated rearrangements
+            items:
+                type: string
+            example: [id1, id2] #empty vs NULL?
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell-associated rearrangements
+        receptors:
+            type: array
+            nullable: true
+            description: >
+                Array of receptor identifiers defined for the Receptor object
+            title: Cell-associated receptors
+            items:
+                type: string
+            example: [id1, id2] #empty vs NULL?
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell-associated receptors
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            title: Parental repertoire of cell
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Parental repertoire of cell
+        data_processing_id:
+            type: string
+            nullable: true
+            description: Identifier of the data processing object in the repertoire metadata for this cell.
+            title: Data processing for cell
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Data processing for cell
+        expression_study_method:
+            type: string
+            enum:
+                - flow_cytometry
+                - single-cell_transcriptome
+                - null
+            nullable: true
+            description: >
+                Keyword describing the methodology used to assess expression. The values for this field MUST
+                come from a controlled vocabulary.
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+        expression_raw_doi:
+            type: string
+            nullable: true
+            description: >
+                DOI of raw data set containing the current event
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+        expression_index:
+            type: string
+            nullable: true
+            description: >
+                Index addressing the current event within the raw data set.
+            x-airr:
+                miairr: defined
+        virtual_pairing:
+            type: boolean
+            nullable: true
+            description: >
+                boolean to indicate if pairing was inferred.
+            title: Virtual pairing
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Virtual pairing
+
+# The CellExpression object acts as a container to hold a single expression level measurement from
+# an experiment. Expression data is associated with a cell_id and the related repertoire_id and
+# data_processing_id as cell_id is not guaranteed to be unique outside the data processing for
+# a single repertoire.
+CellExpression:
+    type: object
+    required:
+        - expression_id
+        - repertoire_id
+        - data_processing_id
+        - cell_id
+        - property
+        - property_type
+        - value
+    properties:
+        expression_id:
+            type: string
+            description: >
+                Identifier of this expression property measurement.
+            title: Expression property measurement identifier
+            nullable: false
+            x-airr:
+                identifier: true
+                miairr: defined
+                adc-query-support: true
+                name: Expression measurement identifier
+        cell_id:
+            type: string
+            description: >
+                Identifier of the cell to which this expression data is related.
+            title: Cell identifier
+            nullable: false
+            example: W06_046_091
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell identifier
+        repertoire_id:
+            type: string
+            description: Identifier for the associated repertoire in study metadata.
+            title: Parental repertoire of cell
+            nullable: true
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Parental repertoire of cell
+        data_processing_id:
+            type: string
+            description: Identifier of the data processing object in the repertoire metadata for this cell.
+            title: Data processing for cell
+            nullable: true
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Data processing for cell
+        property_type:
+            type: string
+            description: >
+                Keyword describing the property type and detection method used to measure the property value.
+                The following keywords are recommended, but custom property types are also valid:
+                "mrna_expression_by_read_count",
+                "protein_expression_by_fluorescence_intensity", "antigen_bait_binding_by_fluorescence_intensity",
+                "protein_expression_by_dna_barcode_count" and "antigen_bait_binding_by_dna_barcode_count".
+            nullable: false
+            title: Property type and detection method
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Property type and detection method
+        property:
+            $ref: '#/Ontology'
+            nullable: true
+            title: Property information
+            description: >
+                Name of the property observed, typically a gene or antibody identifier (and label) from a
+                canonical resource such as Ensembl (e.g. ENSG00000275747, IGHV3-79) or
+                Antibody Registry (ABREG:1236456, Purified anti-mouse/rat/human CD27 antibody).
+            example:
+                id: ENSG:ENSG00000275747
+                label: IGHV3-79
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                format: ontology
+                name: Property information
+        value:
+            type: number
+            description: Level at which the property was observed in the experiment (non-normalized).
+            title: Property value
+            nullable: true
+            example: 3
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Property value
+
+
+# The Receptor object holds information about a receptor and its reactivity.
+#
+Receptor:
+    type: object
+    required:  # all seven required fields are declared nullable: false below
+        - receptor_id
+        - receptor_hash
+        - receptor_type
+        - receptor_variable_domain_1_aa
+        - receptor_variable_domain_1_locus
+        - receptor_variable_domain_2_aa
+        - receptor_variable_domain_2_locus
+    properties:
+        receptor_id:
+            type: string
+            nullable: false
+            description: ID of the current Receptor object, unique within the local repository.
+            title: Receptor ID
+            example: TCR-MM-012345
+            x-airr:
+                identifier: true
+                adc-query-support: true
+        receptor_hash:
+            type: string
+            nullable: false
+            description: >
+                The SHA256 hash of the receptor amino acid sequence, calculated on the concatenated
+                ``receptor_variable_domain_*_aa`` sequences and represented as base16-encoded string.
+            title: Receptor hash ID
+            example: aa1c4b77a6f4927611ab39f5267415beaa0ba07a952c233d803b07e52261f026
+            x-airr:
+                adc-query-support: true
+        receptor_type:
+            type: string
+            nullable: false
+            enum:
+                - Ig
+                - TCR
+            description: The top-level receptor type, either Immunoglobulin (Ig) or T Cell Receptor (TCR).
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_1_aa:
+            type: string
+            nullable: false
+            description: >
+                Complete amino acid sequence of the mature variable domain of the Ig heavy, TCR beta or TCR delta chain.
+                The mature variable domain is defined as encompassing all AA from and including first AA after the
+                signal peptide to and including the last AA that is completely encoded by the J gene.
+            example: >
+                QVQLQQPGAELVKPGASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGSSYFDYWGQGTTLTVSS
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_1_locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - TRB
+                - TRD
+            description: Locus from which the variable domain in receptor_variable_domain_1_aa originates
+            example: IGH
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_2_aa:
+            type: string
+            nullable: false
+            description: >
+                Complete amino acid sequence of the mature variable domain of the Ig light, TCR alpha or TCR gamma chain.
+                The mature variable domain is defined as encompassing all AA from and including first AA after the
+                signal peptide to and including the last AA that is completely encoded by the J gene.
+            example: >
+                QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLFTGLIGGTNNRAPGVPARFSGSLIGDKAALTITGAQTEDEAIYFCALWYSNHWVFGGGTKLTVL
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_2_locus:
+            type: string
+            nullable: false
+            enum:
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRG
+            description: Locus from which the variable domain in receptor_variable_domain_2_aa originates
+            example: IGL
+            x-airr:
+                adc-query-support: true
+        receptor_ref:
+            type: array
+            nullable: true
+            description: Array of receptor identifiers defined for the Receptor object
+            title: Receptor cross-references
+            items:
+                type: string
+            example: ["IEDB_RECEPTOR:10"]
+            x-airr:
+                adc-query-support: true
+        reactivity_measurements:
+            type: array
+            nullable: true
+            description: Records of reactivity measurement
+            items:  # each element is a ReceptorReactivity object
+                $ref: '#/ReceptorReactivity'
+
+
+ReceptorReactivity:
+    type: object
+    required:  # all seven required fields are declared nullable: false below
+        - ligand_type
+        - antigen_type
+        - antigen
+        - reactivity_method
+        - reactivity_readout
+        - reactivity_value
+        - reactivity_unit
+    properties:
+        ligand_type:
+            type: string
+            nullable: false
+            enum:
+                - "MHC:peptide"
+                - "MHC:non-peptide"
+                - protein
+                - peptide
+                - non-peptidic
+            description: Classification of ligand binding to receptor
+            example: non-peptidic
+        antigen_type:
+            type: string
+            nullable: false
+            enum:
+                - protein
+                - peptide
+                - non-peptidic
+            description: >
+                The type of antigen before processing by the immune system.
+            example: protein
+        antigen:
+            $ref: '#/Ontology'
+            nullable: false
+            description: >
+                The substance against which the receptor was tested. This can be any substance that
+                stimulates an adaptive immune response in the host, either through antibody production
+                or by T cell activation after presentation via an MHC molecule.
+            title: Antigen
+            example:
+                id: UNIPROT:P19597
+                label: Circumsporozoite protein
+            x-airr:
+                adc-query-support: true
+                format: ontology
+        antigen_source_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The species from which the antigen was isolated
+            title: Source species of antigen
+            example:
+                id: NCBITAXON:5843
+                label: Plasmodium falciparum NF54
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: NCBITAXON:1
+                        label: root
+        peptide_start:
+            type: integer
+            nullable: true
+            description: Start position of the peptide within the reference protein sequence
+        peptide_end:
+            type: integer
+            nullable: true
+            description: End position of the peptide within the reference protein sequence
+        mhc_class:
+            type: string
+            nullable: true
+            enum:
+                - MHC-I
+                - MHC-II
+                - MHC-nonclassical
+                - null
+            description: Class of MHC molecule, only present for MHC:x ligand types
+            example: MHC-II
+        mhc_gene_1:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The MHC gene to which the mhc_allele_1 belongs
+            title: MHC gene 1
+            example:
+                id: MRO:0000055
+                label: HLA-DRA
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: MRO:0000004
+                        label: MHC gene
+        mhc_allele_1:
+            type: string
+            nullable: true
+            description: Allele designation of the MHC alpha chain
+            example: HLA-DRA
+        mhc_gene_2:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The MHC gene to which the mhc_allele_2 belongs
+            title: MHC gene 2
+            example:
+                id: MRO:0000057
+                label: HLA-DRB1
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: MRO:0000004
+                        label: MHC gene
+        mhc_allele_2:
+            type: string
+            nullable: true
+            description: >
+                Allele designation of the MHC class II beta chain or the invariant beta2-microglobulin chain
+            example: HLA-DRB1*04:01
+        reactivity_method:
+            type: string
+            nullable: false
+            enum:
+                - SPR
+                - ITC
+                - ELISA
+                - cytometry
+                - biological_activity
+            description: The methodology used to assess reactivity (assay implemented in experiment)
+        reactivity_readout:
+            type: string
+            nullable: false
+            enum:
+                - binding_strength
+                - cytokine_release
+                - dissociation_constant_kd
+                - on_rate
+                - off_rate
+                - pathogen_inhibition
+            description: Reactivity measurement read-out
+            example: cytokine_release
+        reactivity_value:
+            type: number
+            nullable: false
+            description: The absolute (processed) value of the measurement
+            example: 162.26
+        reactivity_unit:
+            type: string
+            nullable: false
+            description: The unit of the measurement
+            example: pg/ml
diff --git a/lang/python/airr/specs/airr-schema-openapi3.yaml b/lang/python/airr/specs/airr-schema-openapi3.yaml
new file mode 100644
index 000000000..bba3a45d8
--- /dev/null
+++ b/lang/python/airr/specs/airr-schema-openapi3.yaml
@@ -0,0 +1,5091 @@
+#
+# Schema definitions for AIRR standards objects
+#
+Info:
+    title: AIRR Schema
+    description: Schema definitions for AIRR standards objects
+    version: "1.4"  # quoted: a bare 1.4 loads as a float, but InfoObject declares version as a string
+    contact:
+        name: AIRR Community
+        url: https://github.com/airr-community
+    license:
+        name: Creative Commons Attribution 4.0 International
+        url: https://creativecommons.org/licenses/by/4.0/
+
+
+# Properties that are based upon an ontology use this
+# standard schema definition
+Ontology:
+    type: object
+    properties:
+        id:
+            description: CURIE of the concept, encoding the ontology and the local ID
+            type: string
+            nullable: true
+        label:
+            description: Label of the concept in the respective ontology
+            type: string
+            nullable: true
+
+# Map to expand CURIE prefixes to full IRIs
+CURIEMap:  # per prefix: type, default map (and optional provider), and map name -> iri_prefix
+    ABREG:
+        type: identifier
+        default:
+            map: ABREG
+        map:
+            ABREG:
+                iri_prefix: "http://antibodyregistry.org/AB_"
+    CHEBI:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/CHEBI_"
+    CL:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/CL_"
+    DOI:
+        type: identifier
+        default:
+            map: DOI
+        map:
+            DOI:
+                iri_prefix: "https://doi.org/"
+    DOID:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/DOID_"
+    ENA:
+        type: identifier
+        default:
+            map: ENA
+        map:
+            ENA:
+                iri_prefix: "https://www.ebi.ac.uk/ena/browser/view/"
+    ENSG:
+        type: identifier
+        default:
+            map: ENSG
+        map:
+            ENSG:
+                iri_prefix: "https://www.ensembl.org/Multi/Search/Results?q="
+    GAZ:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/GAZ_"
+    IEDB_RECEPTOR:
+        type: identifier
+        default:
+            map: IEDB
+            provider: IEDB
+        map:
+            IEDB:
+                iri_prefix: "https://www.iedb.org/receptor/"
+    MRO:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/MRO_"
+    NCBITAXON:
+        type: taxonomy
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/NCBITaxon_"
+            BioPortal:
+                iri_prefix: "http://purl.bioontology.org/ontology/NCBITAXON/"
+    NCIT:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/NCIT_"
+    ORCID:
+        type: catalog
+        default:
+            map: ORCID
+            provider: ORCID
+        map:
+            ORCID:
+                iri_prefix: "https://orcid.org/"
+    ROR:
+        type: catalog
+        default:
+            map: ROR
+            provider: ROR
+        map:
+            ROR:
+                iri_prefix: "https://ror.org/"
+    SRA:
+        type: identifier
+        default:
+            map: SRA
+        map:
+            SRA:
+                iri_prefix: "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run="
+    UBERON:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/UBERON_"
+    UNIPROT:
+        type: identifier
+        default:
+            map: UniProt  # must name a key of the sibling "map" (keys are case-sensitive)
+        map:
+            UniProt:
+                iri_prefix: "http://purl.uniprot.org/uniprot/"
+    UO:
+        type: ontology
+        default:
+            map: OBO
+            provider: OLS
+        map:
+            OBO:
+                iri_prefix: "http://purl.obolibrary.org/obo/UO_"
+
+InformationProvider:
+    provider:
+        ENA:
+            request:
+                response: text/html
+                url: '{iri}'
+        IEDB:
+            request:
+                response: application/json
+                url: 'https://query-api.iedb.org/tcr_search?receptor_group_id=eq.{local_id}'
+        OLS:
+            request:
+                response: application/json
+                url: 'https://www.ebi.ac.uk/ols/api/ontologies/{ontology_id}/terms?iri={iri}'
+        Ontobee:
+            request:
+                response: application/rdf+xml
+                url: 'http://www.ontobee.org/ontology/rdf/{ontology_id}?iri={iri}'
+        ORCID:
+            request:
+                response: application/json
+                header:
+                    Accept: application/json
+                url: 'https://pub.orcid.org/v2.1/{local_id}'
+        ROR:
+            request:
+                response: application/json
+                url: 'https://api.ror.org/organizations/{iri}'
+        SRA:
+            request:
+                response: text/html
+                url: '{iri}'
+    parameter:
+        CHEBI:
+            Ontobee:
+                ontology_id: CHEBI
+            OLS:
+                ontology_id: chebi
+        CL:
+            Ontobee:
+                ontology_id: CL
+            OLS:
+                ontology_id: cl
+        DOID:
+            Ontobee:
+                ontology_id: DOID
+            OLS:
+                ontology_id: doid
+        GAZ:
+            Ontobee:
+                ontology_id: GAZ
+            OLS:
+                ontology_id: gaz
+        MRO:
+            Ontobee:
+                ontology_id: MRO
+            OLS:
+                ontology_id: mro
+        NCBITAXON:
+            Ontobee:
+                ontology_id: NCBITaxon
+            OLS:
+                ontology_id: ncbitaxon
+            BioPortal:
+                ontology_id: NCBITAXON
+        NCIT:
+            Ontobee:
+                ontology_id: NCIT
+            OLS:
+                ontology_id: ncit
+        UBERON:
+            Ontobee:
+                ontology_id: UBERON
+            OLS:
+                ontology_id: uberon
+        UO:
+            Ontobee:
+                ontology_id: UO
+            OLS:
+                ontology_id: uo
+
+# AIRR specification extensions
+#
+# The schema definitions for AIRR standards objects are extended to
+# provide a number of AIRR specific attributes. This schema definition
+# specifies the structure, property names and data types. These
+# attributes are attached to an AIRR field with the x-airr property.
+
+Attributes:
+    type: object
+    properties:
+        miairr:
+            type: string
+            description: MiAIRR requirement level.
+            enum:
+                - essential
+                - important
+                - defined
+            default: defined
+        identifier:
+            type: boolean
+            description: >
+                True if the field is an identifier required to link metadata and/or individual
+                sequence records across objects in the complete AIRR Data Model and ADC API.
+            default: false
+        adc-query-support:
+            type: boolean
+            description: >
+                True if an ADC API implementation must support queries on the field.
+                If false, query support for the field in ADC API implementations is optional.
+            default: false
+        adc-api-optional:
+            type: boolean
+            description: >
+                If false, repositories must implement these fields both for queries and query response.
+                Only applies to fields in the ADC API spec that are extensions to the AIRR Standard,
+                targeted at "convenience query fields" that make queries against repositories more
+                efficient than if queries were limited to AIRR fields only.
+                If true, repositories can choose to support the field or not.
+            default: false
+        deprecated:
+            type: boolean
+            description: True if the field has been deprecated from the schema.
+            default: false
+        deprecated-description:
+            type: string
+            description: Information regarding the deprecation of the field.
+        deprecated-replaced-by:
+            type: array
+            items:
+                type: string
+            description: The deprecated field is replaced by this list of fields.
+        set:
+            type: integer
+            description: MiAIRR set
+        subset:
+            type: string
+            description: MiAIRR subset
+        name:
+            type: string
+            description: MiAIRR name
+        format:
+            type: string
+            description: Field format. If null then assume the full range of the field data type
+            enum:
+                - ontology
+                - controlled_vocabulary
+                - physical_quantity
+                - CURIE
+        ontology:
+            type: object
+            description: Ontology definition for field
+            properties:
+                draft:
+                    type: boolean
+                    description: Indicates if ontology definition is a draft
+                top_node:
+                    type: object
+                    description: >
+                        Concept to use as top node for ontology. Note that this must have the same CURIE namespace
+                        as the actually annotated concept.
+                    properties:
+                        id:
+                            type: string
+                            description: CURIE for the top node term
+                        label:
+                            type: string
+                            description: Ontology name for the top node term
+
+# AIRR Data File
+#
+# A JSON data file that holds Repertoire metadata, data processing
+# analysis objects, or any object in the AIRR Data Model.
+#
+# It is presumed that the objects gathered together in an AIRR Data File are related
+# or relevant to each other, e.g. part of the same study; thus, the ID fields can be
+# internally resolved unless the ID contains an external PID. This implies that AIRR
+# Data Files cannot be merged simply by concatenating arrays; any merge program
+# would need to manage duplicate or conflicting ID values.
+#
+# While the properties in an AIRR Data File are not required, if one is provided then
+# the value should not be null.
+
+DataFile:
+    type: object
+    properties:
+        Info:
+            $ref: "#/InfoObject"
+            nullable: false
+        Repertoire:
+            description: List of repertoires
+            type: array
+            nullable: false
+            items:
+                $ref: "#/Repertoire"
+        RepertoireGroup:
+            description: List of repertoire collections
+            type: array
+            nullable: false
+            items:
+                $ref: "#/RepertoireGroup"
+        Rearrangement:
+            description: List of rearrangement records
+            type: array
+            nullable: false
+            items:
+                $ref: "#/Rearrangement"
+        Cell:
+            description: List of cells
+            type: array
+            nullable: false
+            items:
+                $ref: "#/Cell"
+        Clone:
+            description: List of clones
+            type: array
+            nullable: false
+            items:
+                $ref: "#/Clone"
+        GermlineSet:
+            description: List of germline sets
+            type: array
+            nullable: false
+            items:
+                $ref: "#/GermlineSet"
+        GenotypeSet:
+            description: List of genotype sets
+            type: array
+            nullable: false
+            items:
+                $ref: "#/GenotypeSet"
+
+# AIRR Info object, should be similar to openapi
+# should we point to an openapi schema?
+InfoObject:
+    description: Provides information about data and API responses.
+    type: object
+    required:
+        - title
+        - version
+    properties:
+        title:
+            nullable: false
+            type: string
+        version:
+            nullable: false
+            type: string
+        description:
+            nullable: true
+            type: string
+        contact:
+            nullable: true
+            type: object
+            properties:
+                name:
+                    nullable: true
+                    type: string
+                url:
+                    nullable: true
+                    type: string
+                email:
+                    nullable: true
+                    type: string
+        license:
+            nullable: true
+            type: object
+            required:
+                - name
+            properties:
+                name:
+                    nullable: false
+                    type: string
+                url:
+                    nullable: true
+                    type: string
+
+# A time point
+TimePoint:
+    type: object
+    description: Time point at which an observation or other action was performed.
+    properties:
+        label:
+            description: Informative label for the time point
+            type: string
+            nullable: true
+            example: Pre-operative sampling of cancer tissue
+            x-airr:
+                adc-query-support: true
+        value:
+            description: Value of the time point
+            type: number
+            nullable: true
+            example: -5.0
+            x-airr:
+                adc-query-support: true
+        unit:
+            $ref: "#/Ontology"
+            title: Unit of immunization schedule
+            description: Unit of the time point
+            nullable: true
+            example:
+                id: UO:0000033
+                label: day
+            x-airr:
+                format: ontology
+                adc-query-support: true
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+
+#
+# General objects
+#
+
+# An individual
+Acknowledgement:
+    type: object
+    description: Individual whose contribution to this work should be acknowledged
+    required:
+        - acknowledgement_id
+        - name
+        - institution_name
+    properties:
+        acknowledgement_id:
+            description: unique identifier of this Acknowledgement within the file
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                identifier: true
+        name:
+            description: Full name of individual
+            type: string
+            nullable: true
+        institution_name:
+            description: Individual's department and institution name
+            type: string
+            nullable: true
+        orcid_id:
+            description: Individual's ORCID identifier
+            type: string
+            nullable: true
+
+#
+# Germline gene schema
+#
+
+# Rearranged and genomic germline sequences
+RearrangedSequence:
+    type: object
+    description: >
+        Details of a directly observed rearranged sequence or an inference from rearranged sequences
+        contributing support for a gene or allele.
+    required:  # every name listed here must match a key under "properties"
+        - sequence_id
+        - sequence
+        - derivation
+        - observation_type
+        - repository_name
+        - repository_ref
+        - deposited_version
+        - sequence_start
+        - sequence_end
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of this RearrangedSequence within the file, typically generated by the repository
+                hosting the schema, for example from the underlying ID of the database record.
+            x-airr:
+                identifier: true
+                miairr: important
+        sequence:
+            type: string
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: nucleotide sequence
+        derivation:
+            type: string
+            nullable: true
+            enum:
+                - DNA
+                - RNA
+                - null
+            description: The class of nucleic acid that was used as primary starting material
+            x-airr:
+                miairr: important
+        observation_type:
+            type: string
+            nullable: false
+            enum:
+                - direct_sequencing
+                - inference_from_repertoire
+            description: >
+                The type of observation from which this sequence was drawn, such as direct sequencing or
+                inference from repertoire sequencing data.
+            x-airr:
+                miairr: essential
+        curation:
+            type: string
+            nullable: true
+            description: Curational notes on the sequence
+        repository_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Name of the repository in which the sequence has been deposited
+        repository_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Queryable id or accession number of the sequence published by the repository
+        deposited_version:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Version number of the sequence within the repository
+        sequence_start:
+            type: integer
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: Start co-ordinate of the sequence detailed in this record, within the sequence deposited
+        sequence_end:
+            type: integer
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: End co-ordinate of the sequence detailed in this record, within the sequence deposited
+
+UnrearrangedSequence:
+    description: Details of an unrearranged sequence contributing support for a gene or allele
+    type: object
+    required:  # every name listed here must match a key under "properties"
+        - sequence_id
+        - sequence
+        - repository_name
+        - repository_ref  # NOTE(review): replaces "assembly_id", which is not a defined property; confirm
+        - gff_seqid
+        - gff_start
+        - gff_end
+        - strand
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: unique identifier of this UnrearrangedSequence within the file
+        sequence:
+            type: string
+            nullable: false
+            description: >
+                Sequence of interest described in this record. Typically, this will include gene and promoter region.
+            x-airr:
+                miairr: essential
+        curation:
+            type: string
+            nullable: true
+            description: Curational notes on the sequence
+        repository_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Name of the repository in which the assembly or contig is deposited
+        repository_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Queryable id or accession number of the sequence published by the repository
+        patch_no:
+            type: string
+            nullable: true
+            description: Genome assembly patch number in which this gene was determined
+        gff_seqid:
+            type: string
+            nullable: true
+            description: >
+                Sequence (from the assembly) of a window including the gene and preferably also the promoter region.
+        gff_start:
+            type: integer
+            nullable: true
+            description: >
+                Genomic co-ordinates of the start of the sequence of interest described in this record in
+                Ensembl GFF version 3.
+        gff_end:
+            type: integer
+            nullable: true
+            description: >
+                Genomic co-ordinates of the end of the sequence of interest described in this record in
+                Ensembl GFF version 3.
+        strand:
+            type: string
+            nullable: true
+            enum:
+                - "+"
+                - "-"
+                - null
+            description: sense (+ or -)
+
+# V gene delineation
+SequenceDelineationV:
+    description: Delineation of a V-gene in a particular system
+    type: object
+    required:
+        - sequence_delineation_id
+        - delineation_scheme
+        - fwr1_start
+        - fwr1_end
+        - cdr1_start
+        - cdr1_end
+        - fwr2_start
+        - fwr2_end
+        - cdr2_start
+        - cdr2_end
+        - fwr3_start
+        - fwr3_end
+        - cdr3_start
+    properties:
+        sequence_delineation_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of this SequenceDelineationV within the file. Typically, generated by the
+                repository hosting the record.
+            x-airr:
+                identifier: true
+                miairr: important
+
+        delineation_scheme:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Name of the delineation scheme
+            example: Chothia
+        unaligned_sequence:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: entire V-sequence covered by this delineation
+        aligned_sequence:
+            type: string
+            nullable: true
+            description: >
+                Aligned sequence if this delineation provides an alignment. An aligned sequence should always be
+                provided for IMGT delineations.
+        fwr1_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR1 start co-ordinate in the 'unaligned sequence' field
+        fwr1_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR1 end co-ordinate in the 'unaligned sequence' field
+        cdr1_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR1 start co-ordinate in the 'unaligned sequence' field
+        cdr1_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR1 end co-ordinate in the 'unaligned sequence' field
+        fwr2_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR2 start co-ordinate in the 'unaligned sequence' field
+        fwr2_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR2 end co-ordinate in the 'unaligned sequence' field
+        cdr2_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR2 start co-ordinate in the 'unaligned sequence' field
+        cdr2_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR2 end co-ordinate in the 'unaligned sequence' field
+        fwr3_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR3 start co-ordinate in the 'unaligned sequence' field
+        fwr3_end:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: FWR3 end co-ordinate in the 'unaligned sequence' field
+        cdr3_start:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: CDR3 start co-ordinate in the 'unaligned sequence' field
+        alignment_labels:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: >
+                One string for each codon in the aligned_sequence indicating the label of that codon according to
+                the numbering of the delineation scheme if it provides one.
+
+# Description of a putative or confirmed Ig receptor gene/allele
+AlleleDescription:
+    description: Details of a putative or confirmed Ig receptor gene/allele inferred from one or more observations
+    type: object
+    required:
+        - allele_description_id
+        - maintainer
+        - lab_address
+        - release_version
+        - release_date
+        - release_description
+        - sequence
+        - coding_sequence
+        - locus
+        - sequence_type
+        - functional
+        - inference_type
+        - species
+    properties:
+        allele_description_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                Unique identifier of this AlleleDescription within the file. Typically, generated by the 
+                repository hosting the record.
+        allele_description_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Unique reference to the allele description, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:IGHV1-69*01.001
+        maintainer:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Maintainer of this sequence record
+        acknowledgements:
+            type: array
+            nullable: true
+            description: List of individuals whose contribution to the gene description should be acknowledged
+            items:
+                $ref: '#/Acknowledgement'
+        lab_address:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: defined
+            description: Institution and full address of corresponding author
+        release_version:
+            type: integer
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Version number of this record, updated whenever a revised version is published or released
+        release_date:
+            type: string
+            nullable: true
+            format: date-time
+            x-airr:
+                miairr: important
+            description: Date of this release
+            title: Release Date
+            example: "2021-02-02"
+        release_description:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Brief descriptive notes of the reason for this release and the changes embodied
+        label:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: >
+                The accepted name for this gene or allele following the relevant nomenclature.
+                The value in this field should correspond to values in acceptable name fields of other schemas, 
+                such as v_call, d_call, and j_call fields.
+            example: IGHV1-69*01
+        sequence:
+            type: string
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: >
+                Nucleotide sequence of the gene. This should cover the full length that is available, 
+                including where possible RSS, and 5' UTR and lead-in for V-gene sequences.
+        coding_sequence:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: >
+                Nucleotide sequence of the core coding region, such as the coding region of a D-, J- or C- gene 
+                or the coding region of a V-gene excluding the leader.
+        aliases:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: Alternative names for this sequence
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRG
+                - TRD
+            description: Gene locus
+            x-airr:
+                miairr: essential
+        chromosome:
+            type: integer
+            nullable: true
+            description: chromosome on which the gene is located
+        sequence_type:
+            type: string
+            nullable: false
+            enum:
+                - V
+                - D
+                - J
+                - C
+            description: Sequence type (V, D, J, C)
+            x-airr:
+                miairr: essential
+        functional:
+            type: boolean
+            nullable: true
+            x-airr:
+                miairr: important
+            description: True if the gene is functional, false if it is a pseudogene
+        inference_type:
+            type: string
+            nullable: true
+            enum:
+                - genomic_and_rearranged
+                - genomic_only
+                - rearranged_only
+                - null
+            description: Type of inference(s) from which this gene sequence was inferred
+            x-airr:
+                miairr: important
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: essential
+        species_subgroup:
+            type: string
+            nullable: true
+            description: Race, strain or other species subgroup to which this subject belongs
+            example: BALB/c
+        species_subgroup_type:
+            type: string
+            nullable: true
+            enum:
+                - breed
+                - strain
+                - inbred
+                - outbred
+                - locational
+                - null
+        status:
+            type: string
+            nullable: true
+            enum:
+                - active
+                - draft
+                - retired
+                - withdrawn
+                - null
+            description: Status of record, assumed active if the field is not present
+        subgroup_designation:
+            type: string
+            nullable: true
+            description: Identifier of the gene subgroup or clade, as (and if) defined
+        gene_designation:
+            type: string
+            nullable: true
+            description: Gene number or other identifier, as (and if) defined
+        allele_designation:
+            type: string
+            nullable: true
+            description: Allele number or other identifier, as (and if) defined
+        allele_similarity_cluster_designation:
+            type: string
+            nullable: true
+            description: ID of the similarity cluster used in this germline set, if designated
+        allele_similarity_cluster_member_id:
+            type: string
+            nullable: true
+            description: Membership ID of the allele within the similarity cluster, if a cluster is designated
+        j_codon_frame:
+            type: integer
+            nullable: true
+            enum:
+                - 1
+                - 2
+                - 3
+                - null
+            description: >
+                Codon position of the first nucleotide in the 'coding_sequence' field. Mandatory for J genes. 
+                Not used for V or D genes. '1' means the sequence is in-frame, '2' means that the first bp is 
+                missing from the first codon, and '3' means that the first 2 bp are missing.
+        gene_start:
+            type: integer
+            nullable: true
+            description: >
+                Co-ordinate in the sequence field of the first nucleotide in the coding_sequence field.
+            x-airr:
+                miairr: important
+        gene_end:
+            type: integer
+            nullable: true
+            description: >
+                Co-ordinate in the sequence field of the last gene-coding nucleotide in the coding_sequence field.
+            x-airr:
+                miairr: important
+        utr_5_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 5 prime UTR (V-genes only).
+        utr_5_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 5 prime UTR (V-genes only).
+        leader_1_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of L-PART1 (V-genes only).
+        leader_1_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of L-PART1 (V-genes only).
+        leader_2_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of L-PART2 (V-genes only).
+        leader_2_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of L-PART2 (V-genes only).
+        v_rs_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the V recombination site (V-genes only).
+        v_rs_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the V recombination site (V-genes only).
+        d_rs_3_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only).
+        d_rs_3_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 3 prime D recombination site (D-genes only).
+        d_rs_5_prime_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only).
+        d_rs_5_prime_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of the 5 prime D recombination site (D-genes only).
+        j_cdr3_end:
+            type: integer
+            nullable: true
+            description: >
+                In the case of a J-gene, the co-ordinate in the sequence field of the first nucleotide of the
+                conserved PHE or TRP (IMGT codon position 118).
+        j_rs_start:
+            type: integer
+            nullable: true
+            description: Start co-ordinate in the sequence field of J recombination site (J-genes only).
+        j_rs_end:
+            type: integer
+            nullable: true
+            description: End co-ordinate in the sequence field of J recombination site (J-genes only).
+        j_donor_splice:
+            type: integer
+            nullable: true
+            description: Co-ordinate in the sequence field of the final 3' nucleotide of the J-REGION (J-genes only).
+        v_gene_delineations:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/SequenceDelineationV'
+        unrearranged_support:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/UnrearrangedSequence'
+        rearranged_support:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/RearrangedSequence'
+        paralogs:
+            type: array
+            nullable: true
+            items:
+                type: string
+            description: Gene symbols of any paralogs
+        curation:
+            type: string
+            nullable: true
+            description: >
+                Curational notes on the AlleleDescription. This can be used to give more extensive notes on the 
+                decisions taken than are provided in the release_description.
+        curational_tags:
+            type: array
+            nullable: true
+            items:
+                type: string
+                enum:
+                    - likely_truncated
+                    - likely_full_length
+            description: Controlled-vocabulary tags applied to this description
+
+# GermlineSet: a released, versioned collection of AlleleDescriptions covering a single locus
+GermlineSet:
+    type: object
+    description: >
+        A germline object set bringing together multiple AlleleDescriptions from the same strain or species.
+        All genes in a GermlineSet should be from a single locus.
+    required:  # keys that must be present; properties marked nullable may still hold null
+        - germline_set_id
+        - author
+        - lab_name
+        - lab_address
+        - release_version
+        - release_description
+        - release_date
+        - germline_set_name
+        - germline_set_ref
+        - species
+        - locus
+        - allele_descriptions
+    properties:
+        germline_set_id:
+            type: string
+            nullable: true
+            description: >
+                Unique identifier of the GermlineSet within this file. Typically, generated by the
+                repository hosting the record.
+            x-airr:
+                identifier: true
+                miairr: important
+        author:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Corresponding author
+        lab_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Department of corresponding author
+        lab_address:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Institutional address of corresponding author
+        acknowledgements:
+            type: array
+            nullable: true
+            description: List of individuals whose contribution to the germline set should be acknowledged
+            items:
+                $ref: '#/Acknowledgement'
+        release_version:
+            type: number
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Version number of this record, allocated automatically
+        release_description:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Brief descriptive notes of the reason for this release and the changes embodied
+        release_date:
+            type: string
+            nullable: true
+            format: date-time
+            x-airr:
+                miairr: important
+            description: Date of this release
+            title: Release Date
+            example: "2021-02-02T00:00:00Z"  # full RFC 3339 timestamp, as format: date-time requires
+        germline_set_name:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: descriptive name of this germline set
+        germline_set_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+        pub_ids:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: Publications describing the germline set
+            example: ["PMID:35720344"]
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            x-airr:
+                miairr: essential
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+        species_subgroup:
+            type: string
+            nullable: true
+            description: Race, strain or other species subgroup to which this subject belongs
+            example: BALB/c
+        species_subgroup_type:
+            type: string
+            nullable: true
+            enum:
+                - breed
+                - strain
+                - inbred
+                - outbred
+                - locational
+                - null
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRG
+                - TRD
+            description: Gene locus
+            x-airr:
+                miairr: essential
+        allele_descriptions:
+            type: array
+            nullable: true
+            items:
+                $ref: '#/AlleleDescription'
+            description: list of allele_descriptions in the germline set
+            x-airr:
+                miairr: important
+        curation:
+            type: string
+            nullable: true
+            description: >
+                Curational notes on the GermlineSet. This can be used to give more extensive notes on the
+                decisions taken than are provided in the release_description.
+
+#
+# Genotype schema
+#
+
+# GenotypeSet lists the Genotypes (describing different loci) inferred for this subject
+
+GenotypeSet:  # container for the per-locus Genotypes inferred for one subject
+    type: object
+    required:
+        - receptor_genotype_set_id
+    properties:
+        receptor_genotype_set_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                A unique identifier for this Receptor Genotype Set, typically generated by the repository
+                hosting the schema, for example from the underlying ID of the database record.
+        genotype_class_list:
+            type: array
+            nullable: true
+            description: List of Genotypes included in this Receptor Genotype Set.
+            items:
+                $ref: '#/Genotype'
+
+# Genotype of adaptive immune receptors
+# This enumerates the alleles and gene deletions inferred in a single subject.
+# Included alleles may either be listed by reference to a GermlineSet, or
+# listed as 'undocumented', in which case the inferred sequence is provided
+
+Genotype:
+    type: object
+    required:  # both keys must be present; receptor_genotype_id is nullable, locus is not
+        - receptor_genotype_id
+        - locus
+    properties:
+        receptor_genotype_id:
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                miairr: important
+            description: >
+                A unique identifier within the file for this Receptor Genotype, typically generated by the
+                repository hosting the schema, for example from the underlying ID of the database record.
+        locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+            description: Gene locus
+            example: IGH
+            x-airr:
+                adc-query-support: true
+                format: controlled_vocabulary
+                miairr: essential
+        documented_alleles:
+            type: array
+            nullable: true
+            description: List of alleles documented in reference set(s)
+            items:
+                $ref: '#/DocumentedAllele'
+            x-airr:
+                miairr: important
+        undocumented_alleles:
+            type: array
+            nullable: true
+            description: List of alleles inferred to be present and not documented in an identified GermlineSet
+            items:
+                $ref: '#/UndocumentedAllele'
+            x-airr:
+                adc-query-support: true
+        deleted_genes:
+            type: array
+            nullable: true
+            description: Array of genes identified as being deleted in this genotype
+            items:
+                $ref: '#/DeletedGene'
+            x-airr:
+                adc-query-support: true
+        inference_process:
+            type: string
+            nullable: true
+            enum:
+                - genomic_sequencing
+                - repertoire_sequencing
+                - null
+            description: Information on how the genotype was acquired. Controlled vocabulary.
+            title: Genotype acquisition process
+            example: repertoire_sequencing
+            x-airr:
+                adc-query-support: true
+                format: controlled_vocabulary
+
+# Documented Allele
+# This describes a 'known' allele found in a genotype
+# It is 'known' in the sense that it is documented in a reference set
+
+DocumentedAllele:  # an allele in a genotype, identified by its label within a referenced GermlineSet
+    type: object
+    required:
+        - label
+        - germline_set_ref
+    properties:
+        label:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: The accepted name for this allele, taken from the GermlineSet
+        germline_set_ref:
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+            description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+# Undocumented Allele
+# This describes an 'undocumented' allele found in a genotype
+# It is 'undocumented' in the sense that it was not found in reference sets consulted for the analysis
+
+UndocumentedAllele:  # an inferred allele carried with its own sequence rather than a GermlineSet reference
+    type: object
+    required:
+        - allele_name
+        - sequence
+    properties:
+        allele_name:
+            type: string
+            nullable: true
+            description: Allele name as allocated by the inference pipeline
+            x-airr:
+                miairr: important
+        sequence:
+            type: string
+            nullable: false
+            description: nt sequence of the allele, as provided by the inference pipeline
+            x-airr:
+                miairr: essential
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+# Deleted Gene
+# It is regarded as 'deleted' in the sense that it was not identified during inference of the genotype
+
+DeletedGene:  # a gene from a referenced GermlineSet that was not identified in this genotype
+    type: object
+    required:
+        - label
+        - germline_set_ref
+    properties:
+        label:
+            type: string
+            nullable: false
+            description: The accepted name for this gene, taken from the GermlineSet
+            x-airr:
+                miairr: essential
+        germline_set_ref:
+            type: string
+            nullable: true
+            description: GermlineSet from which it was taken, referenced in standardized form (Repo:Label:Version)
+            x-airr:
+                miairr: important
+        phasing:
+            type: integer
+            nullable: true
+            description: >
+                Chromosomal phasing indicator. Alleles with the same value are inferred to be located on the
+                same chromosome.
+
+
+# MHCGenotypeSet: groups the MHCGenotype records that together describe one subject
+MHCGenotypeSet:
+    type: object
+    required:
+        - mhc_genotype_set_id
+        - mhc_genotype_list
+    properties:
+        mhc_genotype_set_id:
+            description: A unique identifier for this MHCGenotypeSet
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                identifier: true
+        mhc_genotype_list:
+            type: array
+            items:
+                $ref: '#/MHCGenotype'
+            nullable: true
+            description: List of MHCGenotypes included in this set
+            x-airr:
+                miairr: important
+
+# MHCGenotype: the alleles of one MHC class (class I, class II or non-classical) found in an individual
+MHCGenotype:
+    type: object
+    required:
+        - mhc_genotype_id
+        - mhc_class
+        - mhc_alleles
+    properties:
+        mhc_genotype_id:
+            description: A unique identifier for this MHCGenotype, assumed to be unique in the context of the study
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                identifier: true
+        mhc_class:
+            description: Class of MHC alleles described by the MHCGenotype
+            example: MHC-I
+            type: string
+            nullable: false
+            enum:
+                - MHC-I
+                - MHC-II
+                - MHC-nonclassical
+            x-airr:
+                adc-query-support: true
+                format: controlled_vocabulary
+                miairr: essential
+        mhc_alleles:
+            description: List of MHC alleles of the indicated mhc_class identified in an individual
+            type: array
+            items:
+                $ref: '#/MHCAllele'
+            nullable: true
+            x-airr:
+                adc-query-support: true
+                miairr: important
+        mhc_genotyping_method:
+            title: MHC genotyping method
+            description: >
+                Information on how the genotype was determined. The content of this field should come from a list of
+                recommended terms provided in the AIRR Schema documentation.
+            example: pcr_low_resolution
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+                adc-query-support: true
+
+
+# MHCAllele: one allele of an MHC gene, with the gene given as an ontology term
+MHCAllele:
+    type: object
+    properties:
+        allele_designation:
+            description: >
+                The accepted designation of an allele, usually its gene symbol plus allele/sub-allele/etc
+                identifiers, if provided by the mhc_typing method
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+        gene:
+            title: MHC gene
+            description: The MHC gene to which the described allele belongs
+            $ref: '#/Ontology'
+            nullable: true
+            example:
+                label: HLA-A
+                id: MRO:0000046
+            x-airr:
+                miairr: important
+                adc-query-support: false
+                format: ontology
+                ontology:
+                    top_node:
+                        label: MHC gene
+                        id: MRO:0000004
+                    draft: true
+        reference_set_ref:
+            description: Repository and list from which it was taken (issuer/name/version)
+            type: string
+            nullable: true
+            x-airr:
+                miairr: important
+
+
+SubjectGenotype:  # pairs a subject's immune receptor genotype set with their MHC genotype set
+    type: object
+    properties:
+        receptor_genotype_set:
+            $ref: '#/GenotypeSet'
+            description: Immune receptor genotype set for this subject.
+            nullable: true
+        mhc_genotype_set:
+            $ref: '#/MHCGenotypeSet'
+            description: MHC genotype set for this subject.
+            nullable: true
+
+#
+# Repertoire metadata schema
+#
+
+# Study: study-level metadata (identity, design, contacts, publications), keyed by a globally unique study_id
+Study:
+    type: object
+    required:  # keys that must be present; every one of them is nullable, so null is an accepted value
+        - study_id
+        - study_title
+        - study_type
+        - inclusion_exclusion_criteria
+        - grants
+        - collected_by
+        - lab_name
+        - lab_address
+        - submitted_by
+        - pub_ids
+        - keywords_study
+    properties:
+        study_id:
+            type: string
+            nullable: true
+            description: >
+                Unique ID assigned by study registry such as one of the International Nucleotide Sequence Database
+                Collaboration (INSDC) repositories.
+            title: Study ID
+            example: PRJNA001
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study ID
+        study_title:
+            type: string
+            nullable: true
+            description: Descriptive study title
+            title: Study title
+            example: Effects of sun light exposure of the Treg repertoire
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study title
+        study_type:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Type of study design
+            title: Study type
+            example:
+                id: NCIT:C15197
+                label: Case-Control Study
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study type
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCIT:C63536
+                        label: Study
+        study_description:
+            type: string
+            nullable: true
+            description: Generic study description
+            title: Study description
+            example: Longer description
+            x-airr:
+                name: Study description
+                adc-query-support: true
+        inclusion_exclusion_criteria:
+            type: string
+            nullable: true
+            description: List of criteria for inclusion/exclusion for the study
+            title: Study inclusion/exclusion criteria
+            example: "Include: Clinical P. falciparum infection; Exclude: Seropositive for HIV"
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Study inclusion/exclusion criteria
+        grants:
+            type: string
+            nullable: true
+            description: Funding agencies and grant numbers
+            title: Grant funding agency
+            example: NIH, award number R01GM987654
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Grant funding agency
+        study_contact:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the contact persons for this study. This should include an e-mail address
+                and a persistent identifier such as an ORCID ID.
+            title: Contact information (study)
+            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                adc-query-support: true
+                name: Contact information (study)
+        collected_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data collector, i.e. the person who is legally responsible for data
+                collection and release. This should include an e-mail address and a persistent identifier such as an
+                ORCID ID.
+            title: Contact information (data collection)
+            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contact information (data collection)
+        lab_name:
+            type: string
+            nullable: true
+            description: Department of data collector
+            title: Lab name
+            example: Department for Planar Immunology
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Lab name
+        lab_address:
+            type: string
+            nullable: true
+            description: Institution and institutional address of data collector
+            title: Lab address
+            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Lab address
+        submitted_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data depositor, i.e., the person submitting the data to a repository.
+                This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
+                supposed to be a short-lived and technical role until the submission is released.
+            title: Contact information (data deposition)
+            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contact information (data deposition)
+        pub_ids:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: >
+                Array of publications describing the rationale and/or outcome of the study as an array of CURIE objects such as
+                a DOI or Pubmed ID. Where more than one publication is given, if there is a primary publication for the study it
+                should come first.
+            title: Relevant publications
+            example: ["PMID:29144493", "DOI:10.1038/ni.3873"]
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Relevant publications
+        keywords_study:
+            type: array
+            items:
+                type: string
+                enum:
+                    - contains_ig
+                    - contains_tr
+                    - contains_paired_chain
+                    - contains_schema_rearrangement
+                    - contains_schema_clone
+                    - contains_schema_cell
+                    - contains_schema_receptor
+                    - contains_schema_cellexpression
+                    - contains_schema_receptorreactivity
+            nullable: true
+            description: >
+                Keywords describing properties of one or more data sets in a study. "contains_schema" keywords indicate that
+                the study contains data objects from the AIRR Schema of that type (Rearrangement, Clone, Cell, Receptor) while
+                the other keywords indicate that the study design considers the type of data indicated (e.g. it is possible to have
+                a study that "contains_paired_chain" but does not "contains_schema_cell").
+            title: Keywords for study
+            example:
+                - contains_ig
+                - contains_schema_rearrangement
+                - contains_schema_clone
+                - contains_schema_cell
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Keywords for study
+                format: controlled_vocabulary
+        adc_publish_date:
+            type: string
+            format: date-time
+            nullable: true
+            description: >
+                Date the study was first published in the AIRR Data Commons.
+            title: ADC Publish Date
+            example: "2021-02-02T00:00:00Z"  # full RFC 3339 timestamp, as format: date-time requires
+            x-airr:
+                adc-query-support: true
+                name: ADC Publish Date
+        adc_update_date:
+            type: string
+            format: date-time
+            nullable: true
+            description: >
+                Date the study data was updated in the AIRR Data Commons.
+            title: ADC Update Date
+            example: "2021-02-02T00:00:00Z"  # full RFC 3339 timestamp, as format: date-time requires
+            x-airr:
+                adc-query-support: true
+                name: ADC Update Date
+
+# 1-to-n relationship between a study and its subjects
+# subject_id is unique within a study
+Subject:
+    type: object
+    required:
+        - subject_id
+        - synthetic
+        - species
+        - sex
+        - age_min
+        - age_max
+        - age_unit
+        - age_event
+        - ancestry_population
+        - ethnicity
+        - race
+        - strain_name
+        - linked_subjects
+        - link_type
+    properties:
+        subject_id:
+            type: string
+            nullable: true
+            description: >
+                Subject ID assigned by submitter, unique within study. If possible, a persistent subject ID linked to
+                an INSDC or similar repository study should be used.
+            title: Subject ID
+            example: SUB856413
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Subject ID
+        synthetic:
+            type: boolean
+            nullable: false
+            description: TRUE for libraries in which the diversity has been synthetically generated (e.g. phage display)
+            title: Synthetic library
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Synthetic library
+        species:
+            $ref: '#/Ontology'
+            nullable: false
+            description: Binomial designation of subject's species
+            title: Organism
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        organism:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Binomial designation of subject's species
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was renamed to species for clarity.
+                deprecated-replaced-by:
+                    - species
+        sex:
+            type: string
+            enum:
+                - male
+                - female
+                - pooled
+                - hermaphrodite
+                - intersex
+                - null
+            nullable: true
+            description: Biological sex of subject
+            title: Sex
+            example: female
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Sex
+                format: controlled_vocabulary
+        age_min:
+            type: number
+            nullable: true
+            description: Specific age or lower boundary of age range.
+            title: Age minimum
+            example: 60
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age minimum
+        age_max:
+            type: number
+            nullable: true
+            description: >
+                Upper boundary of age range or equal to age_min for specific age.
+                This field should only be null if age_min is null.
+            title: Age maximum
+            example: 80
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age maximum
+        age_unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of age range
+            title: Age unit
+            example:
+                id: UO:0000036
+                label: year
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+        age_event:
+            type: string
+            nullable: true
+            description: >
+                Event in the study schedule to which `Age` refers. For NCBI BioSample this MUST be `sampling`. For other
+                implementations submitters need to be aware that there is currently no mechanism to encode the potential
+                delta between `Age event` and `Sample collection time`, hence the chosen events should be in temporal proximity.
+            title: Age event
+            example: enrollment
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Age event
+        age:
+            type: string
+            nullable: true
+            x-airr:
+                deprecated: true
+                deprecated-description: Split into two fields to specify as an age range.
+                deprecated-replaced-by:
+                    - age_min
+                    - age_max
+                    - age_unit
+        ancestry_population:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Broad geographic origin of ancestry (continent)
+            title: Ancestry population
+            example:
+                id: GAZ:00000459
+                label: South America
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Ancestry population
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        location_birth:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Self-reported location of birth of the subject, preferred granularity is country-level
+            example:
+                id: GAZ:00002939
+                label: Poland
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Location of birth
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        ethnicity:
+            type: string
+            nullable: true
+            description: Ethnic group of subject (defined as cultural/language-based membership)
+            title: Ethnicity
+            example: English, Kurds, Manchu, Yakuts (and other fields from Wikipedia)
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Ethnicity
+        race:
+            type: string
+            nullable: true
+            description: Racial group of subject (as defined by NIH)
+            title: Race
+            example: White, American Indian or Alaska Native, Black, Asian, Native Hawaiian or Other Pacific Islander, Other
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Race
+        strain_name:
+            type: string
+            nullable: true
+            description: Non-human designation of the strain or breed of animal used
+            title: Strain name
+            example: C57BL/6J
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Strain name
+        linked_subjects:
+            type: string
+            nullable: true
+            description: Subject ID to which `Relation type` refers
+            title: Relation to other subjects
+            example: SUB1355648
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Relation to other subjects
+        link_type:
+            type: string
+            nullable: true
+            description: Relation between subject and `linked_subjects`, can be genetic or environmental (e.g. exposure)
+            title: Relation type
+            example: father, daughter, household
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: subject
+                name: Relation type
+        diagnosis:
+            type: array
+            nullable: false
+            description: Diagnosis information for subject
+            items:
+                $ref: '#/Diagnosis'
+            x-airr:
+                adc-query-support: true
+        genotype:
+            nullable: true
+            $ref: '#/SubjectGenotype'
+            title: SubjectGenotype
+
+# 1-to-n relationship between a subject and its diagnoses
+Diagnosis:
+    type: object
+    required:
+        - study_group_description
+        - disease_diagnosis
+        - disease_length
+        - disease_stage
+        - prior_therapies
+        - immunogen
+        - intervention
+        - medical_history
+    properties:
+        study_group_description:
+            type: string
+            nullable: true
+            description: Designation of study arm to which the subject is assigned
+            title: Study group description
+            example: control
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Study group description
+        disease_diagnosis:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Diagnosis of subject
+            title: Diagnosis
+            example:
+                id: DOID:9538
+                label: multiple myeloma
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Diagnosis
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: DOID:4
+                        label: disease
+        disease_length:
+            type: string
+            nullable: true
+            description: Time duration between initial diagnosis and current intervention
+            title: Length of disease
+            example: 23 months
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Length of disease
+                format: physical_quantity
+        disease_stage:
+            type: string
+            nullable: true
+            description: Stage of disease at current intervention
+            title: Disease stage
+            example: Stage II
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Disease stage
+        prior_therapies:
+            type: string
+            nullable: true
+            description: List of all relevant previous therapies applied to subject for treatment of `Diagnosis`
+            title: Prior therapies for primary disease under study
+            example: melphalan/prednisone
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Prior therapies for primary disease under study
+        immunogen:
+            type: string
+            nullable: true
+            description: Antigen, vaccine or drug applied to subject at this intervention
+            title: Immunogen/agent
+            example: bortezomib
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Immunogen/agent
+        intervention:
+            type: string
+            nullable: true
+            description: Description of intervention
+            title: Intervention definition
+            example: systemic chemotherapy, 6 cycles, 1.25 mg/m2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Intervention definition
+        medical_history:
+            type: string
+            nullable: true
+            description: Medical history of subject that is relevant to assess the course of disease and/or treatment
+            title: Other relevant medical history
+            example: MGUS, first diagnosed 5 years prior
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 1
+                subset: diagnosis and intervention
+                name: Other relevant medical history
+
+# 1-to-n relationship between a subject and its samples
+# sample_id is unique within a study
+Sample:
+    type: object
+    required:
+        - sample_id
+        - sample_type
+        - tissue
+        - anatomic_site
+        - disease_state_sample
+        - collection_time_point_relative
+        - collection_time_point_relative_unit
+        - collection_time_point_reference
+        - biomaterial_provider
+    properties:
+        sample_id:
+            type: string
+            nullable: true
+            description: >
+                Sample ID assigned by submitter, unique within study. If possible, a persistent sample ID linked to
+                INSDC or similar repository study should be used.
+            title: Biological sample ID
+            example: SUP52415
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Biological sample ID
+        sample_type:
+            type: string
+            nullable: true
+            description: The way the sample was obtained, e.g. fine-needle aspirate, organ harvest, peripheral venous puncture
+            title: Sample type
+            example: Biopsy
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample type
+        tissue:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The actual tissue sampled, e.g. lymph node, liver, peripheral blood
+            title: Tissue
+            example:
+                id: UBERON:0002371
+                label: bone marrow
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Tissue
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UBERON:0010000
+                        label: multicellular anatomical structure
+        anatomic_site:
+            type: string
+            nullable: true
+            description: The anatomic location of the tissue, e.g. Inguinal, femur
+            title: Anatomic site
+            example: Iliac crest
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Anatomic site
+        disease_state_sample:
+            type: string
+            nullable: true
+            description: Histopathologic evaluation of the sample
+            title: Disease state of sample
+            example: Tumor infiltration
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Disease state of sample
+        collection_time_point_relative:
+            type: number
+            nullable: true
+            description: Time point at which sample was taken, relative to `Collection time event`
+            title: Sample collection time
+            example: 14
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample collection time
+        collection_time_point_relative_unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of Sample collection time
+            title: Sample collection time unit
+            example:
+                id: UO:0000033
+                label: day
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Sample collection time unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000003
+                        label: time unit
+        collection_time_point_reference:
+            type: string
+            nullable: true
+            description: Event in the study schedule to which `Sample collection time` relates
+            title: Collection time event
+            example: Primary vaccination
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Collection time event
+        collection_location:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Location where the sample was taken, preferred granularity is country-level
+            title: Sample collection location
+            example:
+                id: GAZ:00002939
+                label: Poland
+            x-airr:
+                miairr: important
+                set: 2
+                subset: sample
+                name: Sample collection location
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: GAZ:00000448
+                        label: geographic location
+        biomaterial_provider:
+            type: string
+            nullable: true
+            description: Name and address of the entity providing the sample
+            title: Biomaterial provider
+            example: Tissues-R-Us, Tampa, FL, USA
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 2
+                subset: sample
+                name: Biomaterial provider
+
+# 1-to-n relationship between a sample and processing of its cells
+CellProcessing:
+    type: object
+    required:
+        - tissue_processing
+        - cell_subset
+        - cell_phenotype
+        - single_cell
+        - cell_number
+        - cells_per_reaction
+        - cell_storage
+        - cell_quality
+        - cell_isolation
+        - cell_processing_protocol
+    properties:
+        tissue_processing:
+            type: string
+            nullable: true
+            description: Enzymatic digestion and/or physical methods used to isolate cells from sample
+            title: Tissue processing
+            example: Collagenase A/Dnase I digested, followed by Percoll gradient
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Tissue processing
+        cell_subset:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Commonly-used designation of isolated cell population
+            title: Cell subset
+            example:
+                id: CL:0000972
+                label: class switched memory B cell
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell subset
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: CL:0000542
+                        label: lymphocyte
+        cell_phenotype:
+            type: string
+            nullable: true
+            description: List of cellular markers and their expression levels used to isolate the cell population
+            title: Cell subset phenotype
+            example: CD19+ CD38+ CD27+ IgM- IgD-
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell subset phenotype
+        cell_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: >
+                Binomial designation of the species from which the analyzed cells originate. Typically, this value
+                should be identical to `species`, in which case it SHOULD NOT be set explicitly. However, there are
+                valid experimental setups in which the two might differ, e.g., chimeric animal models. If set, this
+                key will overwrite the `species` information for all lower layers of the schema.
+            title: Cell species
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        single_cell:
+            type: boolean
+            nullable: true
+            description: TRUE if single cells were isolated into separate compartments
+            title: Single-cell sort
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Single-cell sort
+        cell_number:
+            type: integer
+            nullable: true
+            description: Total number of cells that went into the experiment
+            title: Number of cells in experiment
+            example: 1000000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Number of cells in experiment
+        cells_per_reaction:
+            type: integer
+            nullable: true
+            description: Number of cells for each biological replicate
+            title: Number of cells per sequencing reaction
+            example: 50000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Number of cells per sequencing reaction
+        cell_storage:
+            type: boolean
+            nullable: true
+            description: TRUE if cells were cryo-preserved between isolation and further processing
+            title: Cell storage
+            example: TRUE
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell storage
+        cell_quality:
+            type: string
+            nullable: true
+            description: Relative amount of viable cells after preparation and (if applicable) thawing
+            title: Cell quality
+            example: 90% viability as determined by 7-AAD
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell quality
+        cell_isolation:
+            type: string
+            nullable: true
+            description: Description of the procedure used for marker-based isolation or enrichment of cells
+            title: Cell isolation / enrichment procedure
+            example: >
+                Cells were stained with fluorochrome labeled antibodies and then sorted on a FlowMerlin (CE) cytometer.
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Cell isolation / enrichment procedure
+        cell_processing_protocol:
+            type: string
+            nullable: true
+            description: >
+                Description of the methods applied to the sample including cell preparation/isolation/enrichment and
+                nucleic acid extraction. This should closely mirror the Materials and methods section in the manuscript.
+            title: Processing protocol
+            example: Stimulated with anti-CD3/anti-CD28
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (cell)
+                name: Processing protocol
+
+# object for PCR primer targets
+PCRTarget:
+    type: object
+    required:
+        - pcr_target_locus
+        - forward_pcr_primer_target_location
+        - reverse_pcr_primer_target_location
+    properties:
+        pcr_target_locus:
+            type: string
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+                - null
+            nullable: true
+            description: >
+                Designation of the target locus. Note that this field uses a controlled vocabulary that is meant to
+                provide a generic classification of the locus, not necessarily the correct designation according to
+                a specific nomenclature.
+            title: Target locus for PCR
+            example: IGK
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Target locus for PCR
+                format: controlled_vocabulary
+        forward_pcr_primer_target_location:
+            type: string
+            nullable: true
+            description: Position of the most distal nucleotide templated by the forward primer or primer mix
+            title: Forward PCR primer target location
+            example: IGHV, +23
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Forward PCR primer target location
+        reverse_pcr_primer_target_location:
+            type: string
+            nullable: true
+            description: Position of the most proximal nucleotide templated by the reverse primer or primer mix
+            title: Reverse PCR primer target location
+            example: IGHG, +57
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid [pcr])
+                name: Reverse PCR primer target location
+
+# generally, a 1-to-1 relationship between a CellProcessing and processing of its nucleic acid
+# but may be 1-to-n for technical replicates.
+NucleicAcidProcessing:
+    type: object
+    required:
+        - template_class
+        - template_quality
+        - template_amount
+        - template_amount_unit
+        - library_generation_method
+        - library_generation_protocol
+        - library_generation_kit_version
+        - complete_sequences
+        - physical_linkage
+    properties:
+        template_class:
+            type: string
+            enum:
+                - DNA
+                - RNA
+            nullable: false
+            description: >
+                The class of nucleic acid that was used as primary starting material for the following procedures
+            title: Target substrate
+            example: RNA
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Target substrate
+                format: controlled_vocabulary
+        template_quality:
+            type: string
+            nullable: true
+            description: Description and results of the quality control performed on the template material
+            title: Target substrate quality
+            example: RIN 9.2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Target substrate quality
+        template_amount:
+            type: number
+            nullable: true
+            description: Amount of template that went into the process
+            title: Template amount
+            example: 1000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Template amount
+        template_amount_unit:
+            $ref: '#/Ontology'
+            nullable: true
+            description: Unit of template amount
+            title: Template amount time unit
+            example:
+                id: UO:0000024
+                label: nanogram
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Template amount time unit
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: UO:0000002
+                        label: physical quantity
+        library_generation_method:
+            type: string
+            enum:
+                - "PCR"
+                - "RT(RHP)+PCR"
+                - "RT(oligo-dT)+PCR"
+                - "RT(oligo-dT)+TS+PCR"
+                - "RT(oligo-dT)+TS(UMI)+PCR"
+                - "RT(specific)+PCR"
+                - "RT(specific)+TS+PCR"
+                - "RT(specific)+TS(UMI)+PCR"
+                - "RT(specific+UMI)+PCR"
+                - "RT(specific+UMI)+TS+PCR"
+                - "RT(specific)+TS"
+                - "other"
+            nullable: false
+            description: Generic type of library generation
+            title: Library generation method
+            example: RT(oligo-dT)+TS(UMI)+PCR
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Library generation method
+                format: controlled_vocabulary
+        library_generation_protocol:
+            title: 'Library generation protocol'
+            description: 'Description of processes applied to substrate to obtain a library that is ready for sequencing'
+            type: string
+            nullable: true
+            example: 'cDNA was generated using'
+            x-airr:
+                name: 'Library generation protocol'
+                miairr: important
+                set: 3
+                subset: 'process (nucleic acid)'
+                adc-query-support: true
+        library_generation_kit_version:
+            title: 'Protocol IDs'
+            description: 'When using a library generation protocol from a commercial provider, provide the protocol version number'
+            type: string
+            nullable: true
+            example: 'v2.1 (2016-09-15)'
+            x-airr:
+                name: 'Protocol IDs'
+                miairr: important
+                set: 3
+                subset: 'process (nucleic acid)'
+                adc-query-support: true
+        pcr_target:
+            description: >
+                If a PCR step was performed that specifically targets the IG/TR loci, the target and primer
+                locations need to be provided here. This field holds an array of PCRTarget objects, so that
+                multiplex PCR setups amplifying multiple loci at the same time can be annotated using one record
+                per locus. PCR setups not targeting any specific locus must not annotate this field but select
+                the appropriate library_generation_method instead.
+            type: array
+            items:
+                $ref: "#/PCRTarget"
+            nullable: false
+            x-airr:
+                adc-query-support: true
+        complete_sequences:
+            type: string
+            enum:  # the description below refers to these literals verbatim
+                - partial
+                - complete
+                - "complete+untemplated"
+                - mixed
+            nullable: false  # a value is mandatory; per the description, `mixed` is not a stand-in for NULL
+            description: >
+                To be considered `complete`, the procedure used for library construction MUST generate sequences that
+                1) include the first V gene codon that encodes the mature polypeptide chain (i.e. after the
+                leader sequence) and 2) include the last complete codon of the J gene (i.e. 1 bp 5' of the J->C
+                splice site) and 3) provide sequence information for all positions between 1) and 2). To be considered
+                `complete+untemplated`, the sections of the sequences defined in points 1) to 3) of the previous
+                sentence MUST be untemplated, i.e. MUST NOT overlap with the primers used in library preparation.
+                `mixed` should only be used if the procedure used for library construction will likely produce multiple
+                categories of sequences in the given experiment. It SHOULD NOT be used as a replacement of a NULL value.
+            title: Complete sequences
+            example: partial
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 3
+                subset: process (nucleic acid)
+                name: Complete sequences
+                format: controlled_vocabulary
+        physical_linkage:
+            title: Physical linkage of different rearrangements
+            description: >
+                In case an experimental setup is used that physically links nucleic acids derived from distinct
+                `Rearrangements` before library preparation, this field describes the mode of that linkage. All
+                `hetero_*` terms indicate that in case of paired-read sequencing, the two reads should be expected
+                to map to distinct IG/TR loci. `*_head-head` refers to techniques that link the 5' ends of transcripts
+                in a single-cell context. `*_tail-head` refers to techniques that link the 3' end of one transcript to
+                the 5' end of another one in a single-cell context. This term does not provide any information whether
+                a continuous reading-frame between the two is generated. `*_prelinked` refers to constructs in which
+                the linkage was already present on the DNA level (e.g. scFv).
+            type: string
+            enum:
+                - 'none'
+                - 'hetero_head-head'
+                - 'hetero_tail-head'
+                - 'hetero_prelinked'
+            nullable: false
+            example: 'hetero_head-head'
+            x-airr:
+                name: Physical linkage of different rearrangements
+                miairr: essential
+                set: 3
+                subset: process (nucleic acid)
+                format: controlled_vocabulary
+                adc-query-support: true
+
+# 1-to-n relationship: one NucleicAcidProcessing may have several SequencingRun records, each with its raw sequence file(s)
+SequencingRun:
+    type: object
+    required:  # these keys must be present; each is declared nullable below
+        - sequencing_run_id
+        - total_reads_passing_qc_filter
+        - sequencing_platform
+        - sequencing_facility
+        - sequencing_run_date
+        - sequencing_kit
+    properties:
+        sequencing_run_id:
+            type: string
+            nullable: true
+            description: ID of sequencing run assigned by the sequencing facility
+            title: Batch number
+            example: 160101_M01234
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Batch number
+        total_reads_passing_qc_filter:
+            type: integer
+            nullable: true
+            description: Number of usable reads for analysis
+            title: Total reads passing QC filter
+            example: 10365118
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Total reads passing QC filter
+        sequencing_platform:
+            type: string
+            nullable: true
+            description: Designation of sequencing instrument used
+            title: Sequencing platform
+            example: Alumina LoSeq 1000
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing platform
+        sequencing_facility:
+            type: string
+            nullable: true
+            description: Name and address of sequencing facility
+            title: Sequencing facility
+            example: Seqs-R-Us, Vancouver, BC, Canada
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing facility
+        sequencing_run_date:
+            type: string
+            nullable: true
+            description: Date of sequencing run
+            title: Date of sequencing run
+            format: date
+            example: "2016-12-16"  # quoted so YAML loaders keep the example a string, matching `type: string`
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Date of sequencing run
+        sequencing_kit:
+            type: string
+            nullable: true
+            description: Name, manufacturer, order and lot numbers of sequencing kit
+            title: Sequencing kit
+            example: "FullSeq 600, Alumina, #M123456C0, 789G1HK"
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 3
+                subset: process (sequencing)
+                name: Sequencing kit
+        sequencing_files:  # not in the `required` list above
+            $ref: '#/SequencingData'
+            nullable: false
+            description: Set of sequencing files produced by the sequencing run
+            x-airr:
+                adc-query-support: true
+
+# Raw sequencing files produced by a SequencingRun (first file, optional paired file, optional index file)
+SequencingData:
+    type: object
+    required:  # these keys must be present; each is declared nullable below
+        - sequencing_data_id
+        - file_type
+        - filename
+        - read_direction
+        - read_length
+        - paired_filename
+        - paired_read_direction
+        - paired_read_length
+    properties:
+        sequencing_data_id:
+            type: string
+            nullable: true
+            description: >
+                Persistent identifier of raw data stored in an archive (e.g. INSDC run ID). Data archive should
+                be identified in the CURIE prefix.
+            title: Raw sequencing data persistent identifier
+            example: "SRA:SRR11610494"
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                format: CURIE  # NOTE(review): no x-airr `name` here, unlike the sibling MiAIRR fields -- confirm intentional
+        file_type:
+            type: string
+            nullable: true
+            description: File format for the raw reads or sequences
+            title: Raw sequencing data file type
+            enum:
+                - fasta
+                - fastq
+                - null  # explicit null member, since the field is nullable
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Raw sequencing data file type
+                format: controlled_vocabulary
+        filename:
+            type: string
+            nullable: true
+            description: File name for the raw reads or sequences. The first file in paired-read sequencing.
+            title: Raw sequencing data file name
+            example: MS10R-NMonson-C7JR9_S1_R1_001.fastq
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Raw sequencing data file name
+        read_direction:
+            type: string
+            nullable: true
+            description: Read direction for the raw reads or sequences. The first file in paired-read sequencing.
+            title: Read direction
+            example: forward
+            enum:
+                - forward
+                - reverse
+                - mixed
+                - null
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Read direction
+                format: controlled_vocabulary
+        read_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the first file in paired-read sequencing
+            title: Forward read length
+            example: 300
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Forward read length
+        paired_filename:
+            type: string
+            nullable: true
+            description: File name for the second file in paired-read sequencing
+            title: Paired raw sequencing data file name
+            example: MS10R-NMonson-C7JR9_S1_R2_001.fastq
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired raw sequencing data file name
+        paired_read_direction:
+            type: string
+            nullable: true
+            description: Read direction for the second file in paired-read sequencing
+            title: Paired read direction
+            example: reverse
+            enum:
+                - forward
+                - reverse
+                - mixed
+                - null
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired read direction
+                format: controlled_vocabulary
+        paired_read_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the second file in paired-read sequencing
+            title: Paired read length
+            example: 300
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 4
+                subset: data (raw reads)
+                name: Paired read length
+        index_filename:
+            type: string
+            nullable: true
+            description: File name for the index file
+            title: Sequencing index file name
+            example: MS10R-NMonson-C7JR9_S1_R3_001.fastq
+            x-airr:
+                adc-query-support: true
+        index_length:
+            type: integer
+            nullable: true
+            description: Read length in bases for the index file
+            title: Index read length
+            example: 8
+            x-airr:
+                adc-query-support: true
+
+# 1-to-n relationship: one repertoire may carry several data processing records.
+#
+# Each record describes the set of annotated rearrangement sequences produced by
+# one data processing run over the raw sequence data of a repertoire.
+DataProcessing:
+    type: object
+    required:  # these keys must be present; each is declared nullable below
+        - software_versions
+        - paired_reads_assembly
+        - quality_thresholds
+        - primer_match_cutoffs
+        - collapsing_method
+        - data_processing_protocols
+        - germline_database
+    properties:
+        data_processing_id:
+            type: string
+            nullable: true
+            description: Identifier for the data processing object.
+            title: Data processing ID
+            x-airr:
+                name: Data processing ID
+                adc-query-support: true
+                identifier: true
+        primary_annotation:
+            type: boolean
+            default: false
+            nullable: false
+            description: >
+                If true, indicates this is the primary or default data processing for
+                the repertoire and its rearrangements. If false, indicates this is a secondary
+                or additional data processing.
+            title: Primary annotation
+            x-airr:
+                adc-query-support: true
+                identifier: true  # NOTE(review): identifier flag on a boolean field -- possibly copied from data_processing_id; confirm
+        software_versions:
+            type: string
+            nullable: true
+            description: Version number and / or date, include company pipelines
+            title: Software tools and version numbers
+            example: IgBLAST 1.6
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Software tools and version numbers
+        paired_reads_assembly:
+            type: string
+            nullable: true
+            description: How paired end reads were assembled into a single receptor sequence
+            title: Paired read assembly
+            example: PandaSeq (minimal overlap 50, threshold 0.8)
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Paired read assembly
+        quality_thresholds:
+            type: string
+            nullable: true
+            description: How/if sequences were removed from (4) based on base quality scores
+            title: Quality thresholds
+            example: Average Phred score >=20
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Quality thresholds
+        primer_match_cutoffs:
+            type: string
+            nullable: true
+            description: How primers were identified in the sequences, were they removed/masked/etc?
+            title: Primer match cutoffs
+            example: Hamming distance <= 2
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Primer match cutoffs
+        collapsing_method:
+            type: string
+            nullable: true
+            description: The method used for combining multiple sequences from (4) into a single sequence in (5)
+            title: Collapsing method
+            example: MUSCLE 3.8.31
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Collapsing method
+        data_processing_protocols:
+            type: string
+            nullable: true
+            description: General description of how QC is performed
+            title: Data processing protocols
+            example: Data was processed using [...]
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: process (computational)
+                name: Data processing protocols
+        data_processing_files:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: Array of file names for data produced by this data processing.
+            title: Processed data file names
+            example:
+                - 'ERR1278153_aa.txz'
+                - 'ERR1278153_ab.txz'
+                - 'ERR1278153_ac.txz'
+            x-airr:
+                adc-query-support: true
+                name: Processed data file names
+        germline_database:
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            title: V(D)J germline reference database
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 5
+                subset: data (processed sequence)
+                name: V(D)J germline reference database
+        germline_set_ref:
+            type: string
+            nullable: true
+            description: Unique identifier of the germline set and version, in standardized form (Repo:Label:Version)
+            example: OGRDB:Human_IGH:2021.11
+            x-airr:
+                adc-query-support: true
+        analysis_provenance_id:
+            type: string
+            nullable: true
+            description: Identifier for machine-readable PROV model of analysis provenance
+            title: Analysis provenance ID
+            x-airr:
+                adc-query-support: true
+
+SampleProcessing:
+    allOf:  # own ID property first, then the four referenced schemas
+        - type: object
+          properties:
+              sample_processing_id:
+                  title: Sample processing ID
+                  description: >
+                      Identifier for the sample processing object. This field should be unique within the repertoire.
+                      This field can be used to uniquely identify the combination of sample, cell processing,
+                      nucleic acid processing and sequencing run information for the repertoire.
+                  type: string
+                  nullable: true
+                  x-airr:
+                      identifier: true
+                      name: Sample processing ID
+                      adc-query-support: true
+        - $ref: "#/Sample"
+        - $ref: "#/CellProcessing"
+        - $ref: "#/NucleicAcidProcessing"
+        - $ref: "#/SequencingRun"
+
+
+# Composite schema for the repertoire object.
+#
+# A repertoire is a sample repertoire as the study defines it and as the
+# raw sequence data experimentally observes it. It belongs to exactly one
+# subject, but it may span several samples.
+Repertoire:
+    type: object
+    required:
+        - study
+        - subject
+        - sample
+        - data_processing
+    properties:
+        repertoire_id:
+            title: Repertoire ID
+            description: >
+                Identifier for the repertoire object. This identifier should be globally unique so that repertoires
+                from multiple studies can be combined together without conflict. The repertoire_id is used to link
+                other AIRR data to a Repertoire. Specifically, the Rearrangements Schema includes repertoire_id for
+                referencing the specific Repertoire for that Rearrangement.
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                adc-query-support: true
+        repertoire_name:
+            title: Repertoire name
+            description: Short generic display name for the repertoire
+            type: string
+            nullable: true
+            x-airr:
+                adc-query-support: true
+                name: Repertoire name
+        repertoire_description:
+            title: Repertoire description
+            description: Generic repertoire description
+            type: string
+            nullable: true
+            x-airr:
+                adc-query-support: true
+                name: Repertoire description
+        study:
+            $ref: "#/Study"
+            description: Study object
+            nullable: false
+            x-airr:
+                adc-query-support: true
+        subject:
+            $ref: "#/Subject"
+            description: Subject object
+            nullable: false
+            x-airr:
+                adc-query-support: true
+        sample:
+            description: List of Sample Processing objects
+            type: array
+            items:
+                $ref: "#/SampleProcessing"
+            nullable: false
+            x-airr:
+                adc-query-support: true
+        data_processing:
+            description: List of Data Processing objects
+            type: array
+            items:
+                $ref: "#/DataProcessing"
+            nullable: false
+            x-airr:
+                adc-query-support: true
+
+# Groups repertoires for analysis; each member may carry its own description and time point
+RepertoireGroup:
+    type: object
+    required:
+        - repertoire_group_id
+        - repertoires
+    properties:
+        repertoire_group_id:
+            description: Identifier for this repertoire collection
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+        repertoire_group_name:
+            description: Short display name for this repertoire collection
+            type: string
+            nullable: true
+        repertoire_group_description:
+            description: Repertoire collection description
+            type: string
+            nullable: true
+        repertoires:
+            description: >
+                List of repertoires in this collection with an associated description and time point designation
+            type: array
+            nullable: true
+            items:
+                type: object
+                properties:
+                    repertoire_id:
+                        description: Identifier to the repertoire
+                        type: string
+                        nullable: false
+                        x-airr:
+                            adc-query-support: true
+                    repertoire_description:
+                        description: Description of this repertoire within the group
+                        type: string
+                        nullable: true
+                        x-airr:
+                            adc-query-support: true
+                    time_point:
+                        $ref: "#/TimePoint"
+                        description: Time point designation for this repertoire within the group
+                        nullable: true
+                        x-airr:
+                            adc-query-support: true
+
+Alignment:
+    type: object
+    required:  # these keys must be present; each is declared nullable below
+        - sequence_id
+        - segment
+        - call
+        - score
+        - cigar
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Unique query sequence identifier within the file. Most often this will be the input sequence
+                header or a substring thereof, but may also be a custom identifier defined by the tool in
+                cases where query sequences have been combined in some fashion prior to alignment.
+            x-airr:
+                identifier: true
+        segment:
+            type: string
+            nullable: true
+            description: >
+                The segment for this alignment. One of V, D, J or C.
+        rev_comp:
+            type: boolean
+            nullable: true
+            description: >
+                Alignment result is from the reverse complement of the query sequence.
+        call:
+            type: string
+            nullable: true
+            description: >
+                Gene assignment with allele.
+        score:
+            type: number
+            nullable: true
+            description: >
+                Alignment score.
+        identity:
+            type: number
+            nullable: true
+            description: >
+                Alignment fractional identity.
+        support:
+            type: number
+            nullable: true
+            description: >
+                Alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the gene assignment as defined by the alignment tool.
+        cigar:
+            type: string
+            nullable: true
+            description: >
+                Alignment CIGAR string.
+        sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the segment in the query sequence (1-based closed interval).
+        sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the segment in the query sequence (1-based closed interval).
+        germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the reference sequence (1-based closed interval).
+        germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the reference sequence (1-based closed interval).
+        rank:
+            type: integer
+            nullable: true
+            description: >
+                Alignment rank.
+        rearrangement_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for the Rearrangement object. May be identical to sequence_id,
+                but will usually be a universally unique record locator for database applications.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been merged with sequence_id to avoid confusion.
+                deprecated-replaced-by:
+                    - sequence_id
+        data_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the data processing object in the repertoire metadata
+                for this rearrangement. If this field is empty then the primary data processing object is assumed.
+        germline_database:
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication.
+                deprecated-replaced-by:
+                    - "DataProcessing:germline_database"
+
+
+# The extended rearrangement object
+Rearrangement:
+    type: object
+    required:
+        - sequence_id
+        - sequence
+        - rev_comp
+        - productive
+        - v_call
+        - d_call
+        - j_call
+        - sequence_alignment
+        - germline_alignment
+        - junction
+        - junction_aa
+        - v_cigar
+        - d_cigar
+        - j_cigar
+    properties:
+        sequence_id:
+            description: >
+                Unique query sequence identifier for the Rearrangement. Most often this will be the input sequence
+                header or a substring thereof, but may also be a custom identifier defined by the tool in
+                cases where query sequences have been combined in some fashion prior to alignment. When
+                downloaded from an AIRR Data Commons repository, this will usually be a universally unique
+                record locator for linking with other objects in the AIRR Data Model.
+            type: string
+            nullable: true
+            x-airr:
+                identifier: true
+                adc-query-support: true
+        sequence:
+            description: >
+                The query nucleotide sequence. Usually, this is the unmodified input sequence, which may be
+                reverse complemented if necessary. In some cases, this field may contain consensus sequences or
+                other types of collapsed input sequences if these steps are performed prior to alignment.
+            type: string
+            nullable: true
+        quality:
+            description: >
+                The Sanger/Phred quality scores for assessment of sequence quality.
+                Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (Used by Illumina from v1.8.)
+            type: string
+            nullable: true
+        sequence_aa:
+            description: >
+                Amino acid translation of the query nucleotide sequence.
+            type: string
+            nullable: true
+        rev_comp:
+            description: >
+                True if the alignment is on the opposite strand (reverse complemented) with respect to the
+                query sequence. If True then all output data, such as alignment coordinates and sequences,
+                are based on the reverse complement of 'sequence'.
+            type: boolean
+            nullable: true
+        productive:
+            description: >
+                True if the V(D)J sequence is predicted to be productive.
+            type: boolean
+            nullable: true
+            x-airr:
+                adc-query-support: true
+        vj_in_frame:
+            description: 'True if the V and J gene alignments are in-frame.'
+            type: boolean
+            nullable: true
+        stop_codon:
+            description: 'True if the aligned sequence contains a stop codon.'
+            type: boolean
+            nullable: true
+        complete_vdj:
+            description: >
+                True if the sequence alignment spans the entire V(D)J region. Meaning,
+                sequence_alignment includes both the first V gene codon that encodes the
+                mature polypeptide chain (i.e., after the leader sequence) and the last
+                complete codon of the J gene (i.e., before the J-C splice site).
+                This does not require an absence of deletions within the internal
+                FWR and CDR regions of the alignment.
+            type: boolean
+            nullable: true
+        locus:
+            type: string
+            enum:
+                - IGH
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRB
+                - TRD
+                - TRG
+                - null
+            nullable: true
+            description: >
+                Gene locus (chain type). Note that this field uses a controlled vocabulary that is meant to provide a
+                generic classification of the locus, not necessarily the correct designation according to a specific
+                nomenclature.
+            title: Gene locus
+            example: IGH
+            x-airr:
+                adc-query-support: true
+                name: Gene locus
+                format: controlled_vocabulary
+        locus_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: >
+                Binomial designation of the species from which the locus originates. Typically, this value should be
+                identical to `organism`, in which case it SHOULD NOT be set explicitly. However, there are valid
+                experimental setups in which the two might differ, e.g. transgenic animal models. If set, this key
+                will overwrite the `organism` information for all lower layers of the schema.
+            title: Locus species
+            example:
+                id: NCBITAXON:9606
+                label: Homo sapiens
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Locus species
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: NCBITAXON:7776
+                        label: Gnathostomata
+        v_call:
+            type: string
+            nullable: true
+            description: >
+                V gene with allele. If referring to a known reference sequence in a database
+                the relevant gene/allele nomenclature should be followed (e.g., IGHV4-59*01 if using IMGT/GENE-DB).
+            title: V gene with allele
+            example: IGHV4-59*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: V gene with allele
+        d_call:
+            type: string
+            nullable: true
+            description: >
+                First or only D gene with allele. If referring to a known reference sequence in a database
+                the relevant gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB).
+            title: D gene with allele
+            example: IGHD3-10*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: D gene with allele
+        d2_call:
+            type: string
+            nullable: true
+            description: >
+                Second D gene with allele. If referring to a known reference sequence in a database the relevant
+                gene/allele nomenclature should be followed (e.g., IGHD3-10*01 if using IMGT/GENE-DB).
+            example: IGHD3-10*01
+        j_call:
+            type: string
+            nullable: true
+            description: >
+                J gene with allele. If referring to a known reference sequence in a database the relevant
+                gene/allele nomenclature should be followed (e.g., IGHJ4*02 if using IMGT/GENE-DB).
+            title: J gene with allele
+            example: IGHJ4*02
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: J gene with allele
+        c_call:
+            type: string
+            nullable: true
+            description: >
+                Constant region gene with allele. If referring to a known reference sequence in a database the
+                relevant gene/allele nomenclature should be followed (e.g., IGHG1*01 if using IMGT/GENE-DB).
+            title: C region
+            example: IGHG1*01
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: C region
+        sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence, including any indel corrections or numbering spacers,
+                such as IMGT-gaps. Typically, this will include only the V(D)J region, but that is not
+                a requirement.
+        quality_alignment:
+            type: string
+            nullable: true
+            description: >
+                Sanger/Phred quality scores for assessment of sequence_alignment quality.
+                Phred quality scores from 0 to 93 are encoded using ASCII 33 to 126 (used by Illumina from v1.8).
+        sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the aligned query sequence.
+        germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Assembled, aligned, full-length inferred germline sequence spanning the same region
+                as the sequence_alignment field (typically the V(D)J region) and including the same set
+                of corrections and spacers (if any).
+        germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the assembled germline sequence.
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Junction region nucleotide sequence, where the junction is defined as
+                the CDR3 plus the two flanking conserved codons.
+            title: IMGT-JUNCTION nucleotide sequence
+            example: TGTGCAAGAGCGGGAGTTTACGACGGATATACTATGGACTACTGG
+            x-airr:
+                miairr: important
+                set: 6
+                subset: data (processed sequence)
+                name: IMGT-JUNCTION nucleotide sequence
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+            title: IMGT-JUNCTION amino acid sequence
+            example: CARAGVYDGYTMDYW
+            x-airr:
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: IMGT-JUNCTION amino acid sequence
+        np1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between the V gene and
+                first D gene alignment or between the V gene and J gene alignments.
+        np1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np1 field.
+        np2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between either the first D gene and J gene
+                alignments or the first D gene and second D gene alignments.
+        np2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np2 field.
+        np3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the combined N/P region between the second D gene
+                and J gene alignments.
+        np3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the np3 field.
+        cdr1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR1 region.
+        cdr1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr1 field.
+        cdr2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR2 region.
+        cdr2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr2 field.
+        cdr3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned CDR3 region.
+        cdr3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the cdr3 field.
+        fwr1:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR1 region.
+        fwr1_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr1 field.
+        fwr2:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR2 region.
+        fwr2_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr2 field.
+        fwr3:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR3 region.
+        fwr3_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr3 field.
+        fwr4:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the aligned FWR4 region.
+        fwr4_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the fwr4 field.
+        v_score:
+            type: number
+            nullable: true
+            description: Alignment score for the V gene.
+        v_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the V gene alignment.
+        v_support:
+            type: number
+            nullable: true
+            description: >
+                V gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the V gene assignment as defined by the alignment tool.
+        v_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the V gene alignment.
+        d_score:
+            type: number
+            nullable: true
+            description: Alignment score for the first or only D gene alignment.
+        d_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the first or only D gene alignment.
+        d_support:
+            type: number
+            nullable: true
+            description: >
+                D gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the first or only D gene as defined by the alignment tool.
+        d_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the first or only D gene alignment.
+        d2_score:
+            type: number
+            nullable: true
+            description: Alignment score for the second D gene alignment.
+        d2_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the second D gene alignment.
+        d2_support:
+            type: number
+            nullable: true
+            description: >
+                D gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the second D gene as defined by the alignment tool.
+        d2_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the second D gene alignment.
+        j_score:
+            type: number
+            nullable: true
+            description: Alignment score for the J gene alignment.
+        j_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the J gene alignment.
+        j_support:
+            type: number
+            nullable: true
+            description: >
+                J gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the J gene assignment as defined by the alignment tool.
+        j_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the J gene alignment.
+        c_score:
+            type: number
+            nullable: true
+            description: Alignment score for the C gene alignment.
+        c_identity:
+            type: number
+            nullable: true
+            description: Fractional identity for the C gene alignment.
+        c_support:
+            type: number
+            nullable: true
+            description: >
+                C gene alignment E-value, p-value, likelihood, probability or other similar measure of
+                support for the C gene assignment as defined by the alignment tool.
+        c_cigar:
+            type: string
+            nullable: true
+            description: CIGAR string for the C gene alignment.
+        v_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the V gene in the query sequence (1-based closed interval).
+        v_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the V gene in the query sequence (1-based closed interval).
+        v_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the V gene reference sequence (1-based closed interval).
+        v_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the V gene reference sequence (1-based closed interval).
+        v_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        v_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the first or only D gene in the query sequence
+                (1-based closed interval).
+        d_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the first or only D gene in the query sequence
+                (1-based closed interval).
+        d_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the D gene reference sequence for the first or only
+                D gene (1-based closed interval).
+        d_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the D gene reference sequence for the first or only
+                D gene (1-based closed interval).
+        d_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the first or only D gene in both the sequence_alignment
+                and germline_alignment fields (1-based closed interval).
+        d_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the first or only D gene in both the sequence_alignment
+                and germline_alignment fields (1-based closed interval).
+        d2_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the second D gene in the query sequence (1-based closed interval).
+        d2_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the second D gene in the query sequence (1-based closed interval).
+        d2_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the second D gene reference sequence (1-based closed interval).
+        d2_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the second D gene reference sequence (1-based closed interval).
+        d2_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the second D gene alignment in both the sequence_alignment and
+                germline_alignment fields (1-based closed interval).
+        d2_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the second D gene alignment in both the sequence_alignment and
+                germline_alignment fields (1-based closed interval).
+        j_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene in the query sequence (1-based closed interval).
+        j_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene in the query sequence (1-based closed interval).
+        j_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the J gene reference sequence (1-based closed interval).
+        j_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the J gene reference sequence (1-based closed interval).
+        j_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        c_sequence_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the C gene in the query sequence (1-based closed interval).
+        c_sequence_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the C gene in the query sequence (1-based closed interval).
+        c_germline_start:
+            type: integer
+            nullable: true
+            description: >
+                Alignment start position in the C gene reference sequence (1-based closed interval).
+        c_germline_end:
+            type: integer
+            nullable: true
+            description: >
+                Alignment end position in the C gene reference sequence (1-based closed interval).
+        c_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the C gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        c_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the C gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        cdr1_start:
+            type: integer
+            nullable: true
+            description: CDR1 start position in the query sequence (1-based closed interval).
+        cdr1_end:
+            type: integer
+            nullable: true
+            description: CDR1 end position in the query sequence (1-based closed interval).
+        cdr2_start:
+            type: integer
+            nullable: true
+            description: CDR2 start position in the query sequence (1-based closed interval).
+        cdr2_end:
+            type: integer
+            nullable: true
+            description: CDR2 end position in the query sequence (1-based closed interval).
+        cdr3_start:
+            type: integer
+            nullable: true
+            description: CDR3 start position in the query sequence (1-based closed interval).
+        cdr3_end:
+            type: integer
+            nullable: true
+            description: CDR3 end position in the query sequence (1-based closed interval).
+        fwr1_start:
+            type: integer
+            nullable: true
+            description: FWR1 start position in the query sequence (1-based closed interval).
+        fwr1_end:
+            type: integer
+            nullable: true
+            description: FWR1 end position in the query sequence (1-based closed interval).
+        fwr2_start:
+            type: integer
+            nullable: true
+            description: FWR2 start position in the query sequence (1-based closed interval).
+        fwr2_end:
+            type: integer
+            nullable: true
+            description: FWR2 end position in the query sequence (1-based closed interval).
+        fwr3_start:
+            type: integer
+            nullable: true
+            description: FWR3 start position in the query sequence (1-based closed interval).
+        fwr3_end:
+            type: integer
+            nullable: true
+            description: FWR3 end position in the query sequence (1-based closed interval).
+        fwr4_start:
+            type: integer
+            nullable: true
+            description: FWR4 start position in the query sequence (1-based closed interval).
+        fwr4_end:
+            type: integer
+            nullable: true
+            description: FWR4 end position in the query sequence (1-based closed interval).
+        v_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the V gene, including any
+                indel corrections or numbering spacers.
+        v_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the v_sequence_alignment field.
+        d_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the first or only D gene, including any
+                indel corrections or numbering spacers.
+        d_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d_sequence_alignment field.
+        d2_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the second D gene, including any
+                indel corrections or numbering spacers.
+        d2_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d2_sequence_alignment field.
+        j_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the J gene, including any
+                indel corrections or numbering spacers.
+        j_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the j_sequence_alignment field.
+        c_sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned portion of query sequence assigned to the constant region, including
+                any indel corrections or numbering spacers.
+        c_sequence_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the c_sequence_alignment field.
+        v_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned V gene germline sequence spanning the same region
+                as the v_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        v_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the v_germline_alignment field.
+        d_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned D gene germline sequence spanning the same region
+                as the d_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        d_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d_germline_alignment field.
+        d2_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned D gene germline sequence spanning the same region
+                as the d2_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        d2_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the d2_germline_alignment field.
+        j_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned J gene germline sequence spanning the same region
+                as the j_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        j_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the j_germline_alignment field.
+        c_germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Aligned constant region germline sequence spanning the same region
+                as the c_sequence_alignment field and including the same set
+                of corrections and spacers (if any).
+        c_germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the c_germline_alignment field.
+        junction_length:
+            type: integer
+            nullable: true
+            description: Number of nucleotides in the junction sequence.
+        junction_aa_length:
+            type: integer
+            nullable: true
+            description: Number of amino acids in the junction sequence.
+            x-airr:
+                adc-query-support: true
+        np1_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between the V gene and first D gene alignments or
+                between the V gene and J gene alignments.
+        np2_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between either the first D gene and J gene alignments
+                or the first D gene and second D gene alignments.
+        np3_length:
+            type: integer
+            nullable: true
+            description: >
+                Number of nucleotides between the second D gene and J gene alignments.
+        n1_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 5' of the first or only D gene alignment.
+        n2_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 3' of the first or only D gene alignment.
+        n3_length:
+            type: integer
+            nullable: true
+            description: Number of untemplated nucleotides 3' of the second D gene alignment.
+        p3v_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the V gene alignment.
+        p5d_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the first or only D gene alignment.
+        p3d_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the first or only D gene alignment.
+        p5d2_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the second D gene alignment.
+        p3d2_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 3' of the second D gene alignment.
+        p5j_length:
+            type: integer
+            nullable: true
+            description: Number of palindromic nucleotides 5' of the J gene alignment.
+        v_frameshift:
+            type: boolean
+            nullable: true
+            description: >
+                True if the V gene in the query nucleotide sequence contains a translational
+                frameshift relative to the frame of the V gene reference sequence.
+        j_frameshift:
+            type: boolean
+            nullable: true
+            description: >
+                True if the J gene in the query nucleotide sequence contains a translational
+                frameshift relative to the frame of the J gene reference sequence.
+        d_frame:
+            type: integer
+            nullable: true
+            description: >
+                Numerical reading frame (1, 2, 3) of the first or only D gene in the query nucleotide sequence,
+                where frame 1 is relative to the first codon of the D gene reference sequence.
+        d2_frame:
+            type: integer
+            nullable: true
+            description: >
+                Numerical reading frame (1, 2, 3) of the second D gene in the query nucleotide sequence,
+                where frame 1 is relative to the first codon of D gene reference sequence.
+        consensus_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of reads contributing to the UMI consensus or contig assembly for this sequence.
+                For example, the sum of the number of reads for all UMIs that contribute to
+                the query sequence.
+        duplicate_count:
+            type: integer
+            nullable: true
+            description: >
+                Copy number or number of duplicate observations for the query sequence.
+                For example, the number of identical reads observed for this sequence.
+            title: Read count
+            example: 123
+            x-airr:
+                miairr: important
+                set: 6
+                subset: data (processed sequence)
+                name: Read count
+        umi_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of distinct UMIs represented by this sequence.
+                For example, the total number of UMIs that contribute to
+                the contig assembly for the query sequence.
+        cell_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier defining the cell of origin for the query sequence.
+            title: Cell index
+            example: W06_046_091
+            x-airr:
+                identifier: true
+                miairr: important
+                adc-query-support: true
+                set: 6
+                subset: data (processed sequence)
+                name: Cell index
+        clone_id:
+            type: string
+            nullable: true
+            description: Clonal cluster assignment for the query sequence.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        sample_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the sample processing object in the repertoire metadata
+                for this rearrangement. If the repertoire has a single sample then
+                this field may be empty or missing. If the repertoire has multiple samples then
+                this field may be empty or missing if the sample cannot be differentiated or
+                the relationship is not maintained by the data processing.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        data_processing_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier to the data processing object in the repertoire metadata
+                for this rearrangement. If this field is empty then the primary data processing object is assumed.
+            x-airr:
+                adc-query-support: true
+                identifier: true
+        rearrangement_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for the Rearrangement object. May be identical to sequence_id,
+                but will usually be a universally unique record locator for database applications.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been merged with sequence_id to avoid confusion.
+                deprecated-replaced-by:
+                    - sequence_id
+        rearrangement_set_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for grouping Rearrangement objects.
+            x-airr:
+                deprecated: true
+                deprecated-description: Field has been replaced by other specialized identifiers.
+                deprecated-replaced-by:
+                    - repertoire_id
+                    - sample_processing_id
+                    - data_processing_id
+        germline_database:
+            type: string
+            nullable: true
+            description: Source of germline V(D)J genes with version number or date accessed.
+            example: ENSEMBL, Homo sapiens build 90, 2017-10-01
+            x-airr:
+                deprecated: true
+                deprecated-description: Field was moved up to the DataProcessing level to avoid data duplication.
+                deprecated-replaced-by:
+                    - "DataProcessing:germline_database"
+
+# A unique inferred clone object that has been constructed within a single data processing
+# for a single repertoire and a subset of its sequences and/or rearrangements.
+Clone:
+    type: object
+    required:
+        - clone_id
+        - germline_alignment
+    properties:
+        clone_id:
+            type: string
+            nullable: true
+            description: Identifier for the clone.
+            x-airr:
+                identifier: true
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            x-airr:
+                adc-query-support: true
+        data_processing_id:
+            type: string
+            nullable: true
+            description: Identifier of the data processing object in the repertoire metadata for this clone.
+            x-airr:
+                adc-query-support: true
+        sequences:
+            type: array
+            items:
+                type: string
+            nullable: true
+            description: >
+                List of sequence_id strings that act as keys to the Rearrangement records for members of the clone.
+        v_call:
+            type: string
+            nullable: true
+            description: >
+                V gene with allele of the inferred ancestor of the clone. For example, IGHV4-59*01.
+            example: IGHV4-59*01
+        d_call:
+            type: string
+            nullable: true
+            description: >
+                D gene with allele of the inferred ancestor of the clone. For example, IGHD3-10*01.
+            example: IGHD3-10*01
+        j_call:
+            type: string
+            nullable: true
+            description: >
+                J gene with allele of the inferred ancestor of the clone. For example, IGHJ4*02.
+            example: IGHJ4*02
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence for the junction region of the inferred ancestor of the clone,
+                where the junction is defined as the CDR3 plus the two flanking conserved codons.
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+        junction_length:
+            type: integer
+            nullable: true
+            description: Number of nucleotides in the junction.
+        junction_aa_length:
+            type: integer
+            nullable: true
+            description: Number of amino acids in junction_aa.
+        germline_alignment:
+            type: string
+            nullable: true
+            description: >
+                Assembled, aligned, full-length inferred ancestor of the clone spanning the same region
+                as the sequence_alignment field of nodes (typically the V(D)J region) and including the
+                same set of corrections and spacers (if any).
+        germline_alignment_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of germline_alignment.
+        v_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position in the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        v_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position in the V gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the D gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        d_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the D gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_start:
+            type: integer
+            nullable: true
+            description: >
+                Start position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        j_alignment_end:
+            type: integer
+            nullable: true
+            description: >
+                End position of the J gene alignment in both the sequence_alignment and germline_alignment
+                fields (1-based closed interval).
+        junction_start:
+            type: integer
+            nullable: true
+            description: Junction region start position in the alignment (1-based closed interval).
+        junction_end:
+            type: integer
+            nullable: true
+            description: Junction region end position in the alignment (1-based closed interval).
+        umi_count:
+            type: integer
+            nullable: true
+            description: >
+                Number of distinct UMIs observed across all sequences (Rearrangement records) in this clone.
+        clone_count:
+            type: integer
+            nullable: true
+            description: >
+                Absolute count of the size (number of members) of this clone in the repertoire.
+                This could simply be the number of sequences (Rearrangement records) observed in this clone,
+                the number of distinct cell barcodes (unique cell_id values),
+                or a more sophisticated calculation appropriate to the experimental protocol.
+                Absolute count is provided versus a frequency so that downstream analysis tools can perform their own normalization.
+        seed_id:
+            type: string
+            nullable: true
+            description: sequence_id of the seed sequence. Empty string (or null) if there is no seed sequence.
+
+# 1-to-n relationship for a clone to its trees.
+Tree:
+    type: object
+    required:
+        - tree_id
+        - clone_id
+        - newick
+    properties:
+        tree_id:
+            type: string
+            nullable: true
+            description: Identifier for the tree.
+            x-airr:
+                identifier: true
+        clone_id:
+            type: string
+            nullable: true
+            description: Identifier for the clone.
+        newick:
+            type: string
+            nullable: true
+            description: Newick string of the tree edges.
+        nodes:
+            type: object
+            nullable: true
+            description: Dictionary of nodes in the tree, keyed by sequence_id string
+            additionalProperties:
+                $ref: '#/Node'
+
+# 1-to-n relationship between a tree and its nodes
+Node:
+    type: object
+    required:
+        - sequence_id
+    properties:
+        sequence_id:
+            type: string
+            nullable: true
+            description: >
+                Identifier for this node that matches the identifier in the newick string and, where possible,
+                the sequence_id in the source repertoire.
+            x-airr:
+                identifier: true
+        sequence_alignment:
+            type: string
+            nullable: true
+            description: >
+                Nucleotide sequence of the node, aligned to the germline_alignment for this clone,
+                including any indel corrections or spacers.
+        junction:
+            type: string
+            nullable: true
+            description: >
+                Junction region nucleotide sequence for the node, where the junction is defined as
+                the CDR3 plus the two flanking conserved codons.
+        junction_aa:
+            type: string
+            nullable: true
+            description: >
+                Amino acid translation of the junction.
+
+# The cell object acts as point of reference for all data that can be related
+# to an individual cell, either by direct observation or inference.
+Cell:
+    type: object
+    required:
+        - cell_id
+        - rearrangements
+        - repertoire_id
+        - virtual_pairing
+    properties:
+        cell_id:
+            type: string
+            nullable: false
+            description: >
+                Identifier defining the cell of origin for the query sequence.
+            title: Cell index
+            example: W06_046_091
+            x-airr:
+                identifier: true
+                miairr: defined
+                adc-query-support: true
+                name: Cell index
+        rearrangements:
+            type: array
+            nullable: true
+            description: >
+                Array of sequence identifiers defined for the Rearrangement object
+            title: Cell-associated rearrangements
+            items:
+                type: string
+            example: [id1, id2] #empty vs NULL?
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell-associated rearrangements
+        receptors:
+            type: array
+            nullable: true
+            description: >
+                Array of receptor identifiers defined for the Receptor object
+            title: Cell-associated receptors
+            items:
+                type: string
+            example: [id1, id2] #empty vs NULL?
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell-associated receptors
+        repertoire_id:
+            type: string
+            nullable: true
+            description: Identifier to the associated repertoire in study metadata.
+            title: Parental repertoire of cell
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Parental repertoire of cell
+        data_processing_id:
+            type: string
+            nullable: true
+            description: Identifier of the data processing object in the repertoire metadata for this cell.
+            title: Data processing for cell
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Data processing for cell
+        expression_study_method:
+            type: string
+            enum:
+                - flow_cytometry
+                - single-cell_transcriptome
+                - null
+            nullable: true
+            description: >
+                Keyword describing the methodology used to assess expression. The values for this field MUST
+                come from a controlled vocabulary.
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+        expression_raw_doi:
+            type: string
+            nullable: true
+            description: >
+                DOI of raw data set containing the current event
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+        expression_index:
+            type: string
+            nullable: true
+            description: >
+                Index addressing the current event within the raw data set.
+            x-airr:
+                miairr: defined
+        virtual_pairing:
+            type: boolean
+            nullable: true
+            description: >
+                boolean to indicate if pairing was inferred.
+            title: Virtual pairing
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Virtual pairing
+
+# The CellExpression object acts as a container to hold a single expression level measurement from
+# an experiment. Expression data is associated with a cell_id and the related repertoire_id and
+# data_processing_id as cell_id is not guaranteed to be unique outside the data processing for
+# a single repertoire.
+CellExpression:
+    type: object
+    required:
+        - expression_id
+        - repertoire_id
+        - data_processing_id
+        - cell_id
+        - property
+        - property_type
+        - value
+    properties:
+        expression_id:
+            type: string
+            description: >
+                Identifier of this expression property measurement.
+            title: Expression property measurement identifier
+            nullable: false
+            x-airr:
+                identifier: true
+                miairr: defined
+                adc-query-support: true
+                name: Expression measurement identifier
+        cell_id:
+            type: string
+            description: >
+                Identifier of the cell to which this expression data is related.
+            title: Cell identifier
+            nullable: false
+            example: W06_046_091
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Cell identifier
+        repertoire_id:
+            type: string
+            description: Identifier for the associated repertoire in study metadata.
+            title: Parental repertoire of cell
+            nullable: true
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Parental repertoire of cell
+        data_processing_id:
+            type: string
+            description: Identifier of the data processing object in the repertoire metadata for this expression measurement.
+            title: Data processing for cell
+            nullable: true
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Data processing for cell
+        property_type:
+            type: string
+            description: >
+                Keyword describing the property type and detection method used to measure the property value.
+                The following keywords are recommended, but custom property types are also valid:
+                "mrna_expression_by_read_count",
+                "protein_expression_by_fluorescence_intensity", "antigen_bait_binding_by_fluorescence_intensity",
+                "protein_expression_by_dna_barcode_count" and "antigen_bait_binding_by_dna_barcode_count".
+            nullable: false
+            title: Property type and detection method
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Property type and detection method
+        property:
+            $ref: '#/Ontology'
+            nullable: true
+            title: Property information
+            description: >
+                Name of the property observed, typically a gene or antibody identifier (and label) from a
+                canonical resource such as Ensembl (e.g. ENSG00000275747, IGHV3-79) or
+                Antibody Registry (ABREG:1236456, Purified anti-mouse/rat/human CD27 antibody).
+            example:
+                id: ENSG:ENSG00000275747
+                label: IGHV3-79
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                format: ontology
+                name: Property information
+        value:
+            type: number
+            description: Level at which the property was observed in the experiment (non-normalized).
+            title: Property value
+            nullable: true
+            example: 3
+            x-airr:
+                miairr: defined
+                adc-query-support: true
+                name: Property value
+
+
+# The Receptor object holds information about a receptor and its reactivity.
+#
+Receptor:
+    type: object
+    required:
+        - receptor_id
+        - receptor_hash
+        - receptor_type
+        - receptor_variable_domain_1_aa
+        - receptor_variable_domain_1_locus
+        - receptor_variable_domain_2_aa
+        - receptor_variable_domain_2_locus
+    properties:
+        receptor_id:
+            type: string
+            nullable: false
+            description: ID of the current Receptor object, unique within the local repository.
+            title: Receptor ID
+            example: TCR-MM-012345
+            x-airr:
+                identifier: true
+                adc-query-support: true
+        receptor_hash:
+            type: string
+            nullable: false
+            description: >
+                The SHA256 hash of the receptor amino acid sequence, calculated on the concatenated
+                ``receptor_variable_domain_*_aa`` sequences and represented as base16-encoded string.
+            title: Receptor hash ID
+            example: aa1c4b77a6f4927611ab39f5267415beaa0ba07a952c233d803b07e52261f026
+            x-airr:
+                adc-query-support: true
+        receptor_type:
+            type: string
+            nullable: false
+            enum:
+                - Ig
+                - TCR
+            description: The top-level receptor type, either Immunoglobulin (Ig) or T Cell Receptor (TCR).
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_1_aa:
+            type: string
+            nullable: false
+            description: >
+                Complete amino acid sequence of the mature variable domain of the Ig heavy, TCR beta or TCR delta chain.
+                The mature variable domain is defined as encompassing all AA from and including the first AA after the
+                signal peptide to and including the last AA that is completely encoded by the J gene.
+            example: >
+                QVQLQQPGAELVKPGASVKLSCKASGYTFTSYWMHWVKQRPGRGLEWIGRIDPNSGGTKYNEKFKSKATLTVDKPSSTAYMQLSSLTSEDSAVYYCARYDYYGSSYFDYWGQGTTLTVSS
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_1_locus:
+            type: string
+            nullable: false
+            enum:
+                - IGH
+                - TRB
+                - TRD
+            description: Locus from which the variable domain in receptor_variable_domain_1_aa originates
+            example: IGH
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_2_aa:
+            type: string
+            nullable: false
+            description: >
+                Complete amino acid sequence of the mature variable domain of the Ig light, TCR alpha or TCR gamma chain.
+                The mature variable domain is defined as encompassing all AA from and including the first AA after the
+                signal peptide to and including the last AA that is completely encoded by the J gene.
+            example: >
+                QAVVTQESALTTSPGETVTLTCRSSTGAVTTSNYANWVQEKPDHLFTGLIGGTNNRAPGVPARFSGSLIGDKAALTITGAQTEDEAIYFCALWYSNHWVFGGGTKLTVL
+            x-airr:
+                adc-query-support: true
+        receptor_variable_domain_2_locus:
+            type: string
+            nullable: false
+            enum:
+                - IGI
+                - IGK
+                - IGL
+                - TRA
+                - TRG
+            description: Locus from which the variable domain in receptor_variable_domain_2_aa originates
+            example: IGL
+            x-airr:
+                adc-query-support: true
+        receptor_ref:
+            type: array
+            nullable: true
+            description: Array of receptor identifiers defined for the Receptor object
+            title: Receptor cross-references
+            items:
+                type: string
+            example: ["IEDB_RECEPTOR:10"]
+            x-airr:
+                adc-query-support: true
+        reactivity_measurements:
+            type: array
+            nullable: true
+            description: Records of reactivity measurement
+            items:
+                $ref: '#/ReceptorReactivity'
+
+
+ReceptorReactivity:
+    type: object
+    required:
+        - ligand_type
+        - antigen_type
+        - antigen
+        - reactivity_method
+        - reactivity_readout
+        - reactivity_value
+        - reactivity_unit
+    properties:
+        ligand_type:
+            type: string
+            nullable: false
+            enum:
+                - "MHC:peptide"
+                - "MHC:non-peptide"
+                - protein
+                - peptide
+                - non-peptidic
+            description: Classification of ligand binding to receptor
+            example: non-peptidic  # must be one of the enum values listed above
+        antigen_type:
+            type: string
+            nullable: false
+            enum:
+                - protein
+                - peptide
+                - non-peptidic
+            description: >
+                The type of antigen before processing by the immune system.
+            example: protein
+        antigen:
+            $ref: '#/Ontology'
+            nullable: false
+            description: >
+                The substance against which the receptor was tested. This can be any substance that
+                stimulates an adaptive immune response in the host, either through antibody production
+                or by T cell activation after presentation via an MHC molecule.
+            title: Antigen
+            example:
+                id: UNIPROT:P19597
+                label: Circumsporozoite protein
+            x-airr:
+                adc-query-support: true
+                format: ontology
+        antigen_source_species:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The species from which the antigen was isolated
+            title: Source species of antigen
+            example:
+                id: NCBITAXON:5843
+                label: Plasmodium falciparum NF54
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: NCBITAXON:1
+                        label: root
+        peptide_start:
+            type: integer
+            nullable: true
+            description: Start position of the peptide within the reference protein sequence
+        peptide_end:
+            type: integer
+            nullable: true
+            description: End position of the peptide within the reference protein sequence
+        mhc_class:
+            type: string
+            nullable: true
+            enum:
+                - MHC-I
+                - MHC-II
+                - MHC-nonclassical
+                - null
+            description: Class of MHC molecule, only present for MHC:x ligand types
+            example: MHC-II
+        mhc_gene_1:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The MHC gene to which the mhc_allele_1 belongs
+            title: MHC gene 1
+            example:
+                id: MRO:0000055
+                label: HLA-DRA
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: MRO:0000004
+                        label: MHC gene
+        mhc_allele_1:
+            type: string
+            nullable: true
+            description: Allele designation of the MHC alpha chain
+            example: HLA-DRA
+        mhc_gene_2:
+            $ref: '#/Ontology'
+            nullable: true
+            description: The MHC gene to which the mhc_allele_2 belongs
+            title: MHC gene 2
+            example:
+                id: MRO:0000057
+                label: HLA-DRB1
+            x-airr:
+                format: ontology
+                ontology:
+                    draft: true
+                    top_node:
+                        id: MRO:0000004
+                        label: MHC gene
+        mhc_allele_2:
+            type: string
+            nullable: true
+            description: >
+                Allele designation of the MHC class II beta chain or the invariant beta2-microglobulin chain
+            example: HLA-DRB1*04:01
+        reactivity_method:
+            type: string
+            nullable: false
+            enum:
+                - SPR
+                - ITC
+                - ELISA
+                - cytometry
+                - biological_activity
+            description: The methodology used to assess reactivity (assay implemented in experiment)
+        reactivity_readout:
+            type: string
+            nullable: false
+            enum:
+                - binding_strength
+                - cytokine_release
+                - dissociation_constant_kd
+                - on_rate
+                - off_rate
+                - pathogen_inhibition
+            description: Reactivity measurement read-out
+            example: cytokine_release  # must be one of the enum values listed above
+        reactivity_value:
+            type: number
+            nullable: false
+            description: The absolute (processed) value of the measurement
+            example: 162.26
+        reactivity_unit:
+            type: string
+            nullable: false
+            description: The unit of the measurement
+            example: pg/ml
diff --git a/tests/check-consistency-formats.py b/tests/check-consistency-formats.py
index cd3d76423..98bec18fd 100755
--- a/tests/check-consistency-formats.py
+++ b/tests/check-consistency-formats.py
@@ -17,7 +17,9 @@
 spec_files = {basename(f): f for f in glob('specs/airr-schema.yaml')}
 v3spec_files = {basename(f): f for f in glob('specs/airr-schema-openapi3.yaml')}
 py_files = {basename(f): f for f in glob('lang/python/airr/specs/airr-schema.yaml')}
+py_v3_files = {basename(f): f for f in glob('lang/python/airr/specs/airr-schema-openapi3.yaml')}
 r_files = {basename(f): f for f in glob('lang/R/inst/extdata/airr-schema.yaml')}
+r_v3_files = {basename(f): f for f in glob('lang/R/inst/extdata/airr-schema-openapi3.yaml')}
 
 # Check python package specs
 if set(spec_files.keys()) != set(py_files.keys()):
@@ -26,6 +28,12 @@
     for spec in set(py_files.keys()) - set(spec_files.keys()):
         print('{} found in python package but missing from specs/'.format(spec), file=sys.stderr)
     sys.exit(1)
+if set(v3spec_files.keys()) != set(py_v3_files.keys()):
+    for spec in set(v3spec_files.keys()) - set(py_v3_files.keys()):
+        print('{} missing from python package'.format(spec), file=sys.stderr)
+    for spec in set(py_v3_files.keys()) - set(v3spec_files.keys()):
+        print('{} found in python package but missing from specs/'.format(spec), file=sys.stderr)
+    sys.exit(1)
 
 # Check R package specs
 if set(spec_files.keys()) != set(r_files.keys()):
@@ -34,7 +42,36 @@
     for spec in set(r_files.keys()) - set(spec_files.keys()):
         print('{} found in R package but missing from specs/'.format(spec), file=sys.stderr)
     sys.exit(1)
+if set(v3spec_files.keys()) != set(r_v3_files.keys()):
+    for spec in set(v3spec_files.keys()) - set(r_v3_files.keys()):
+        print('{} missing from R package'.format(spec), file=sys.stderr)
+    for spec in set(r_v3_files.keys()) - set(v3spec_files.keys()):
+        print('{} found in R package but missing from specs/'.format(spec), file=sys.stderr)
+    sys.exit(1)
+
+# Compare the OpenAPI V3 spec in specs/ against the python and R package copies
+for spec_name in v3spec_files:
+    # Load the reference spec plus both package copies of the same file name
+    with open(v3spec_files[spec_name], 'r') as ip:
+        gold_spec = yaml.safe_load(ip)
+    with open(py_v3_files[spec_name], 'r') as ip:
+        py_spec = yaml.safe_load(ip)
+    with open(r_v3_files[spec_name], 'r') as ip:
+        r_spec = yaml.safe_load(ip)
+
+    # Check python package: on any difference, print it to stderr and exit 1
+    if jsondiff.diff(gold_spec, py_spec) != {}:
+        print('{} openapi v3 spec is different from python version'.format(spec_name), file=sys.stderr)
+        print(jsondiff.diff(gold_spec, py_spec, syntax='explicit'), file=sys.stderr)
+        sys.exit(1)
+
+    # Check R package: same check (NOTE(review): diff is printed without syntax='explicit')
+    if jsondiff.diff(gold_spec, r_spec) != {}:
+        print('{} openapi v3 spec is different from R version'.format(spec_name), file=sys.stderr)
+        print(jsondiff.diff(gold_spec, r_spec), file=sys.stderr)
+        sys.exit(1)
 
+# Compare the OpenAPI V2 spec in specs/ against the python and R package copies
 for spec_name in spec_files:
     # check equality of specs
     with open(spec_files[spec_name], 'r') as ip:
@@ -46,13 +83,13 @@
 
     # Check python package
     if jsondiff.diff(gold_spec, py_spec) != {}:
-        print('{} spec is different from python version'.format(spec_name), file=sys.stderr)
+        print('{} openapi v2 spec is different from python version'.format(spec_name), file=sys.stderr)
         print(jsondiff.diff(gold_spec, py_spec, syntax='explicit'), file=sys.stderr)
         sys.exit(1)
 
     # Check R package
     if jsondiff.diff(gold_spec, r_spec) != {}:
-        print('{} spec is different from R version'.format(spec_name), file=sys.stderr)
+        print('{} openapi v2 spec is different from R version'.format(spec_name), file=sys.stderr)
         print(jsondiff.diff(gold_spec, r_spec), file=sys.stderr)
         sys.exit(1)
 

From 9412cb3459a4f910ba490417db9a345b8d24d5a8 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Mon, 26 Feb 2024 16:06:21 -0600
Subject: [PATCH 03/15] centralize test data

---
 .github/workflows/py-unittest.yaml |   5 +
 .github/workflows/r-check.yaml     |   4 +
 tests/data/bad_genotype_set.json   |  44 ++
 tests/data/bad_germline_set.json   | 351 +++++++++++
 tests/data/bad_rearrangement.tsv   |  10 +
 tests/data/bad_repertoire.yaml     | 148 +++++
 tests/data/extra_rearrangement.tsv |   2 +
 tests/data/good_combined_airr.json | 933 +++++++++++++++++++++++++++++
 tests/data/good_combined_airr.yaml | 834 ++++++++++++++++++++++++++
 tests/data/good_genotype_set.json  |  38 ++
 tests/data/good_germline_set.json  | 358 +++++++++++
 tests/data/good_rearrangement.tsv  |  10 +
 tests/data/good_repertoire.yaml    | 403 +++++++++++++
 tests/data/output_blank.json       | 231 +++++++
 tests/data/output_data.json        | 913 ++++++++++++++++++++++++++++
 tests/data/output_rep.json         | 506 ++++++++++++++++
 tests/data/warning_repertoire.json |   1 +
 17 files changed, 4791 insertions(+)
 create mode 100644 tests/data/bad_genotype_set.json
 create mode 100644 tests/data/bad_germline_set.json
 create mode 100644 tests/data/bad_rearrangement.tsv
 create mode 100644 tests/data/bad_repertoire.yaml
 create mode 100644 tests/data/extra_rearrangement.tsv
 create mode 100644 tests/data/good_combined_airr.json
 create mode 100644 tests/data/good_combined_airr.yaml
 create mode 100644 tests/data/good_genotype_set.json
 create mode 100644 tests/data/good_germline_set.json
 create mode 100644 tests/data/good_rearrangement.tsv
 create mode 100644 tests/data/good_repertoire.yaml
 create mode 100644 tests/data/output_blank.json
 create mode 100644 tests/data/output_data.json
 create mode 100644 tests/data/output_rep.json
 create mode 100644 tests/data/warning_repertoire.json

diff --git a/.github/workflows/py-unittest.yaml b/.github/workflows/py-unittest.yaml
index f531fd12f..4d24c2020 100644
--- a/.github/workflows/py-unittest.yaml
+++ b/.github/workflows/py-unittest.yaml
@@ -26,6 +26,11 @@ jobs:
         python-version: [ '3.8' ]
     steps:
       - uses: actions/checkout@v2
+
+      - name: Check test data matches the global test data files
+        run: diff -rc tests/data ../../tests/data  # fails on any difference (cwd: package dir? confirm)
+        shell: bash
+
       - name: Set up Python
         uses: actions/setup-python@v2
         with:
diff --git a/.github/workflows/r-check.yaml b/.github/workflows/r-check.yaml
index 8732673a4..1127abf90 100644
--- a/.github/workflows/r-check.yaml
+++ b/.github/workflows/r-check.yaml
@@ -26,6 +26,10 @@ jobs:
     steps:
       - uses: actions/checkout@v2
 
+      - name: Check test data matches the global test data files
+        run: diff -rc tests/data-tests ../../tests/data  # fails on any difference (cwd: package dir? confirm)
+        shell: bash
+
       - name: Install dependencies
         run: |
           install.packages(c("remotes", "testthat", "roxygen2", "devtools", "rcmdcheck"))
diff --git a/tests/data/bad_genotype_set.json b/tests/data/bad_genotype_set.json
new file mode 100644
index 000000000..c58a39027
--- /dev/null
+++ b/tests/data/bad_genotype_set.json
@@ -0,0 +1,44 @@
+{
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "name": "1234",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": "1"
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
+}
\ No newline at end of file
diff --git a/tests/data/bad_germline_set.json b/tests/data/bad_germline_set.json
new file mode 100644
index 000000000..168cc1fa5
--- /dev/null
+++ b/tests/data/bad_germline_set.json
@@ -0,0 +1,351 @@
+{
+    "GermlineSet": [{
+        "germline_set_id": "OGRDB:G00007",
+        "author": "William Lees",
+        "lab_name": "",
+        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "release_version": 1,
+        "release_description": "",
+        "release_date": "2021-11-24",
+        "germline_set_name": "CAST IGH",
+        "germline_set_ref": "OGRDB:G00007.1",
+        "pub_ids": [""],
+        "species": ["Mouse"],
+        "species_subgroup": "CAST_EiJ",
+        "species_subgroup_type": "strain",
+        "locus": "IGH",
+        "allele_descriptions": [
+            {
+                "allele_description_id": "OGRDB:A00301",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2DBF",
+                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV5-3"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": "Mouse",
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "fwr1_start": 1,
+                        "fwr1_end": 78,
+                        "cdr1_start": 79,
+                        "cdr1_end": 114,
+                        "fwr2_start": 115,
+                        "fwr2_end": 165,
+                        "cdr2_start": 166,
+                        "cdr2_end": 195,
+                        "fwr3_start": 196,
+                        "fwr3_end": 312,
+                        "cdr3_start": 313,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3",
+                "curational_tags": null
+            },
+            {
+                "allele_description_id": "OGRDB:A00314",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2ETO",
+                "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV8-2"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": "Mouse",
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "fwr1_start": 1,
+                        "fwr1_end": 78,
+                        "cdr1_start": 79,
+                        "cdr1_end": 114,
+                        "fwr2_start": 115,
+                        "fwr2_end": 165,
+                        "cdr2_start": 166,
+                        "cdr2_end": 195,
+                        "fwr3_start": 196,
+                        "fwr3_end": 312,
+                        "cdr3_start": 313,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2",
+                "curational_tags": null
+            }
+        ],
+        "notes": ""
+    }]
+}
diff --git a/tests/data/bad_rearrangement.tsv b/tests/data/bad_rearrangement.tsv
new file mode 100644
index 000000000..d12fc79fe
--- /dev/null
+++ b/tests/data/bad_rearrangement.tsv
@@ -0,0 +1,10 @@
+rearrangement_id	rearrangement_set_id	sequence_id	wrong_name	rev_comp	productive	sequence_alignment	germline_alignment	v_call	d_call	j_call	c_call	junction	junction_length	junction_aa	v_score	d_score	j_score	c_score	v_cigar	d_cigar	j_cigar	c_cigar	v_identity	v_evalue	d_identity	d_evalue	j_identity	j_evalue	v_sequence_start	v_sequence_end	v_germline_start	v_germline_end	d_sequence_start	d_sequence_end	d_germline_start	d_germline_end	j_sequence_start	j_sequence_end	j_germline_start	j_germline_end	np1_length	np2_length	duplicate_count
+IVKNQEJ01BVGQ6	1	IVKNQEJ01BVGQ6	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	430	16.4	75.8		22N1S275=	11N280S8=	6N292S32=1X9=		1	1E-122	1	2.7	0.9762	6E-18	0	275	0	317	279	287	10	18	291	333	5	47	4	4	1247
+IVKNQEJ01AQVWS	1	IVKNQEJ01AQVWS	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	420	16.4	83.8		22N1S156=1X10=1X17=1X89=	11N280S8=	6N292S42=		0.9891	8E-120	1	2.7	1	2E-20	0	275	0	317	279	287	10	18	291	333	5	47	4	4	4
+IVKNQEJ01AOYFZ	1	IVKNQEJ01AOYFZ	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG	37	CASGVAGNF*LLX	430	20.4	83.8		22N1S275=	11N280S10=	6N293S42=		1	1E-122	1	0.17	1	2E-20	0	275	0	317	279	289	10	20	292	334	5	47	4	3	92
+IVKNQEJ01EI5S4	1	IVKNQEJ01EI5S4	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	430	16.4	83.8		22N1S275=	11N280S8=	6N292S42=		1	1E-122	1	2.7	1	2E-20	0	275	0	317	279	287	10	18	291	333	5	47	4	4	2913
+IVKNQEJ01DGRRI	1	IVKNQEJ01DGRRI	GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-34*09	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	389	16.4	83.8		22N1S23=2X85=1X15=1X1=1X3=1X2=1X1=1X5=1X6=1X118=	11N274S8=	6N286S42=		0.9628	2E-110	1	2.6	1	2E-20	0	269	0	317	273	281	10	18	285	327	5	47	4	4	1
+IVKNQEJ01APN5N	1	IVKNQEJ01APN5N	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	F			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAG	36	CASGVAGTFDY*	430	16.4	67.9		22N1S275=	11N280S8=	6N292S10=1X21=1X9=		1	1E-122	1	2.7	0.9524	1E-15	0	275	0	317	279	287	10	18	291	333	5	47	4	4	1
+IVKNQEJ01B0TT2	1	IVKNQEJ01B0TT2	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG	37	CASGVAGNF*LLX	430	20.4	75.8		22N1S275=	11N280S10=	6N293S32=1X9=		1	1E-122	1	0.17	0.9762	6E-18	0	275	0	317	279	289	10	20	292	334	5	47	4	3	30
+IVKNQEJ01AIS74	1	IVKNQEJ01AIS74	GGCGCAGGACTGTTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGG	38	CARRGGW*LLTTG	424	20.4	83.8		22N1S3=1X8=1X262=	11N281S10=	6N294S42=		0.9927	9E-121	1	0.17	1	2E-20	0	275	0	317	280	290	10	20	293	335	5	47	5	3	4
+IVKNQEJ01AJ44V	1	IVKNQEJ01AJ44V	GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	T			IGHV4-59*06	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	386	16.4	75.8		22N1S45=1X5=2X6=1X3=1X5=1X22=1X4=1X1=1X1=1X165=	11N274S8=	6N286S32=1X9=		0.9625	2E-109	1	2.6	0.9762	5E-18	0	267	0	315	273	281	10	18	285	327	5	47	6	4	12
diff --git a/tests/data/bad_repertoire.yaml b/tests/data/bad_repertoire.yaml
new file mode 100644
index 000000000..2de377cb3
--- /dev/null
+++ b/tests/data/bad_repertoire.yaml
@@ -0,0 +1,148 @@
+#
+# Example metadata
+#
+
+Repertoire:
+  - repertoire_id: 1841923116114776551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        value: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        value: year
+      linked_subjects: TW01B
+      link_type: twin
+    sample:
+      - sample_id: TW01A_B_naive
+        tissue: PBMC
+        cell_subset: "Naive B cell"
+        cell_phenotype: "expression of CD20 and the absence of CD27"
+        cell_species:
+          id: "NCBITaxon_9606"
+          value: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+        sequencing_platform: "Illumina MiSeq"
+        read_length: "300"
+        sequencing_files:
+          file_type: fastq
+          filename: SRR2905656_R1.fastq.gz
+          read_direction: forward
+          paired_filename: SRR2905656_R2.fastq.gz
+          paired_read_direction: reverse
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+
+  - repertoire_id: 1602908186092376551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        value: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        value: year
+      linked_subjects: TW01B
+      link_type: twin
+    sample:
+      - sample_id: TW01A_B_memory
+        tissue: PBMC
+        cell_subset: "Memory B cell"
+        cell_phenotype: "expression of CD20 and CD27"
+        cell_species:
+          id: "NCBITaxon_9606"
+          value: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+        sequencing_platform: "Illumina MiSeq"
+        read_length: "300"
+        sequencing_files:
+          file_type: fastq
+          filename: SRR2905655_R1.fastq.gz
+          read_direction: forward
+          paired_filename: SRR2905655_R2.fastq.gz
+          paired_read_direction: reverse
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+
+  - repertoire_id: 2366080924918616551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        value: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        value: year
+      linked_subjects: TW01B
+      link_type: twin
+    sample:
+      - sample_id: TW01A_T_naive_CD4
+        tissue: PBMC
+        cell_subset: "Naive CD4+ T cell"
+        cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
+        cell_species:
+          id: "NCBITaxon_9606"
+          value: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: TRB
+        sequencing_platform: "Illumina MiSeq"
+        read_length: "300"
+        sequencing_files:
+          file_type: fastq
+          filename: SRR2905659_R1.fastq.gz
+          read_direction: forward
+          paired_filename: SRR2905659_R2.fastq.gz
+          paired_read_direction: reverse
+    data_processing:
+      - data_processing_id: 651223970338378216-242ac11b-0001-007
+        analysis_provenance_id: 4625424004665971176-242ac11c-0001-012
diff --git a/tests/data/extra_rearrangement.tsv b/tests/data/extra_rearrangement.tsv
new file mode 100644
index 000000000..8bedb960f
--- /dev/null
+++ b/tests/data/extra_rearrangement.tsv
@@ -0,0 +1,2 @@
+sequence_id	sequence	rev_comp	productive	v_call	d_call	j_call	sequence_alignment	germline_alignment	junction	junction	junction_aa	v_cigar	d_cigar	j_cigar
+1	2	F	F	5	6	7	8	9	10	11	12	13	14	15	not_in_header	not_in
diff --git a/tests/data/good_combined_airr.json b/tests/data/good_combined_airr.json
new file mode 100644
index 000000000..9101b24a9
--- /dev/null
+++ b/tests/data/good_combined_airr.json
@@ -0,0 +1,933 @@
+{
+    "Repertoire": [
+        {
+            "repertoire_id": "1841923116114776551-242ac11c-0001-012",
+            "study": {
+                "study_id": "PRJNA300878",
+                "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+                "study_type": {
+                    "id": null,
+                    "label": null
+                },
+                "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+                "inclusion_exclusion_criteria": null,
+                "lab_name": "Mark M. Davis",
+                "lab_address": "Stanford University",
+                "submitted_by": "Florian Rubelt",
+                "pub_ids": ["PMID:27005435"],
+                "collected_by": null,
+                "grants": null,
+                "keywords_study": [
+                    "contains_ig",
+                    "contains_tr"
+                ]
+            },
+            "subject": {
+                "subject_id": "TW01A",
+                "synthetic": false,
+                "species": {
+                    "id": "NCBITaxon_9606",
+                    "label": "Homo sapiens"
+                },
+                "sex": "female",
+                "age_min": 27,
+                "age_max": 27,
+                "age_unit": {
+                    "id": "UO_0000036",
+                    "label": "year"
+                },
+                "age_event": null,
+                "ancestry_population": {
+				    "id": null,
+					"label": null
+				},
+				"location_birth": {
+				    "id": null,
+					"label": null
+				},
+                "ethnicity": null,
+                "race": null,
+                "strain_name": null,
+                "linked_subjects": "TW01B",
+                "link_type": "twin",
+                "diagnosis": [
+                    {
+                        "study_group_description": null,
+                        "disease_diagnosis": {
+                            "id": null,
+                            "label": null
+                        },
+                        "disease_length": null,
+                        "disease_stage": null,
+                        "prior_therapies": null,
+                        "immunogen": null,
+                        "intervention": null,
+                        "medical_history": null
+                    }
+                ],
+                "genotype": {
+                    "receptor_genotype_set": {
+                        "receptor_genotype_set_id": "1",
+                        "genotype_class_list": [
+                            {
+                                "receptor_genotype_id": "1",
+                                "locus": "IGH",
+                                "documented_alleles": [
+                                    {
+                                        "label": "IGHV1-69*01",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 1
+                                    },
+                                    {
+                                        "label": "IGHV1-69*02",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 2
+                                    }
+                                ],
+                                "undocumented_alleles": [
+                                    {
+                                        "allele_name": "IGHD3-1*01_S1234",
+                                        "sequence": "agtagtagtagt",
+                                        "phasing": 1
+                                    }
+                                ],
+                                "deleted_genes": [
+                                    {
+                                        "label": "IGHV3-30-3",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 1
+                                    }
+                                ],
+                                "inference_process": "repertoire_sequencing"
+                            }
+                        ]
+                    },
+                    "mhc_genotype_set": {
+                        "mhc_genotype_set_id": "this is a unique identifier",
+                        "mhc_genotype_list": [
+                            {
+                                "mhc_genotype_id": "unique",
+                                "mhc_class": "MHC-I",
+                                "mhc_genotyping_method": "pcr_low_resolution",
+                                "mhc_alleles": [
+                                    {
+                                        "allele_designation": "01:01",
+                                        "gene": {
+                                            "id": "MRO-0000046",
+                                            "label": "HLA-A"
+                                        },
+                                        "reference_set_ref": "blah"
+                                    }
+                                ]
+                            }
+                        ]
+                    }
+                }
+            },
+            "sample": [
+                {
+                    "sample_id": "TW01A_B_naive",
+                    "sample_processing_id": null,
+                    "sample_type": "peripheral venous puncture",
+                    "tissue": {
+                        "id": "UBERON_0000178",
+                        "label": "blood"
+                    },
+                    "tissue_processing": "Ficoll gradient",
+                    "cell_subset": {
+                        "id": "CL_0000788",
+                        "label": "naive B cell"
+                    },
+                    "cell_phenotype": "expression of CD20 and the absence of CD27",
+                    "cell_species": {
+                        "id": "NCBITaxon_9606",
+                        "label": "Homo sapiens"
+                    },
+                    "single_cell": false,
+                    "cell_isolation": "FACS",
+                    "template_class": "RNA",
+                    "pcr_target": [
+                        {
+                            "pcr_target_locus": "IGH",
+                            "forward_pcr_primer_target_location": null,
+                            "reverse_pcr_primer_target_location": null
+                        }
+                    ],
+                    "sequencing_platform": "Illumina MiSeq",
+                    "sequencing_files": {
+                        "sequencing_data_id": "SRR2905656",
+                        "file_type": "fastq",
+                        "filename": "SRR2905656_R1.fastq.gz",
+                        "read_direction": "forward",
+                        "read_length": 300,
+                        "paired_filename": "SRR2905656_R2.fastq.gz",
+                        "paired_read_direction": "reverse",
+                        "paired_read_length": 300
+                    },
+                    "anatomic_site": null,
+                    "disease_state_sample": null,
+                    "collection_time_point_relative": null,
+                    "collection_time_point_relative_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "collection_time_point_reference": null,
+					"collection_location": {
+                        "id": null,
+                        "label": null
+                    },
+                    "biomaterial_provider": null,
+                    "cell_number": null,
+                    "cells_per_reaction": null,
+                    "cell_storage": false,
+                    "cell_quality": null,
+                    "cell_processing_protocol": null,
+                    "template_quality": null,
+                    "template_amount": null,
+                    "template_amount_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "library_generation_method": "RT(oligo-dT)+PCR",
+                    "library_generation_protocol": null,
+                    "library_generation_kit_version": null,
+                    "complete_sequences": "partial",
+                    "physical_linkage": "none",
+                    "sequencing_run_id": null,
+                    "total_reads_passing_qc_filter": null,
+                    "sequencing_facility": null,
+                    "sequencing_run_date": null,
+                    "sequencing_kit": null
+                }
+            ],
+            "data_processing": [
+                {
+                    "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+                    "primary_annotation": true,
+                    "software_versions": null,
+                    "paired_reads_assembly": null,
+                    "quality_thresholds": null,
+                    "primer_match_cutoffs": null,
+                    "collapsing_method": null,
+                    "data_processing_protocols": null,
+                    "data_processing_files": null,
+                    "germline_database": null,
+                    "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+                }
+            ]
+        },
+        {
+            "repertoire_id": "1602908186092376551-242ac11c-0001-012",
+            "study": {
+                "study_id": "PRJNA300878",
+                "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+                "study_type": {
+                    "id": null,
+                    "label": null
+                },
+                "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+                "inclusion_exclusion_criteria": null,
+                "lab_name": "Mark M. Davis",
+                "lab_address": "Stanford University",
+                "submitted_by": "Florian Rubelt",
+                "pub_ids": ["PMID:27005435"],
+                "collected_by": null,
+                "grants": null,
+                "keywords_study": [
+                    "contains_ig",
+                    "contains_tr"
+                ]
+            },
+            "subject": {
+                "subject_id": "TW01A",
+                "synthetic": false,
+                "species": {
+                    "id": "NCBITaxon_9606",
+                    "label": "Homo sapiens"
+                },
+                "sex": "female",
+                "age_min": 27,
+                "age_max": 27,
+                "age_unit": {
+                    "id": "UO_0000036",
+                    "label": "year"
+                },
+                "age_event": null,
+                "ancestry_population": {
+				    "id": null,
+					"label": null
+				},
+				"location_birth": {
+				    "id": null,
+					"label": null
+				},
+                "ethnicity": null,
+                "race": null,
+                "strain_name": null,
+                "linked_subjects": "TW01B",
+                "link_type": "twin",
+                "diagnosis": [
+                    {
+                        "study_group_description": null,
+                        "disease_diagnosis": {
+                            "id": null,
+                            "label": null
+                        },
+                        "disease_length": null,
+                        "disease_stage": null,
+                        "prior_therapies": null,
+                        "immunogen": null,
+                        "intervention": null,
+                        "medical_history": null
+                    }
+                ]
+            },
+            "sample": [
+                {
+                    "sample_id": "TW01A_B_memory",
+                    "sample_processing_id": null,
+                    "sample_type": "peripheral venous puncture",
+                    "tissue": {
+                        "id": "UBERON_0000178",
+                        "label": "blood"
+                    },
+                    "tissue_processing": "Ficoll gradient",
+                    "cell_subset": {
+                        "id": "CL_0000787",
+                        "label": "memory B cell"
+                    },
+                    "cell_phenotype": "expression of CD20 and CD27",
+                    "cell_species": {
+                        "id": "NCBITaxon_9606",
+                        "label": "Homo sapiens"
+                    },
+                    "single_cell": false,
+                    "cell_isolation": "FACS",
+                    "template_class": "RNA",
+                    "pcr_target": [
+                        {
+                            "pcr_target_locus": "IGH",
+                            "forward_pcr_primer_target_location": null,
+                            "reverse_pcr_primer_target_location": null
+                        }
+                    ],
+                    "sequencing_platform": "Illumina MiSeq",
+                    "sequencing_files": {
+                        "sequencing_data_id": "SRR2905655",
+                        "file_type": "fastq",
+                        "filename": "SRR2905655_R1.fastq.gz",
+                        "read_direction": "forward",
+                        "read_length": 300,
+                        "paired_filename": "SRR2905655_R2.fastq.gz",
+                        "paired_read_direction": "reverse",
+                        "paired_read_length": 300
+                    },
+                    "anatomic_site": null,
+                    "disease_state_sample": null,
+                    "collection_time_point_relative": null,
+                    "collection_time_point_relative_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "collection_time_point_reference": null,
+					"collection_location": {
+                        "id": null,
+                        "label": null
+                    },
+                    "biomaterial_provider": null,
+                    "cell_number": null,
+                    "cells_per_reaction": null,
+                    "cell_storage": false,
+                    "cell_quality": null,
+                    "cell_processing_protocol": null,
+                    "template_quality": null,
+                    "template_amount": null,
+                    "template_amount_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "library_generation_method": "RT(oligo-dT)+PCR",
+                    "library_generation_protocol": null,
+                    "library_generation_kit_version": null,
+                    "complete_sequences": "partial",
+                    "physical_linkage": "none",
+                    "sequencing_run_id": null,
+                    "total_reads_passing_qc_filter": null,
+                    "sequencing_facility": null,
+                    "sequencing_run_date": null,
+                    "sequencing_kit": null
+                }
+            ],
+            "data_processing": [
+                {
+                    "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+                    "primary_annotation": true,
+                    "software_versions": null,
+                    "paired_reads_assembly": null,
+                    "quality_thresholds": null,
+                    "primer_match_cutoffs": null,
+                    "collapsing_method": null,
+                    "data_processing_protocols": null,
+                    "data_processing_files": null,
+                    "germline_database": null,
+                    "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+                }
+            ]
+        },
+        {
+            "repertoire_id": "2366080924918616551-242ac11c-0001-012",
+            "study": {
+                "study_id": "PRJNA300878",
+                "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+                "study_type": {
+                    "id": null,
+                    "label": null
+                },
+                "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+                "inclusion_exclusion_criteria": null,
+                "lab_name": "Mark M. Davis",
+                "lab_address": "Stanford University",
+                "submitted_by": "Florian Rubelt",
+                "pub_ids": ["PMID:27005435"],
+                "collected_by": null,
+                "grants": null,
+                "keywords_study": [
+                    "contains_ig",
+                    "contains_tr"
+                ]
+            },
+            "subject": {
+                "subject_id": "TW01A",
+                "synthetic": false,
+                "species": {
+                    "id": "NCBITaxon_9606",
+                    "label": "Homo sapiens"
+                },
+                "sex": "female",
+                "age_min": 27,
+                "age_max": 27,
+                "age_unit": {
+                    "id": "UO_0000036",
+                    "label": "year"
+                },
+                "age_event": null,
+                "ancestry_population": {
+				    "id": null,
+					"label": null
+				},
+				"location_birth": {
+				    "id": null,
+					"label": null
+				},
+                "ethnicity": null,
+                "race": null,
+                "strain_name": null,
+                "linked_subjects": "TW01B",
+                "link_type": "twin",
+                "diagnosis": [
+                    {
+                        "study_group_description": null,
+                        "disease_diagnosis": {
+                            "id": null,
+                            "label": null
+                        },
+                        "disease_length": null,
+                        "disease_stage": null,
+                        "prior_therapies": null,
+                        "immunogen": null,
+                        "intervention": null,
+                        "medical_history": null
+                    }
+                ]
+            },
+            "sample": [
+                {
+                    "sample_id": "TW01A_T_naive_CD4",
+                    "sample_processing_id": null,
+                    "sample_type": "peripheral venous puncture",
+                    "tissue": {
+                        "id": "UBERON_0000178",
+                        "label": "blood"
+                    },
+                    "tissue_processing": "Ficoll gradient",
+                    "cell_subset": {
+                        "id": "CL_0000895",
+                        "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
+                    },
+                    "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
+                    "cell_species": {
+                        "id": "NCBITaxon_9606",
+                        "label": "Homo sapiens"
+                    },
+                    "single_cell": false,
+                    "cell_isolation": "FACS",
+                    "template_class": "RNA",
+                    "pcr_target": [
+                        {
+                            "pcr_target_locus": "TRB",
+                            "forward_pcr_primer_target_location": null,
+                            "reverse_pcr_primer_target_location": null
+                        }
+                    ],
+                    "sequencing_platform": "Illumina MiSeq",
+                    "sequencing_files": {
+                        "sequencing_data_id": "SRR2905659",
+                        "file_type": "fastq",
+                        "filename": "SRR2905659_R1.fastq.gz",
+                        "read_direction": "forward",
+                        "read_length": 300,
+                        "paired_filename": "SRR2905659_R2.fastq.gz",
+                        "paired_read_direction": "reverse",
+                        "paired_read_length": 300
+                    },
+                    "anatomic_site": null,
+                    "disease_state_sample": null,
+                    "collection_time_point_relative": null,
+                    "collection_time_point_relative_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "collection_time_point_reference": null,
+					"collection_location": {
+                        "id": null,
+                        "label": null
+                    },
+                    "biomaterial_provider": null,
+                    "cell_number": null,
+                    "cells_per_reaction": null,
+                    "cell_storage": false,
+                    "cell_quality": null,
+                    "cell_processing_protocol": null,
+                    "template_quality": null,
+                    "template_amount": null,
+                    "template_amount_unit": {
+                        "id": null,
+                        "label": null
+                    },
+                    "library_generation_method": "RT(oligo-dT)+PCR",
+                    "library_generation_protocol": null,
+                    "library_generation_kit_version": null,
+                    "complete_sequences": "partial",
+                    "physical_linkage": "none",
+                    "sequencing_run_id": null,
+                    "total_reads_passing_qc_filter": null,
+                    "sequencing_facility": null,
+                    "sequencing_run_date": null,
+                    "sequencing_kit": null
+                }
+            ],
+            "data_processing": [
+                {
+                    "data_processing_id": "651223970338378216-242ac11b-0001-007",
+                    "primary_annotation": true,
+                    "software_versions": null,
+                    "paired_reads_assembly": null,
+                    "quality_thresholds": null,
+                    "primer_match_cutoffs": null,
+                    "collapsing_method": null,
+                    "data_processing_protocols": null,
+                    "data_processing_files": null,
+                    "germline_database": null,
+                    "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012"
+                }
+            ]
+        }
+    ],
+
+
+    "GermlineSet": [{
+        "germline_set_id": "OGRDB:G00007",
+        "author": "William Lees",
+        "lab_name": "",
+        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "acknowledgements": [],
+        "release_version": 1,
+        "release_description": "",
+        "release_date": "2021-11-24",
+        "germline_set_name": "CAST IGH",
+        "germline_set_ref": "OGRDB:G00007.1",
+        "pub_ids": [""],
+        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species_subgroup": "CAST_EiJ",
+        "species_subgroup_type": "strain",
+        "locus": "IGH",
+        "allele_descriptions": [
+            {
+                "allele_description_id": "OGRDB:A00301",
+                "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2DBF",
+                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV5-3"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "fwr1_start": 1,
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3",
+                "curational_tags": null
+            },
+            {
+                "allele_description_id": "OGRDB:A00314",
+                "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2ETO",
+                "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV8-2"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "fwr1_start": 1,
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2",
+                "curational_tags": null
+            }
+        ],
+        "curation": null
+    }],
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
+}
diff --git a/tests/data/good_combined_airr.yaml b/tests/data/good_combined_airr.yaml
new file mode 100644
index 000000000..80d0fe3a2
--- /dev/null
+++ b/tests/data/good_combined_airr.yaml
@@ -0,0 +1,834 @@
+Repertoire:
+  - repertoire_id: 1841923116114776551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: Homo sapiens B and T cell repertoire - MZ twins
+      study_type:
+        id:
+        label:
+      study_description: The adaptive immune system's capability to protect the body
+        requires a highly diverse lymphocyte antigen receptor repertoire. However, the
+        influence of individual genetic and epigenetic differences on these repertoires
+        is not typically measured. By leveraging the unique characteristics of B, CD4+
+        T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified
+        the impact of heritable factors on both the V(D)J recombination process and
+        thymic selection in the case of T cell receptors, and show that the repertoires
+        of both naive and antigen experienced cells are subject to biases resulting
+        from differences in recombination. We show that biases in V(D)J usage, as well
+        as biased N/P additions, contribute to significant variation in the CDR3 region.
+        Moreover, we show that the relative usage of V and J gene segments is chromosomally
+        biased, with approximately 1.5 times as many rearrangements originating from
+        a single chromosome. These data refine our understanding of the heritable mechanisms
+        affecting the repertoire, and show that biases are evident on a chromosome-wide
+        level.
+      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
+      inclusion_exclusion_criteria:
+      lab_name: Mark M. Davis
+      lab_address: Stanford University
+      submitted_by: Florian Rubelt
+      pub_ids: ["PMID:27005435"]
+      collected_by:
+      grants:
+      keywords_study:
+        - contains_ig
+        - contains_tr
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: NCBITaxon_9606
+        label: Homo sapiens
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event:
+      ancestry_population:
+        id:
+        label:
+      location_birth:
+        id:
+        label:
+      ethnicity:
+      race:
+      strain_name:
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description:
+          disease_diagnosis:
+            id:
+            label:
+          disease_length:
+          disease_stage:
+          prior_therapies:
+          immunogen:
+          intervention:
+          medical_history:
+      genotype:
+        receptor_genotype_set:
+          receptor_genotype_set_id: '1'
+          genotype_class_list:
+            - receptor_genotype_id: '1'
+              locus: IGH
+              documented_alleles:
+                - label: IGHV1-69*01
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 1
+                - label: IGHV1-69*02
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 2
+              undocumented_alleles:
+                - allele_name: IGHD3-1*01_S1234
+                  sequence: agtagtagtagt
+                  phasing: 1
+              deleted_genes:
+                - label: IGHV3-30-3
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 1
+              inference_process: repertoire_sequencing
+        mhc_genotype_set:
+          mhc_genotype_set_id: "this is a unique identifier"
+          mhc_genotype_list:
+            - mhc_genotype_id: unique
+              mhc_class: MHC-I
+              mhc_genotyping_method: pcr_low_resolution
+              mhc_alleles:
+                - allele_designation: "01:01"
+                  gene:
+                    id: "MRO-0000046"
+                    label: "HLA-A"
+                  reference_set_ref: blah
+    sample:
+      - sample_id: TW01A_B_naive
+        sample_processing_id:
+        sample_type: peripheral venous puncture
+        tissue:
+          id: UBERON_0000178
+          label: blood
+        tissue_processing: Ficoll gradient
+        cell_subset:
+          id: CL_0000788
+          label: naive B cell
+        cell_phenotype: expression of CD20 and the absence of CD27
+        cell_species:
+          id: NCBITaxon_9606
+          label: Homo sapiens
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+            forward_pcr_primer_target_location:
+            reverse_pcr_primer_target_location:
+        sequencing_platform: Illumina MiSeq
+        sequencing_files:
+          sequencing_data_id: SRR2905656
+          file_type: fastq
+          filename: SRR2905656_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905656_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+        anatomic_site:
+        disease_state_sample:
+        collection_time_point_relative:
+        collection_time_point_relative_unit:
+          id:
+          label:
+        collection_time_point_reference:
+        collection_location:
+          id:
+          label:
+        biomaterial_provider:
+        cell_number:
+        cells_per_reaction:
+        cell_storage: false
+        cell_quality:
+        cell_processing_protocol:
+        template_quality:
+        template_amount:
+        template_amount_unit:
+          id:
+          label:
+        library_generation_method: RT(oligo-dT)+PCR
+        library_generation_protocol:
+        library_generation_kit_version:
+        complete_sequences: partial
+        physical_linkage: none
+        sequencing_run_id:
+        total_reads_passing_qc_filter:
+        sequencing_facility:
+        sequencing_run_date:
+        sequencing_kit:
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions:
+        paired_reads_assembly:
+        quality_thresholds:
+        primer_match_cutoffs:
+        collapsing_method:
+        data_processing_protocols:
+        data_processing_files:
+        germline_database:
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+  - repertoire_id: 1602908186092376551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: Homo sapiens B and T cell repertoire - MZ twins
+      study_type:
+        id:
+        label:
+      study_description: The adaptive immune system's capability to protect the body
+        requires a highly diverse lymphocyte antigen receptor repertoire. However, the
+        influence of individual genetic and epigenetic differences on these repertoires
+        is not typically measured. By leveraging the unique characteristics of B, CD4+
+        T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified
+        the impact of heritable factors on both the V(D)J recombination process and
+        thymic selection in the case of T cell receptors, and show that the repertoires
+        of both naive and antigen experienced cells are subject to biases resulting
+        from differences in recombination. We show that biases in V(D)J usage, as well
+        as biased N/P additions, contribute to significant variation in the CDR3 region.
+        Moreover, we show that the relative usage of V and J gene segments is chromosomally
+        biased, with approximately 1.5 times as many rearrangements originating from
+        a single chromosome. These data refine our understanding of the heritable mechanisms
+        affecting the repertoire, and show that biases are evident on a chromosome-wide
+        level.
+      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
+      inclusion_exclusion_criteria:
+      lab_name: Mark M. Davis
+      lab_address: Stanford University
+      submitted_by: Florian Rubelt
+      pub_ids: ["PMID:27005435"]
+      collected_by:
+      grants:
+      keywords_study:
+        - contains_ig
+        - contains_tr
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: NCBITaxon_9606
+        label: Homo sapiens
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event:
+      ancestry_population:
+        id:
+        label:
+      location_birth:
+        id:
+        label:
+      ethnicity:
+      race:
+      strain_name:
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description:
+          disease_diagnosis:
+            id:
+            label:
+          disease_length:
+          disease_stage:
+          prior_therapies:
+          immunogen:
+          intervention:
+          medical_history:
+    sample:
+      - sample_id: TW01A_B_memory
+        sample_processing_id:
+        sample_type: peripheral venous puncture
+        tissue:
+          id: UBERON_0000178
+          label: blood
+        tissue_processing: Ficoll gradient
+        cell_subset:
+          id: CL_0000787
+          label: memory B cell
+        cell_phenotype: expression of CD20 and CD27
+        cell_species:
+          id: NCBITaxon_9606
+          label: Homo sapiens
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+            forward_pcr_primer_target_location:
+            reverse_pcr_primer_target_location:
+        sequencing_platform: Illumina MiSeq
+        sequencing_files:
+          sequencing_data_id: SRR2905655
+          file_type: fastq
+          filename: SRR2905655_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905655_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+        anatomic_site:
+        disease_state_sample:
+        collection_time_point_relative:
+        collection_time_point_relative_unit:
+          id:
+          label:
+        collection_time_point_reference:
+        collection_location:
+          id:
+          label:
+        biomaterial_provider:
+        cell_number:
+        cells_per_reaction:
+        cell_storage: false
+        cell_quality:
+        cell_processing_protocol:
+        template_quality:
+        template_amount:
+        template_amount_unit:
+          id:
+          label:
+        library_generation_method: RT(oligo-dT)+PCR
+        library_generation_protocol:
+        library_generation_kit_version:
+        complete_sequences: partial
+        physical_linkage: none
+        sequencing_run_id:
+        total_reads_passing_qc_filter:
+        sequencing_facility:
+        sequencing_run_date:
+        sequencing_kit:
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions:
+        paired_reads_assembly:
+        quality_thresholds:
+        primer_match_cutoffs:
+        collapsing_method:
+        data_processing_protocols:
+        data_processing_files:
+        germline_database:
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+  - repertoire_id: 2366080924918616551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: Homo sapiens B and T cell repertoire - MZ twins
+      study_type:
+        id:
+        label:
+      study_description: The adaptive immune system's capability to protect the body
+        requires a highly diverse lymphocyte antigen receptor repertoire. However, the
+        influence of individual genetic and epigenetic differences on these repertoires
+        is not typically measured. By leveraging the unique characteristics of B, CD4+
+        T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified
+        the impact of heritable factors on both the V(D)J recombination process and
+        thymic selection in the case of T cell receptors, and show that the repertoires
+        of both naive and antigen experienced cells are subject to biases resulting
+        from differences in recombination. We show that biases in V(D)J usage, as well
+        as biased N/P additions, contribute to significant variation in the CDR3 region.
+        Moreover, we show that the relative usage of V and J gene segments is chromosomally
+        biased, with approximately 1.5 times as many rearrangements originating from
+        a single chromosome. These data refine our understanding of the heritable mechanisms
+        affecting the repertoire, and show that biases are evident on a chromosome-wide
+        level.
+      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
+      inclusion_exclusion_criteria:
+      lab_name: Mark M. Davis
+      lab_address: Stanford University
+      submitted_by: Florian Rubelt
+      pub_ids: ["PMID:27005435"]
+      collected_by:
+      grants:
+      keywords_study:
+        - contains_ig
+        - contains_tr
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: NCBITaxon_9606
+        label: Homo sapiens
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event:
+      ancestry_population:
+        id:
+        label:
+      location_birth:
+        id:
+        label:
+      ethnicity:
+      race:
+      strain_name:
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description:
+          disease_diagnosis:
+            id:
+            label:
+          disease_length:
+          disease_stage:
+          prior_therapies:
+          immunogen:
+          intervention:
+          medical_history:
+    sample:
+      - sample_id: TW01A_T_naive_CD4
+        sample_processing_id:
+        sample_type: peripheral venous puncture
+        tissue:
+          id: UBERON_0000178
+          label: blood
+        tissue_processing: Ficoll gradient
+        cell_subset:
+          id: CL_0000895
+          label: naive thymus-derived CD4-positive, alpha-beta T cell
+        cell_phenotype: expression of CD8 and absence of CD4 and CD45RO
+        cell_species:
+          id: NCBITaxon_9606
+          label: Homo sapiens
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: TRB
+            forward_pcr_primer_target_location:
+            reverse_pcr_primer_target_location:
+        sequencing_platform: Illumina MiSeq
+        sequencing_files:
+          sequencing_data_id: SRR2905659
+          file_type: fastq
+          filename: SRR2905659_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905659_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+        anatomic_site:
+        disease_state_sample:
+        collection_time_point_relative:
+        collection_time_point_relative_unit:
+          id:
+          label:
+        collection_time_point_reference:
+        collection_location:
+          id:
+          label:
+        biomaterial_provider:
+        cell_number:
+        cells_per_reaction:
+        cell_storage: false
+        cell_quality:
+        cell_processing_protocol:
+        template_quality:
+        template_amount:
+        template_amount_unit:
+          id:
+          label:
+        library_generation_method: RT(oligo-dT)+PCR
+        library_generation_protocol:
+        library_generation_kit_version:
+        complete_sequences: partial
+        physical_linkage: none
+        sequencing_run_id:
+        total_reads_passing_qc_filter:
+        sequencing_facility:
+        sequencing_run_date:
+        sequencing_kit:
+    data_processing:
+      - data_processing_id: 651223970338378216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions:
+        paired_reads_assembly:
+        quality_thresholds:
+        primer_match_cutoffs:
+        collapsing_method:
+        data_processing_protocols:
+        data_processing_files:
+        germline_database:
+        analysis_provenance_id: 4625424004665971176-242ac11c-0001-012
+
+GermlineSet:
+- acknowledgements: []
+  allele_descriptions:
+  - acknowledgements: []
+    aliases:
+    - watson_et_al:CAST_EiJ_IGHV5-3
+    allele_description_id: OGRDB:A00301
+    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
+    allele_designation: null
+    chromosome: null
+    coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3'
+    curational_tags: null
+    functional: true
+    gene_designation: null
+    gene_end: null
+    gene_start: null
+    inference_type: rearranged_only
+    lab_address: Birkbeck College, University of London, Malet Street, London
+    label: IGHV-2DBF
+    leader_1_end: null
+    leader_1_start: null
+    leader_2_end: null
+    leader_2_start: null
+    locus: IGH
+    maintainer: William Lees
+    paralogs: []
+    rearranged_support: []
+    release_date: 24-Nov-2021
+    release_description: First release
+    release_version: 1
+    sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+    sequence_type: V
+    species:
+      id: NCBITAXON:10090
+      label: Mus musculus
+    species_subgroup: CAST_EiJ
+    species_subgroup_type: strain
+    status: active
+    subgroup_designation: null
+    unrearranged_support: []
+    utr_5_prime_end: null
+    utr_5_prime_start: null
+    v_gene_delineations:
+    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+      alignment:
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      - '6'
+      - '7'
+      - '8'
+      - '9'
+      - '10'
+      - '11'
+      - '12'
+      - '13'
+      - '14'
+      - '15'
+      - '16'
+      - '17'
+      - '18'
+      - '19'
+      - '20'
+      - '21'
+      - '22'
+      - '23'
+      - '24'
+      - '25'
+      - '26'
+      - '27'
+      - '28'
+      - '29'
+      - '30'
+      - '31'
+      - '32'
+      - '33'
+      - '34'
+      - '35'
+      - '36'
+      - '37'
+      - '38'
+      - '39'
+      - '40'
+      - '41'
+      - '42'
+      - '43'
+      - '44'
+      - '45'
+      - '46'
+      - '47'
+      - '48'
+      - '49'
+      - '50'
+      - '51'
+      - '52'
+      - '53'
+      - '54'
+      - '55'
+      - '56'
+      - '57'
+      - '58'
+      - '59'
+      - '60'
+      - '61'
+      - '62'
+      - '63'
+      - '64'
+      - '65'
+      - '66'
+      - '67'
+      - '68'
+      - '69'
+      - '70'
+      - '71'
+      - '72'
+      - '73'
+      - '74'
+      - '75'
+      - '76'
+      - '77'
+      - '78'
+      - '79'
+      - '80'
+      - '81'
+      - '82'
+      - '83'
+      - '84'
+      - '85'
+      - '86'
+      - '87'
+      - '88'
+      - '89'
+      - '90'
+      - '91'
+      - '92'
+      - '93'
+      - '94'
+      - '95'
+      - '96'
+      - '97'
+      - '98'
+      - '99'
+      - '100'
+      - '101'
+      - '102'
+      - '103'
+      - '104'
+      cdr1_end: 110
+      cdr1_start: 76
+      cdr2_end: 160
+      cdr2_start: 151
+      cdr3_start: 295
+      delineation_scheme: IMGT
+      fwr1_end: 75
+      fwr1_start: 1
+      fwr2_end: 150
+      fwr2_start: 111
+      fwr3_end: 294
+      fwr3_start: 161
+      sequence_delineation_id: '1'
+      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+    v_rs_end: null
+    v_rs_start: null
+  - acknowledgements: []
+    aliases:
+    - watson_et_al:CAST_EiJ_IGHV8-2
+    allele_description_id: OGRDB:A00314
+    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
+    allele_designation: null
+    chromosome: null
+    coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2'
+    curational_tags: null
+    functional: true
+    gene_designation: null
+    gene_end: null
+    gene_start: null
+    inference_type: rearranged_only
+    lab_address: Birkbeck College, University of London, Malet Street, London
+    label: IGHV-2ETO
+    leader_1_end: null
+    leader_1_start: null
+    leader_2_end: null
+    leader_2_start: null
+    locus: IGH
+    maintainer: William Lees
+    paralogs: []
+    rearranged_support: []
+    release_date: 24-Nov-2021
+    release_description: First release
+    release_version: 1
+    sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+    sequence_type: V
+    species:
+      id: NCBITAXON:10090
+      label: Mus musculus
+    species_subgroup: CAST_EiJ
+    species_subgroup_type: strain
+    status: active
+    subgroup_designation: null
+    unrearranged_support: []
+    utr_5_prime_end: null
+    utr_5_prime_start: null
+    v_gene_delineations:
+    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+      alignment:
+      - '1'
+      - '2'
+      - '3'
+      - '4'
+      - '5'
+      - '6'
+      - '7'
+      - '8'
+      - '9'
+      - '10'
+      - '11'
+      - '12'
+      - '13'
+      - '14'
+      - '15'
+      - '16'
+      - '17'
+      - '18'
+      - '19'
+      - '20'
+      - '21'
+      - '22'
+      - '23'
+      - '24'
+      - '25'
+      - '26'
+      - '27'
+      - '28'
+      - '29'
+      - '30'
+      - '31'
+      - '32'
+      - '33'
+      - '34'
+      - '35'
+      - '36'
+      - '37'
+      - '38'
+      - '39'
+      - '40'
+      - '41'
+      - '42'
+      - '43'
+      - '44'
+      - '45'
+      - '46'
+      - '47'
+      - '48'
+      - '49'
+      - '50'
+      - '51'
+      - '52'
+      - '53'
+      - '54'
+      - '55'
+      - '56'
+      - '57'
+      - '58'
+      - '59'
+      - '60'
+      - '61'
+      - '62'
+      - '63'
+      - '64'
+      - '65'
+      - '66'
+      - '67'
+      - '68'
+      - '69'
+      - '70'
+      - '71'
+      - '72'
+      - '73'
+      - '74'
+      - '75'
+      - '76'
+      - '77'
+      - '78'
+      - '79'
+      - '80'
+      - '81'
+      - '82'
+      - '83'
+      - '84'
+      - '85'
+      - '86'
+      - '87'
+      - '88'
+      - '89'
+      - '90'
+      - '91'
+      - '92'
+      - '93'
+      - '94'
+      - '95'
+      - '96'
+      - '97'
+      - '98'
+      - '99'
+      - '100'
+      - '101'
+      - '102'
+      - '103'
+      - '104'
+      cdr1_end: 110
+      cdr1_start: 76
+      cdr2_end: 160
+      cdr2_start: 151
+      cdr3_start: 295
+      delineation_scheme: IMGT
+      fwr1_end: 75
+      fwr1_start: 1
+      fwr2_end: 150
+      fwr2_start: 111
+      fwr3_end: 294
+      fwr3_start: 161
+      sequence_delineation_id: '1'
+      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+    v_rs_end: null
+    v_rs_start: null
+  author: William Lees
+  curation: null
+  germline_set_id: OGRDB:G00007
+  germline_set_name: CAST IGH
+  germline_set_ref: OGRDB:G00007.1
+  lab_address: Birkbeck College, University of London, Malet Street, London
+  lab_name: ''
+  locus: IGH
+  pub_ids: ['']
+  release_date: '2021-11-24'
+  release_description: ''
+  release_version: 1
+  species:
+    id: NCBITAXON:10090
+    label: Mus musculus
+  species_subgroup: CAST_EiJ
+  species_subgroup_type: strain
+
+
+GenotypeSet:
+  - receptor_genotype_set_id: '1'
+    genotype_class_list:
+      - receptor_genotype_id: '1'
+        locus: IGH
+        documented_alleles:
+          - label: IGHV1-69*01
+            germline_set_ref: IMGT:Homo sapiens:2022.1.31
+            phasing: 1
+          - label: IGHV1-69*02
+            germline_set_ref: IMGT:Homo sapiens:2022.1.31
+            phasing: 2
+        undocumented_alleles:
+          - allele_name: IGHD3-1*01_S1234
+            sequence: agtagtagtagt
+            phasing: 1
+        deleted_genes:
+          - label: IGHV3-30-3
+            germline_set_ref: IMGT:Homo sapiens:2022.1.31
+            phasing: 1
+        inference_process: repertoire_sequencing
diff --git a/tests/data/good_genotype_set.json b/tests/data/good_genotype_set.json
new file mode 100644
index 000000000..ba10f56e9
--- /dev/null
+++ b/tests/data/good_genotype_set.json
@@ -0,0 +1,38 @@
+{
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
+}
\ No newline at end of file
diff --git a/tests/data/good_germline_set.json b/tests/data/good_germline_set.json
new file mode 100644
index 000000000..41ecf5f7d
--- /dev/null
+++ b/tests/data/good_germline_set.json
@@ -0,0 +1,358 @@
+{
+    "GermlineSet": [{
+        "germline_set_id": "OGRDB:G00007",
+        "author": "William Lees",
+        "lab_name": "",
+        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "acknowledgements": [],
+        "release_version": 1,
+        "release_description": "",
+        "release_date": "2021-11-24",
+        "germline_set_name": "CAST IGH",
+        "germline_set_ref": "OGRDB:G00007.1",
+        "pub_ids": [""],
+        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species_subgroup": "CAST_EiJ",
+        "species_subgroup_type": "strain",
+        "locus": "IGH",
+        "allele_descriptions": [
+            {
+                "allele_description_id": "OGRDB:A00301",
+                "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2DBF",
+                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV5-3"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "fwr1_start": 1,
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3",
+                "curational_tags": null
+            },
+            {
+                "allele_description_id": "OGRDB:A00314",
+                "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
+                "maintainer": "William Lees",
+                "acknowledgements": [],
+                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "release_version": 1,
+                "release_date": "24-Nov-2021",
+                "release_description": "First release",
+                "label": "IGHV-2ETO",
+                "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "aliases": [
+                    "watson_et_al:CAST_EiJ_IGHV8-2"
+                ],
+                "locus": "IGH",
+                "chromosome": null,
+                "sequence_type": "V",
+                "functional": true,
+                "inference_type": "rearranged_only",
+                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species_subgroup": "CAST_EiJ",
+                "species_subgroup_type": "strain",
+                "status": "active",
+                "gene_designation": null,
+                "subgroup_designation": null,
+                "allele_designation": null,
+                "gene_start": null,
+                "gene_end": null,
+                "utr_5_prime_start": null,
+                "utr_5_prime_end": null,
+                "leader_1_start": null,
+                "leader_1_end": null,
+                "leader_2_start": null,
+                "leader_2_end": null,
+                "v_rs_start": null,
+                "v_rs_end": null,
+                "v_gene_delineations": [
+                    {
+                        "sequence_delineation_id": "1",
+                        "delineation_scheme": "IMGT",
+                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "fwr1_start": 1,
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment": [
+                            "1",
+                            "2",
+                            "3",
+                            "4",
+                            "5",
+                            "6",
+                            "7",
+                            "8",
+                            "9",
+                            "10",
+                            "11",
+                            "12",
+                            "13",
+                            "14",
+                            "15",
+                            "16",
+                            "17",
+                            "18",
+                            "19",
+                            "20",
+                            "21",
+                            "22",
+                            "23",
+                            "24",
+                            "25",
+                            "26",
+                            "27",
+                            "28",
+                            "29",
+                            "30",
+                            "31",
+                            "32",
+                            "33",
+                            "34",
+                            "35",
+                            "36",
+                            "37",
+                            "38",
+                            "39",
+                            "40",
+                            "41",
+                            "42",
+                            "43",
+                            "44",
+                            "45",
+                            "46",
+                            "47",
+                            "48",
+                            "49",
+                            "50",
+                            "51",
+                            "52",
+                            "53",
+                            "54",
+                            "55",
+                            "56",
+                            "57",
+                            "58",
+                            "59",
+                            "60",
+                            "61",
+                            "62",
+                            "63",
+                            "64",
+                            "65",
+                            "66",
+                            "67",
+                            "68",
+                            "69",
+                            "70",
+                            "71",
+                            "72",
+                            "73",
+                            "74",
+                            "75",
+                            "76",
+                            "77",
+                            "78",
+                            "79",
+                            "80",
+                            "81",
+                            "82",
+                            "83",
+                            "84",
+                            "85",
+                            "86",
+                            "87",
+                            "88",
+                            "89",
+                            "90",
+                            "91",
+                            "92",
+                            "93",
+                            "94",
+                            "95",
+                            "96",
+                            "97",
+                            "98",
+                            "99",
+                            "100",
+                            "101",
+                            "102",
+                            "103",
+                            "104"
+                        ]
+                    }
+                ],
+                "unrearranged_support": [],
+                "rearranged_support": [],
+                "paralogs": [],
+                "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2",
+                "curational_tags": null
+            }
+        ],
+        "curation": null
+    }]
+}
diff --git a/tests/data/good_rearrangement.tsv b/tests/data/good_rearrangement.tsv
new file mode 100644
index 000000000..e8521767d
--- /dev/null
+++ b/tests/data/good_rearrangement.tsv
@@ -0,0 +1,10 @@
+rearrangement_id	rearrangement_set_id	sequence_id	sequence	rev_comp	productive	sequence_alignment	germline_alignment	v_call	d_call	j_call	c_call	junction	junction_length	junction_aa	v_score	d_score	j_score	c_score	v_cigar	d_cigar	j_cigar	c_cigar	v_identity	v_evalue	d_identity	d_evalue	j_identity	j_evalue	v_sequence_start	v_sequence_end	v_germline_start	v_germline_end	d_sequence_start	d_sequence_end	d_germline_start	d_germline_end	j_sequence_start	j_sequence_end	j_germline_start	j_germline_end	np1_length	np2_length	duplicate_count
+IVKNQEJ01BVGQ6	1	IVKNQEJ01BVGQ6	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	430	16.4	75.8		22N1S275=	11N280S8=	6N292S32=1X9=		1	1E-122	1	2.7	0.9762	6E-18	0	275	0	317	279	287	10	18	291	333	5	47	4	4	1247
+IVKNQEJ01AQVWS	1	IVKNQEJ01AQVWS	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCAACTACAACCCCTCCCTCAAGAGTCGAGTCACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	420	16.4	83.8		22N1S156=1X10=1X17=1X89=	11N280S8=	6N292S42=		0.9891	8E-120	1	2.7	1	2E-20	0	275	0	317	279	287	10	18	291	333	5	47	4	4	4
+IVKNQEJ01AOYFZ	1	IVKNQEJ01AOYFZ	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG	37	CASGVAGNF*LLX	430	20.4	83.8		22N1S275=	11N280S10=	6N293S42=		1	1E-122	1	0.17	1	2E-20	0	275	0	317	279	289	10	20	292	334	5	47	4	3	92
+IVKNQEJ01EI5S4	1	IVKNQEJ01EI5S4	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	430	16.4	83.8		22N1S275=	11N280S8=	6N292S42=		1	1E-122	1	2.7	1	2E-20	0	275	0	317	279	287	10	18	291	333	5	47	4	4	2913
+IVKNQEJ01DGRRI	1	IVKNQEJ01DGRRI	GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCCCCCAGGGAAGGGTCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	T			IGHV4-34*09	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	389	16.4	83.8		22N1S23=2X85=1X15=1X1=1X3=1X2=1X1=1X5=1X6=1X118=	11N274S8=	6N286S42=		0.9628	2E-110	1	2.6	1	2E-20	0	269	0	317	273	281	10	18	285	327	5	47	4	4	1
+IVKNQEJ01APN5N	1	IVKNQEJ01APN5N	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	F			IGHV4-31*03	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTAG	36	CASGVAGTFDY*	430	16.4	67.9		22N1S275=	11N280S8=	6N292S10=1X21=1X9=		1	1E-122	1	2.7	0.9524	1E-15	0	275	0	317	279	287	10	18	291	333	5	47	4	4	1
+IVKNQEJ01B0TT2	1	IVKNQEJ01B0TT2	GGCCCAGGACTGGTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGTAACTTTTGACTACTGG	37	CASGVAGNF*LLX	430	20.4	75.8		22N1S275=	11N280S10=	6N293S32=1X9=		1	1E-122	1	0.17	0.9762	6E-18	0	275	0	317	279	289	10	20	292	334	5	47	4	3	30
+IVKNQEJ01AIS74	1	IVKNQEJ01AIS74	GGCGCAGGACTGTTGAAGCCTTCACAGACCCTGTCCCTCACCTGCACTGTCTCTGGTGGCTCCATCAGCAGTGGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCA	T	F			IGHV4-31*03	IGHD6-19*01	IGHJ4*02		TGTGCGAGGCGGGGTGGCTGGTAACTTTTGACTACTGG	38	CARRGGW*LLTTG	424	20.4	83.8		22N1S3=1X8=1X262=	11N281S10=	6N294S42=		0.9927	9E-121	1	0.17	1	2E-20	0	275	0	317	280	290	10	20	293	335	5	47	5	3	4
+IVKNQEJ01AJ44V	1	IVKNQEJ01AJ44V	GGCCCAGGACTGGTGAAGCCTTCGGAGACCCTGTCCCTCACCTGCGCTGTCTATGGTGGGTCCTTCAGTGGTTACTACTGGAGCTGGATCCGCCAGCACCCAGGGAAGGGCCTGGAGTGGATTGGGTACATCTATTACAGTGGGAGCACCTACTACAACCCGTCCCTCAAGAGTCGAGTTACCATATCAGTAGACACGTCTAAGAACCAGTTCTCCCTGAAGCTGAGCTCTGTGACTGCCGCGGACACGGCCGTGTATTACTGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGGGGCCAGGGAACCCTGGTCACTGTCTCCTCA	T	T			IGHV4-59*06	IGHD1-7*01,IGHD6-19*01	IGHJ4*02		TGTGCGAGCGGGGTGGCTGGAACTTTTGACTACTGG	36	CASGVAGTFDYW	386	16.4	75.8		22N1S45=1X5=2X6=1X3=1X5=1X22=1X4=1X1=1X1=1X165=	11N274S8=	6N286S32=1X9=		0.9625	2E-109	1	2.6	0.9762	5E-18	0	267	0	315	273	281	10	18	285	327	5	47	6	4	12
diff --git a/tests/data/good_repertoire.yaml b/tests/data/good_repertoire.yaml
new file mode 100644
index 000000000..9bf3a4653
--- /dev/null
+++ b/tests/data/good_repertoire.yaml
@@ -0,0 +1,403 @@
+#
+# Example metadata
+#
+
+Repertoire:
+  - repertoire_id: 1841923116114776551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_type:
+        id: null
+        label: null
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
+      inclusion_exclusion_criteria: null
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+      collected_by: null
+      grants: null
+      keywords_study: 
+        - "contains_ig"
+        - "contains_tr"
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        label: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
+      ethnicity: null
+      race: null
+      strain_name: null
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description: null
+          disease_diagnosis:
+            id: null
+            label: null
+          disease_length: null
+          disease_stage: null
+          prior_therapies: null
+          immunogen: null
+          intervention: null
+          medical_history: null
+
+    sample:
+      - sample_id: TW01A_B_naive
+        sample_processing_id: null
+        sample_type: "peripheral venous puncture"
+        tissue:
+          id: "UBERON_0000178"
+          label: "blood"
+        tissue_processing: "Ficoll gradient"
+        cell_subset:
+          id: "CL_0000788"
+          label: "naive B cell"
+        cell_phenotype: "expression of CD20 and the absence of CD27"
+        cell_species:
+          id: "NCBITaxon_9606"
+          label: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+            forward_pcr_primer_target_location: null
+            reverse_pcr_primer_target_location: null
+        sequencing_platform: "Illumina MiSeq"
+        sequencing_files:
+          sequencing_data_id: SRA:SRR2905656
+          file_type: fastq
+          filename: SRR2905656_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905656_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+          index_filename: SRR2905656_R3.fastq.gz
+          index_length: 8
+        anatomic_site: null
+        disease_state_sample: null
+        collection_time_point_relative: null
+        collection_time_point_relative_unit:
+          id: null
+          label: null
+        collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
+        biomaterial_provider: null
+        cell_number: null
+        cells_per_reaction: null
+        cell_storage: false
+        cell_quality: null
+        cell_processing_protocol: null
+        template_quality: null
+        template_amount: null
+        template_amount_unit:
+          id: null
+          label: null
+        library_generation_method: "RT(oligo-dT)+PCR"
+        library_generation_protocol: null
+        library_generation_kit_version: null
+        complete_sequences: "partial"
+        physical_linkage: "none"
+        sequencing_run_id: null
+        total_reads_passing_qc_filter: null
+        sequencing_facility: null
+        sequencing_run_date: null
+        sequencing_kit: null
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions: null
+        paired_reads_assembly: null
+        quality_thresholds: null
+        primer_match_cutoffs: null
+        collapsing_method: null
+        data_processing_protocols: null
+        data_processing_files: null
+        germline_database: null
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+
+  - repertoire_id: 1602908186092376551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_type:
+        id: null
+        label: null
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
+      inclusion_exclusion_criteria: null
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+      collected_by: null
+      grants: null
+      keywords_study:
+        - "contains_ig"
+        - "contains_tr"
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        label: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
+      ethnicity: null
+      race: null
+      strain_name: null
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description: null
+          disease_diagnosis:
+            id: null
+            label: null
+          disease_length: null
+          disease_stage: null
+          prior_therapies: null
+          immunogen: null
+          intervention: null
+          medical_history: null
+
+    sample:
+      - sample_id: TW01A_B_memory
+        sample_processing_id: null
+        sample_type: "peripheral venous puncture"
+        tissue:
+          id: "UBERON_0000178"
+          label: "blood"
+        tissue_processing: "Ficoll gradient"
+        cell_subset:
+          id: "CL_0000787"
+          label: "memory B cell"
+        cell_phenotype: "expression of CD20 and CD27"
+        cell_species:
+          id: "NCBITaxon_9606"
+          label: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: IGH
+            forward_pcr_primer_target_location: null
+            reverse_pcr_primer_target_location: null
+        sequencing_platform: "Illumina MiSeq"
+        sequencing_files:
+          sequencing_data_id: SRA:SRR2905655
+          file_type: fastq
+          filename: SRR2905655_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905655_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+          index_filename: SRR2905655_R3.fastq.gz
+          index_length: 8
+        anatomic_site: null
+        disease_state_sample: null
+        collection_time_point_relative: null
+        collection_time_point_relative_unit:
+          id: null
+          label: null
+        collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
+        biomaterial_provider: null
+        cell_number: null
+        cells_per_reaction: null
+        cell_storage: false
+        cell_quality: null
+        cell_processing_protocol: null
+        template_quality: null
+        template_amount: null
+        template_amount_unit:
+          id: null
+          label: null
+        library_generation_method: "RT(oligo-dT)+PCR"
+        library_generation_protocol: null
+        library_generation_kit_version: null
+        complete_sequences: "partial"
+        physical_linkage: "none"
+        sequencing_run_id: null
+        total_reads_passing_qc_filter: null
+        sequencing_facility: null
+        sequencing_run_date: null
+        sequencing_kit: null
+    data_processing:
+      - data_processing_id: 3059369183532618216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions: null
+        paired_reads_assembly: null
+        quality_thresholds: null
+        primer_match_cutoffs: null
+        collapsing_method: null
+        data_processing_protocols: null
+        data_processing_files: null
+        germline_database: null
+        analysis_provenance_id: 6623294219256599016-242ac11c-0001-012
+
+  - repertoire_id: 2366080924918616551-242ac11c-0001-012
+    study:
+      study_id: PRJNA300878
+      study_title: "Homo sapiens B and T cell repertoire - MZ twins"
+      study_type:
+        id: null
+        label: null
+      study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
+      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
+      inclusion_exclusion_criteria: null
+      lab_name: "Mark M. Davis"
+      lab_address: "Stanford University"
+      submitted_by: "Florian Rubelt"
+      pub_ids: ["PMID:27005435"]
+      collected_by: null
+      grants: null
+      keywords_study:
+        - "contains_ig"
+        - "contains_tr"
+    subject:
+      subject_id: TW01A
+      synthetic: false
+      species:
+        id: "NCBITaxon_9606"
+        label: "Homo sapiens"
+      sex: female
+      age_min: 27
+      age_max: 27
+      age_unit:
+        id: UO_0000036
+        label: year
+      age_event: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
+      ethnicity: null
+      race: null
+      strain_name: null
+      linked_subjects: TW01B
+      link_type: twin
+      diagnosis:
+        - study_group_description: null
+          disease_diagnosis:
+            id: null
+            label: null
+          disease_length: null
+          disease_stage: null
+          prior_therapies: null
+          immunogen: null
+          intervention: null
+          medical_history: null
+
+    sample:
+      - sample_id: TW01A_T_naive_CD4
+        sample_processing_id: null
+        sample_type: "peripheral venous puncture"
+        tissue:
+          id: "UBERON_0000178"
+          label: "blood"
+        tissue_processing: "Ficoll gradient"
+        cell_subset:
+          id: "CL_0000895"
+          label: "naive thymus-derived CD4-positive, alpha-beta T cell"
+        cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
+        cell_species:
+          id: "NCBITaxon_9606"
+          label: "Homo sapiens"
+        single_cell: false
+        cell_isolation: FACS
+        template_class: RNA
+        pcr_target:
+          - pcr_target_locus: TRB
+            forward_pcr_primer_target_location: null
+            reverse_pcr_primer_target_location: null
+        sequencing_platform: "Illumina MiSeq"
+        sequencing_files:
+          sequencing_data_id: SRA:SRR2905659
+          file_type: fastq
+          filename: SRR2905659_R1.fastq.gz
+          read_direction: forward
+          read_length: 300
+          paired_filename: SRR2905659_R2.fastq.gz
+          paired_read_direction: reverse
+          paired_read_length: 300
+          index_filename: SRR2905659_R3.fastq.gz
+          index_length: 8
+        anatomic_site: null
+        disease_state_sample: null
+        collection_time_point_relative: null
+        collection_time_point_relative_unit:
+          id: null
+          label: null
+        collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
+        biomaterial_provider: null
+        cell_number: null
+        cells_per_reaction: null
+        cell_storage: false
+        cell_quality: null
+        cell_processing_protocol: null
+        template_quality: null
+        template_amount: null
+        template_amount_unit:
+          id: null
+          label: null
+        library_generation_method: "RT(oligo-dT)+PCR"
+        library_generation_protocol: null
+        library_generation_kit_version: null
+        complete_sequences: "partial"
+        physical_linkage: "none"
+        sequencing_run_id: null
+        total_reads_passing_qc_filter: null
+        sequencing_facility: null
+        sequencing_run_date: null
+        sequencing_kit: null
+    data_processing:
+      - data_processing_id: 651223970338378216-242ac11b-0001-007
+        primary_annotation: true
+        software_versions: null
+        paired_reads_assembly: null
+        quality_thresholds: null
+        primer_match_cutoffs: null
+        collapsing_method: null
+        data_processing_protocols: null
+        data_processing_files: null
+        germline_database: null
+        analysis_provenance_id: 4625424004665971176-242ac11c-0001-012
diff --git a/tests/data/output_blank.json b/tests/data/output_blank.json
new file mode 100644
index 000000000..3476903ad
--- /dev/null
+++ b/tests/data/output_blank.json
@@ -0,0 +1,231 @@
+{
+  "Info": {
+    "title": "AIRR Data File",
+    "description": "AIRR Data File written by AIRR Standards Python Library",
+    "version": 1.4,
+    "contact": {
+      "name": "AIRR Community",
+      "url": "https://github.com/airr-community"
+    },
+    "license": {
+      "name": "Creative Commons Attribution 4.0 International",
+      "url": "https://creativecommons.org/licenses/by/4.0/"
+    }
+  },
+  "Repertoire": {
+    "repertoire_id": null,
+    "repertoire_name": null,
+    "repertoire_description": null,
+    "study": {
+      "study_id": null,
+      "study_title": null,
+      "study_type": {
+        "id": null,
+        "label": null
+      },
+      "study_description": null,
+      "inclusion_exclusion_criteria": null,
+      "grants": null,
+      "study_contact": null,
+      "collected_by": null,
+      "lab_name": null,
+      "lab_address": null,
+      "submitted_by": null,
+      "pub_ids": [],
+      "keywords_study": [],
+      "adc_publish_date": null,
+      "adc_update_date": null
+    },
+    "subject": {
+      "subject_id": null,
+      "synthetic": false,
+      "species": {
+        "id": null,
+        "label": null
+      },
+      "sex": null,
+      "age_min": null,
+      "age_max": null,
+      "age_unit": {
+        "id": null,
+        "label": null
+      },
+      "age_event": null,
+      "ancestry_population": {
+        "id": null,
+        "label": null
+      },
+      "location_birth": {
+        "id": null,
+        "label": null
+      },
+      "ethnicity": null,
+      "race": null,
+      "strain_name": null,
+      "linked_subjects": null,
+      "link_type": null,
+      "diagnosis": [
+        {
+          "study_group_description": null,
+          "disease_diagnosis": {
+            "id": null,
+            "label": null
+          },
+          "disease_length": null,
+          "disease_stage": null,
+          "prior_therapies": null,
+          "immunogen": null,
+          "intervention": null,
+          "medical_history": null
+        }
+      ],
+      "genotype": {
+        "receptor_genotype_set": {
+          "receptor_genotype_set_id": null,
+          "genotype_class_list": [
+            {
+              "receptor_genotype_id": null,
+              "locus": "IGH",
+              "documented_alleles": [
+                {
+                  "label": null,
+                  "germline_set_ref": null,
+                  "phasing": null
+                }
+              ],
+              "undocumented_alleles": [
+                {
+                  "allele_name": null,
+                  "sequence": "",
+                  "phasing": null
+                }
+              ],
+              "deleted_genes": [
+                {
+                  "label": "",
+                  "germline_set_ref": null,
+                  "phasing": null
+                }
+              ],
+              "inference_process": null
+            }
+          ]
+        },
+        "mhc_genotype_set": {
+          "mhc_genotype_set_id": null,
+          "mhc_genotype_list": [
+            {
+              "mhc_genotype_id": null,
+              "mhc_class": "MHC-I",
+              "mhc_alleles": [
+                {
+                  "allele_designation": null,
+                  "gene": {
+                    "id": null,
+                    "label": null
+                  },
+                  "reference_set_ref": null
+                }
+              ],
+              "mhc_genotyping_method": null
+            }
+          ]
+        }
+      }
+    },
+    "sample": [
+      {
+        "sample_processing_id": null,
+        "sample_id": null,
+        "sample_type": null,
+        "tissue": {
+          "id": null,
+          "label": null
+        },
+        "anatomic_site": null,
+        "disease_state_sample": null,
+        "collection_time_point_relative": null,
+        "collection_time_point_relative_unit": {
+          "id": null,
+          "label": null
+        },
+        "collection_time_point_reference": null,
+        "collection_location": {
+          "id": null,
+          "label": null
+        },
+        "biomaterial_provider": null,
+        "tissue_processing": null,
+        "cell_subset": {
+          "id": null,
+          "label": null
+        },
+        "cell_phenotype": null,
+        "cell_species": {
+          "id": null,
+          "label": null
+        },
+        "single_cell": null,
+        "cell_number": null,
+        "cells_per_reaction": null,
+        "cell_storage": null,
+        "cell_quality": null,
+        "cell_isolation": null,
+        "cell_processing_protocol": null,
+        "template_class": "DNA",
+        "template_quality": null,
+        "template_amount": null,
+        "template_amount_unit": {
+          "id": null,
+          "label": null
+        },
+        "library_generation_method": "PCR",
+        "library_generation_protocol": null,
+        "library_generation_kit_version": null,
+        "pcr_target": [
+          {
+            "pcr_target_locus": null,
+            "forward_pcr_primer_target_location": null,
+            "reverse_pcr_primer_target_location": null
+          }
+        ],
+        "complete_sequences": "partial",
+        "physical_linkage": "none",
+        "sequencing_run_id": null,
+        "total_reads_passing_qc_filter": null,
+        "sequencing_platform": null,
+        "sequencing_facility": null,
+        "sequencing_run_date": null,
+        "sequencing_kit": null,
+        "sequencing_files": {
+          "sequencing_data_id": null,
+          "file_type": null,
+          "filename": null,
+          "read_direction": null,
+          "read_length": null,
+          "paired_filename": null,
+          "paired_read_direction": null,
+          "paired_read_length": null,
+          "index_filename": null,
+          "index_length": null
+        }
+      }
+    ],
+    "data_processing": [
+      {
+        "data_processing_id": null,
+        "primary_annotation": false,
+        "software_versions": null,
+        "paired_reads_assembly": null,
+        "quality_thresholds": null,
+        "primer_match_cutoffs": null,
+        "collapsing_method": null,
+        "data_processing_protocols": null,
+        "data_processing_files": [],
+        "germline_database": null,
+        "germline_set_ref": null,
+        "analysis_provenance_id": null
+      }
+    ]
+  }
+}
\ No newline at end of file
diff --git a/tests/data/output_data.json b/tests/data/output_data.json
new file mode 100644
index 000000000..f43d9973e
--- /dev/null
+++ b/tests/data/output_data.json
@@ -0,0 +1,913 @@
+{
+  "Info": {
+    "title": "AIRR Data File",
+    "description": "AIRR Data File written by AIRR Standards Python Library",
+    "version": 1.4,
+    "contact": {
+      "name": "AIRR Community",
+      "url": "https://github.com/airr-community"
+    },
+    "license": {
+      "name": "Creative Commons Attribution 4.0 International",
+      "url": "https://creativecommons.org/licenses/by/4.0/"
+    }
+  },
+  "Repertoire": {
+    "1841923116114776551-242ac11c-0001-012": {
+      "repertoire_id": "1841923116114776551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_B_naive",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000788",
+            "label": "naive B cell"
+          },
+          "cell_phenotype": "expression of CD20 and the absence of CD27",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "IGH",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905656",
+            "file_type": "fastq",
+            "filename": "SRR2905656_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905656_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905656_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+        }
+      ]
+    },
+    "1602908186092376551-242ac11c-0001-012": {
+      "repertoire_id": "1602908186092376551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_B_memory",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000787",
+            "label": "memory B cell"
+          },
+          "cell_phenotype": "expression of CD20 and CD27",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "IGH",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905655",
+            "file_type": "fastq",
+            "filename": "SRR2905655_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905655_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905655_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+        }
+      ]
+    },
+    "2366080924918616551-242ac11c-0001-012": {
+      "repertoire_id": "2366080924918616551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_T_naive_CD4",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000895",
+            "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
+          },
+          "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "TRB",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905659",
+            "file_type": "fastq",
+            "filename": "SRR2905659_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905659_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905659_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "651223970338378216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012"
+        }
+      ]
+    }
+  },
+  "GermlineSet": {
+    "OGRDB:G00007": {
+      "germline_set_id": "OGRDB:G00007",
+      "author": "William Lees",
+      "lab_name": "",
+      "lab_address": "Birkbeck College, University of London, Malet Street, London",
+      "acknowledgements": [],
+      "release_version": 1,
+      "release_description": "",
+      "release_date": "2021-11-24",
+      "germline_set_name": "CAST IGH",
+      "germline_set_ref": "OGRDB:G00007.1",
+      "pub_ids": [
+        ""
+      ],
+      "species": {
+        "id": "NCBITAXON:10090",
+        "label": "Mus musculus"
+      },
+      "species_subgroup": "CAST_EiJ",
+      "species_subgroup_type": "strain",
+      "locus": "IGH",
+      "allele_descriptions": [
+        {
+          "allele_description_id": "OGRDB:A00301",
+          "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
+          "maintainer": "William Lees",
+          "acknowledgements": [],
+          "lab_address": "Birkbeck College, University of London, Malet Street, London",
+          "release_version": 1,
+          "release_date": "24-Nov-2021",
+          "release_description": "First release",
+          "label": "IGHV-2DBF",
+          "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+          "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+          "aliases": [
+            "watson_et_al:CAST_EiJ_IGHV5-3"
+          ],
+          "locus": "IGH",
+          "chromosome": null,
+          "sequence_type": "V",
+          "functional": true,
+          "inference_type": "rearranged_only",
+          "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+          },
+          "species_subgroup": "CAST_EiJ",
+          "species_subgroup_type": "strain",
+          "status": "active",
+          "gene_designation": null,
+          "subgroup_designation": null,
+          "allele_designation": null,
+          "gene_start": null,
+          "gene_end": null,
+          "utr_5_prime_start": null,
+          "utr_5_prime_end": null,
+          "leader_1_start": null,
+          "leader_1_end": null,
+          "leader_2_start": null,
+          "leader_2_end": null,
+          "v_rs_start": null,
+          "v_rs_end": null,
+          "v_gene_delineations": [
+            {
+              "sequence_delineation_id": "1",
+              "delineation_scheme": "IMGT",
+              "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+              "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+              "fwr1_start": 1,
+              "fwr1_end": 75,
+              "cdr1_start": 76,
+              "cdr1_end": 110,
+              "fwr2_start": 111,
+              "fwr2_end": 150,
+              "cdr2_start": 151,
+              "cdr2_end": 160,
+              "fwr3_start": 161,
+              "fwr3_end": 294,
+              "cdr3_start": 295,
+              "alignment": [
+                "1",
+                "2",
+                "3",
+                "4",
+                "5",
+                "6",
+                "7",
+                "8",
+                "9",
+                "10",
+                "11",
+                "12",
+                "13",
+                "14",
+                "15",
+                "16",
+                "17",
+                "18",
+                "19",
+                "20",
+                "21",
+                "22",
+                "23",
+                "24",
+                "25",
+                "26",
+                "27",
+                "28",
+                "29",
+                "30",
+                "31",
+                "32",
+                "33",
+                "34",
+                "35",
+                "36",
+                "37",
+                "38",
+                "39",
+                "40",
+                "41",
+                "42",
+                "43",
+                "44",
+                "45",
+                "46",
+                "47",
+                "48",
+                "49",
+                "50",
+                "51",
+                "52",
+                "53",
+                "54",
+                "55",
+                "56",
+                "57",
+                "58",
+                "59",
+                "60",
+                "61",
+                "62",
+                "63",
+                "64",
+                "65",
+                "66",
+                "67",
+                "68",
+                "69",
+                "70",
+                "71",
+                "72",
+                "73",
+                "74",
+                "75",
+                "76",
+                "77",
+                "78",
+                "79",
+                "80",
+                "81",
+                "82",
+                "83",
+                "84",
+                "85",
+                "86",
+                "87",
+                "88",
+                "89",
+                "90",
+                "91",
+                "92",
+                "93",
+                "94",
+                "95",
+                "96",
+                "97",
+                "98",
+                "99",
+                "100",
+                "101",
+                "102",
+                "103",
+                "104"
+              ]
+            }
+          ],
+          "unrearranged_support": [],
+          "rearranged_support": [],
+          "paralogs": [],
+          "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3",
+          "curational_tags": null
+        },
+        {
+          "allele_description_id": "OGRDB:A00314",
+          "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
+          "maintainer": "William Lees",
+          "acknowledgements": [],
+          "lab_address": "Birkbeck College, University of London, Malet Street, London",
+          "release_version": 1,
+          "release_date": "24-Nov-2021",
+          "release_description": "First release",
+          "label": "IGHV-2ETO",
+          "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+          "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+          "aliases": [
+            "watson_et_al:CAST_EiJ_IGHV8-2"
+          ],
+          "locus": "IGH",
+          "chromosome": null,
+          "sequence_type": "V",
+          "functional": true,
+          "inference_type": "rearranged_only",
+          "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+          },
+          "species_subgroup": "CAST_EiJ",
+          "species_subgroup_type": "strain",
+          "status": "active",
+          "gene_designation": null,
+          "subgroup_designation": null,
+          "allele_designation": null,
+          "gene_start": null,
+          "gene_end": null,
+          "utr_5_prime_start": null,
+          "utr_5_prime_end": null,
+          "leader_1_start": null,
+          "leader_1_end": null,
+          "leader_2_start": null,
+          "leader_2_end": null,
+          "v_rs_start": null,
+          "v_rs_end": null,
+          "v_gene_delineations": [
+            {
+              "sequence_delineation_id": "1",
+              "delineation_scheme": "IMGT",
+              "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+              "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+              "fwr1_start": 1,
+              "fwr1_end": 75,
+              "cdr1_start": 76,
+              "cdr1_end": 110,
+              "fwr2_start": 111,
+              "fwr2_end": 150,
+              "cdr2_start": 151,
+              "cdr2_end": 160,
+              "fwr3_start": 161,
+              "fwr3_end": 294,
+              "cdr3_start": 295,
+              "alignment": [
+                "1",
+                "2",
+                "3",
+                "4",
+                "5",
+                "6",
+                "7",
+                "8",
+                "9",
+                "10",
+                "11",
+                "12",
+                "13",
+                "14",
+                "15",
+                "16",
+                "17",
+                "18",
+                "19",
+                "20",
+                "21",
+                "22",
+                "23",
+                "24",
+                "25",
+                "26",
+                "27",
+                "28",
+                "29",
+                "30",
+                "31",
+                "32",
+                "33",
+                "34",
+                "35",
+                "36",
+                "37",
+                "38",
+                "39",
+                "40",
+                "41",
+                "42",
+                "43",
+                "44",
+                "45",
+                "46",
+                "47",
+                "48",
+                "49",
+                "50",
+                "51",
+                "52",
+                "53",
+                "54",
+                "55",
+                "56",
+                "57",
+                "58",
+                "59",
+                "60",
+                "61",
+                "62",
+                "63",
+                "64",
+                "65",
+                "66",
+                "67",
+                "68",
+                "69",
+                "70",
+                "71",
+                "72",
+                "73",
+                "74",
+                "75",
+                "76",
+                "77",
+                "78",
+                "79",
+                "80",
+                "81",
+                "82",
+                "83",
+                "84",
+                "85",
+                "86",
+                "87",
+                "88",
+                "89",
+                "90",
+                "91",
+                "92",
+                "93",
+                "94",
+                "95",
+                "96",
+                "97",
+                "98",
+                "99",
+                "100",
+                "101",
+                "102",
+                "103",
+                "104"
+              ]
+            }
+          ],
+          "unrearranged_support": [],
+          "rearranged_support": [],
+          "paralogs": [],
+          "curation": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2",
+          "curational_tags": null
+        }
+      ],
+      "curation": null
+    }
+  },
+  "GenotypeSet": {
+    "1": {
+      "receptor_genotype_set_id": "1",
+      "genotype_class_list": [
+        {
+          "receptor_genotype_id": "1",
+          "locus": "IGH",
+          "documented_alleles": [
+            {
+              "label": "IGHV1-69*01",
+              "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+              "phasing": 1
+            },
+            {
+              "label": "IGHV1-69*02",
+              "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+              "phasing": 2
+            }
+          ],
+          "undocumented_alleles": [
+            {
+              "allele_name": "IGHD3-1*01_S1234",
+              "sequence": "agtagtagtagt",
+              "phasing": 1
+            }
+          ],
+          "deleted_genes": [
+            {
+              "label": "IGHV3-30-3",
+              "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+              "phasing": 1
+            }
+          ],
+          "inference_process": "repertoire_sequencing"
+        }
+      ]
+    }
+  }
+}
\ No newline at end of file
diff --git a/tests/data/output_rep.json b/tests/data/output_rep.json
new file mode 100644
index 000000000..fa17a056d
--- /dev/null
+++ b/tests/data/output_rep.json
@@ -0,0 +1,506 @@
+{
+  "Info": {
+    "title": "Repertoire metadata",
+    "description": "Repertoire metadata written by AIRR Standards Python Library",
+    "version": 1.4,
+    "contact": {
+      "name": "AIRR Community",
+      "url": "https://github.com/airr-community"
+    },
+    "license": {
+      "name": "Creative Commons Attribution 4.0 International",
+      "url": "https://creativecommons.org/licenses/by/4.0/"
+    }
+  },
+  "Repertoire": [
+    {
+      "repertoire_id": "1841923116114776551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_B_naive",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000788",
+            "label": "naive B cell"
+          },
+          "cell_phenotype": "expression of CD20 and the absence of CD27",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "IGH",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905656",
+            "file_type": "fastq",
+            "filename": "SRR2905656_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905656_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905656_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+        }
+      ]
+    },
+    {
+      "repertoire_id": "1602908186092376551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_B_memory",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000787",
+            "label": "memory B cell"
+          },
+          "cell_phenotype": "expression of CD20 and CD27",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "IGH",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905655",
+            "file_type": "fastq",
+            "filename": "SRR2905655_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905655_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905655_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "3059369183532618216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "6623294219256599016-242ac11c-0001-012"
+        }
+      ]
+    },
+    {
+      "repertoire_id": "2366080924918616551-242ac11c-0001-012",
+      "study": {
+        "study_id": "PRJNA300878",
+        "study_title": "Homo sapiens B and T cell repertoire - MZ twins",
+        "study_type": {
+          "id": null,
+          "label": null
+        },
+        "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
+        "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
+        "inclusion_exclusion_criteria": null,
+        "lab_name": "Mark M. Davis",
+        "lab_address": "Stanford University",
+        "submitted_by": "Florian Rubelt",
+        "pub_ids": [
+          "PMID:27005435"
+        ],
+        "collected_by": null,
+        "grants": null,
+        "keywords_study": [
+          "contains_ig",
+          "contains_tr"
+        ]
+      },
+      "subject": {
+        "subject_id": "TW01A",
+        "synthetic": false,
+        "species": {
+          "id": "NCBITaxon_9606",
+          "label": "Homo sapiens"
+        },
+        "sex": "female",
+        "age_min": 27,
+        "age_max": 27,
+        "age_unit": {
+          "id": "UO_0000036",
+          "label": "year"
+        },
+        "age_event": null,
+        "ancestry_population": {
+          "id": null,
+          "label": null
+        },
+        "location_birth": {
+          "id": null,
+          "label": null
+        },
+        "ethnicity": null,
+        "race": null,
+        "strain_name": null,
+        "linked_subjects": "TW01B",
+        "link_type": "twin",
+        "diagnosis": [
+          {
+            "study_group_description": null,
+            "disease_diagnosis": {
+              "id": null,
+              "label": null
+            },
+            "disease_length": null,
+            "disease_stage": null,
+            "prior_therapies": null,
+            "immunogen": null,
+            "intervention": null,
+            "medical_history": null
+          }
+        ]
+      },
+      "sample": [
+        {
+          "sample_id": "TW01A_T_naive_CD4",
+          "sample_processing_id": null,
+          "sample_type": "peripheral venous puncture",
+          "tissue": {
+            "id": "UBERON_0000178",
+            "label": "blood"
+          },
+          "tissue_processing": "Ficoll gradient",
+          "cell_subset": {
+            "id": "CL_0000895",
+            "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
+          },
+          "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
+          "cell_species": {
+            "id": "NCBITaxon_9606",
+            "label": "Homo sapiens"
+          },
+          "single_cell": false,
+          "cell_isolation": "FACS",
+          "template_class": "RNA",
+          "pcr_target": [
+            {
+              "pcr_target_locus": "TRB",
+              "forward_pcr_primer_target_location": null,
+              "reverse_pcr_primer_target_location": null
+            }
+          ],
+          "sequencing_platform": "Illumina MiSeq",
+          "sequencing_files": {
+            "sequencing_data_id": "SRA:SRR2905659",
+            "file_type": "fastq",
+            "filename": "SRR2905659_R1.fastq.gz",
+            "read_direction": "forward",
+            "read_length": 300,
+            "paired_filename": "SRR2905659_R2.fastq.gz",
+            "paired_read_direction": "reverse",
+            "paired_read_length": 300,
+            "index_filename": "SRR2905659_R3.fastq.gz",
+            "index_length": 8
+          },
+          "anatomic_site": null,
+          "disease_state_sample": null,
+          "collection_time_point_relative": null,
+          "collection_time_point_relative_unit": {
+            "id": null,
+            "label": null
+          },
+          "collection_time_point_reference": null,
+          "collection_location": {
+            "id": null,
+            "label": null
+          },
+          "biomaterial_provider": null,
+          "cell_number": null,
+          "cells_per_reaction": null,
+          "cell_storage": false,
+          "cell_quality": null,
+          "cell_processing_protocol": null,
+          "template_quality": null,
+          "template_amount": null,
+          "template_amount_unit": {
+            "id": null,
+            "label": null
+          },
+          "library_generation_method": "RT(oligo-dT)+PCR",
+          "library_generation_protocol": null,
+          "library_generation_kit_version": null,
+          "complete_sequences": "partial",
+          "physical_linkage": "none",
+          "sequencing_run_id": null,
+          "total_reads_passing_qc_filter": null,
+          "sequencing_facility": null,
+          "sequencing_run_date": null,
+          "sequencing_kit": null
+        }
+      ],
+      "data_processing": [
+        {
+          "data_processing_id": "651223970338378216-242ac11b-0001-007",
+          "primary_annotation": true,
+          "software_versions": null,
+          "paired_reads_assembly": null,
+          "quality_thresholds": null,
+          "primer_match_cutoffs": null,
+          "collapsing_method": null,
+          "data_processing_protocols": null,
+          "data_processing_files": null,
+          "germline_database": null,
+          "analysis_provenance_id": "4625424004665971176-242ac11c-0001-012"
+        }
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/tests/data/warning_repertoire.json b/tests/data/warning_repertoire.json
new file mode 100644
index 000000000..30699a4c6
--- /dev/null
+++ b/tests/data/warning_repertoire.json
@@ -0,0 +1 @@
+{"Info":{"title":"AIRR Data Commons API for VDJServer Community Data Portal","description":"VDJServer ADC API response for repertoire query","version":"1.3","contact":{"name":"VDJServer","url":"http://vdjserver.org/","email":"vdjserver@utsouthwestern.edu"}},"Repertoire":[{"repertoire_id":"1329892364225474070-242ac113-0001-012","study":{"study_id":"PRJNA593622","study_title":"Determinants governing T cell receptor α/β-chain pairing in repertoire formation of identical twins","study_type":{"id":"NCIT:C16084","label":"Observational Study"},"study_description":"The T cell repertoire in each individual includes T cell receptors (TCRs) of enormous sequence diversity through the pairing of diverse TCR α- and β-chains, each generated by somatic recombination of paralogous gene segments. Whether the TCR repertoire contributes to susceptibility to infectious or autoimmune diseases in concert with disease-associated major histocompatibility complex (MHC) polymorphisms is unknown. Due to a lack in high-throughput technologies to sequence TCR α-β pairs, current studies on whether the TCR repertoire is shaped by host genetics have so far relied only on single-chain analysis. Using a high-throughput single T cell sequencing technology, we obtained the largest paired TCRαβ dataset so far, comprising 965,523 clonotypes from 15 healthy individuals including 6 monozygotic twin pairs. Public TCR α- and, to a lesser extent, TCR β-chain sequences were common in all individuals. In contrast, sharing of entirely identical TCRαβ amino acid sequences was very infrequent in unrelated individuals, but highly increased in twins, in particular in CD4 memory T cells. Based on nucleotide sequence identity, a subset of these shared clonotypes appeared to be the progeny of T cells that had been generated during fetal development and had persisted for more than 50 y. 
Additional shared TCRαβ in twins were encoded by different nucleotide sequences, implying that genetic determinants impose structural constraints on thymic selection that favor the selection of TCR α-β pairs with entire sequence identities.\n","inclusion_exclusion_criteria":" ","lab_name":"Jörg J Goronzy","lab_address":"Stanford University School of Medicine","submitted_by":"Scott Christley, scott.christley@utsouthwestern.edu","collected_by":"Hidetaka Tanno, hidetakatanno@utexas.edu","grants":"This work was supported by NIH Grants U19 AI057266 (to G.G. and J.J.G.) and R01 AI129191 (to J.J.G.) and US Defense Threat Reduction Agency Grant HDTRA1-12-C-0105 (to G.G.). H.T. was supported by University of Texas Health Innovation for Cancer Prevention Research Training Program Postdoctoral Fellowship (Cancer Prevention and Research Institute of Texas Grant RP160015), Japan Society for the Promotion of Science Postdoctoral Fellowships for Research Abroad, and Uehara Memorial Foundation Research Fellowship.","pub_ids":"PMID:31879353","keywords_study":["contains_tcr","contains_paired_chain"],"vdjserver_uuid":"1400363782577197546-242ac113-0001-012"},"subject":{"subject_id":"A1","synthetic":false,"species":{"id":"NCBITaxon:9606","label":"Homo sapiens"},"sex":"female","age_min":61,"age_max":61,"age_unit":{"id":"UO:0000036","label":"year"},"linked_subjects":"A2","link_type":"twin","diagnosis":[{"disease_diagnosis":{}}],"mhc":["HLA-A*30:02","HLA-A*31:01","HLA-B*35:02","HLA-B*38:01","HLA-C*04:01","HLA-C*12:03","HLA-DRB1*04:02","HLA-DRB1*04:03","HLA-DRB4*01:03","HLA-DQB1*03:02","HLA-DQB1*03:05"],"vdjserver_uuid":"4743918918142914070-242ac113-0001-012"},"sample":[{"sample_id":"A1_CD4_naive_TRB","tissue":{"id":"UBERON:0013756","label":"venous blood"},"biomaterial_provider":"Stanford University, CA","tissue_processing":"Peripheral blood mononuclear cells (PBMCs) were isolated by density centrifugation using Ficoll media at a density of 1.077 
g/mL.","cell_subset":{"id":"CL:0000895","label":"naive thymus-derived CD4-positive, alpha-beta T cell"},"cell_phenotype":"CD4+CD45RA+CCR7+","cell_species":{},"single_cell":false,"cell_storage":true,"cell_isolation":"magnetic-bead–based negative EasySep selection reagents","template_class":"RNA","library_generation_method":"RT(oligo-dT)+PCR","pcr_target":[{"pcr_target_locus":"TRB"}],"complete_sequences":"partial","physical_linkage":"hetero_head-head","sequencing_run_id":"SRR10600326","sequencing_platform":"Illumina MiSeq","sequencing_files":{"file_type":"fastq","filename":"SRR10600326.sra_1.fastq.gz","read_length":300},"vdjserver_uuid":"4055006163864514070-242ac113-0001-012"},{"sample_id":"A1_CD4_naive_TRA","tissue":{"id":"UBERON:0013756","label":"venous blood"},"biomaterial_provider":"Stanford University, CA","tissue_processing":"Peripheral blood mononuclear cells (PBMCs) were isolated by density centrifugation using Ficoll media at a density of 1.077 g/mL.","cell_subset":{"id":"CL:0000895","label":"naive thymus-derived CD4-positive, alpha-beta T cell"},"cell_phenotype":"CD4+CD45RA+CCR7+","cell_species":{},"single_cell":false,"cell_storage":true,"cell_isolation":"magnetic-bead–based negative EasySep selection reagents","template_class":"RNA","library_generation_method":"RT(oligo-dT)+PCR","pcr_target":[{"pcr_target_locus":"TRA"}],"complete_sequences":"partial","physical_linkage":"hetero_head-head","sequencing_run_id":"SRR10600326","sequencing_platform":"Illumina MiSeq","sequencing_files":{"file_type":"fastq","filename":"SRR10600326.sra_2.fastq.gz","read_length":300},"vdjserver_uuid":"3987789925682114070-242ac113-0001-012"}],"data_processing":[{"data_processing_id":"65112922-e976-40d9-9dff-6b581acc745f-007","primary_annotation":true,"software_versions":"IgBlast 1.14","data_processing_files":["SRR10600326.sra_1.igblast.airr.tsv.gz","SRR10600326.sra_2.igblast.airr.tsv.gz"],"germline_database":"VDJServer IMGT 
2019.01.23","vdjserver_uuid":"2248499969493954070-242ac113-0001-012"}]}]}
\ No newline at end of file

From 6927e1125bb81b9df8efd2f5e2f398141f0c5c1b Mon Sep 17 00:00:00 2001
From: Brian Corrie <bdcorrie@gmail.com>
Date: Sun, 11 Feb 2024 09:34:20 -0800
Subject: [PATCH 04/15] Minor change

Mostly so I can create a pull request...
---
 specs/airr-schema.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index 606f773f5..260a37636 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -3196,7 +3196,7 @@ Repertoire:
                 nullable: false
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# A group of repertoires for analysis purposes, includes optional time course
 RepertoireGroup:
     type: object
     required:

From 9616de2bbc22ba514830a793b027009a7b2799a7 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Sat, 17 Feb 2024 18:27:34 -0600
Subject: [PATCH 05/15] Update descriptions

---
 specs/airr-schema.yaml | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index 260a37636..e4b6e34b8 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -371,7 +371,8 @@ DataFile:
                 nullable: false
         RepertoireGroup:
             type: array
-            description: List of repertoire collections
+            description: List of repertoire groups
+            
             items:
                 $ref: '#/RepertoireGroup'
             x-airr:
@@ -3196,7 +3197,8 @@ Repertoire:
                 nullable: false
                 adc-query-support: true
 
-# A group of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3205,19 +3207,19 @@ RepertoireGroup:
     properties:
         repertoire_group_id:
             type: string
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:

From 559075400f30f785da17bc41b18974e805b3ca13 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Wed, 21 Feb 2024 19:19:55 -0600
Subject: [PATCH 06/15] update descriptions

---
 lang/R/inst/extdata/airr-schema.yaml    | 13 +++++++------
 lang/python/airr/specs/airr-schema.yaml | 13 +++++++------
 specs/airr-schema-openapi3.yaml         | 13 +++++++------
 specs/airr-schema.yaml                  |  1 -
 4 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml
index 606f773f5..fe21bbb75 100644
--- a/lang/R/inst/extdata/airr-schema.yaml
+++ b/lang/R/inst/extdata/airr-schema.yaml
@@ -371,7 +371,7 @@ DataFile:
                 nullable: false
         RepertoireGroup:
             type: array
-            description: List of repertoire collections
+            description: List of repertoire groups
             items:
                 $ref: '#/RepertoireGroup'
             x-airr:
@@ -3196,7 +3196,8 @@ Repertoire:
                 nullable: false
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3205,19 +3206,19 @@ RepertoireGroup:
     properties:
         repertoire_group_id:
             type: string
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:
diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml
index 606f773f5..fe21bbb75 100644
--- a/lang/python/airr/specs/airr-schema.yaml
+++ b/lang/python/airr/specs/airr-schema.yaml
@@ -371,7 +371,7 @@ DataFile:
                 nullable: false
         RepertoireGroup:
             type: array
-            description: List of repertoire collections
+            description: List of repertoire groups
             items:
                 $ref: '#/RepertoireGroup'
             x-airr:
@@ -3196,7 +3196,8 @@ Repertoire:
                 nullable: false
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3205,19 +3206,19 @@ RepertoireGroup:
     properties:
         repertoire_group_id:
             type: string
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:
diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml
index bba3a45d8..52efed116 100644
--- a/specs/airr-schema-openapi3.yaml
+++ b/specs/airr-schema-openapi3.yaml
@@ -368,7 +368,7 @@ DataFile:
         RepertoireGroup:
             type: array
             nullable: false
-            description: List of repertoire collections
+            description: List of repertoire groups
             items:
                 $ref: '#/RepertoireGroup'
         Rearrangement:
@@ -3298,7 +3298,8 @@ Repertoire:
             x-airr:
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3308,22 +3309,22 @@ RepertoireGroup:
         repertoire_group_id:
             type: string
             nullable: true
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
             nullable: true
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
             nullable: true
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             nullable: true
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:
diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index e4b6e34b8..fe21bbb75 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -372,7 +372,6 @@ DataFile:
         RepertoireGroup:
             type: array
             description: List of repertoire groups
-            
             items:
                 $ref: '#/RepertoireGroup'
             x-airr:

From b027e9f9d33092ec9a150dcd9dcac56967c75c07 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Wed, 21 Feb 2024 18:31:03 -0600
Subject: [PATCH 07/15] array of extensions

---
 specs/adc-api-openapi3.yaml | 6 ++++++
 specs/adc-api.yaml          | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/specs/adc-api-openapi3.yaml b/specs/adc-api-openapi3.yaml
index 38a0f2fb9..5a909f1c3 100644
--- a/specs/adc-api-openapi3.yaml
+++ b/specs/adc-api-openapi3.yaml
@@ -73,6 +73,12 @@ components:
                   type: integer
                 last_update:
                   type: string
+                extensions:
+                  type: array
+                  items:
+                    type: string
+                    enum:
+                      - async_api
             api:
               $ref: '#/components/schemas/info_object'
             schema:
diff --git a/specs/adc-api.yaml b/specs/adc-api.yaml
index db5a28372..24142bb2d 100644
--- a/specs/adc-api.yaml
+++ b/specs/adc-api.yaml
@@ -62,6 +62,12 @@ definitions:
                 type: integer
               last_update:
                 type: string
+              extensions:
+                type: array
+                items:
+                  type: string
+                  enum:
+                    - async_api
           api:
             description: Provides information about the ADC API implemented by this repository service.
             $ref: '#/definitions/info_object'

From 3f834c13929054e0e8c7097a21ddfeac20ca28b8 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Wed, 21 Feb 2024 18:39:17 -0600
Subject: [PATCH 08/15] update docs

---
 docs/api/adc_api_overview.rst | 2 ++
 docs/api/adc_api_requests.rst | 3 ++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/api/adc_api_overview.rst b/docs/api/adc_api_overview.rst
index cffd157d5..31f520edb 100644
--- a/docs/api/adc_api_overview.rst
+++ b/docs/api/adc_api_overview.rst
@@ -60,6 +60,8 @@ to be followed.
    (timeout) should be used if the API does not complete an operation because of an
    internal time limit, and HTTP 413 (Content too large) should be returned when either
    max_size or max_query_size are exceeded.
+*  Extensions beyond the standard API, e.g., support for the Async API, should be specified
+   with the ``extensions`` property in the ``/info`` endpoint.
 
 Repository operation principles
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/api/adc_api_requests.rst b/docs/api/adc_api_requests.rst
index 528139a6f..9b19a5995 100644
--- a/docs/api/adc_api_requests.rst
+++ b/docs/api/adc_api_requests.rst
@@ -110,7 +110,8 @@ of the queries sent to the repository.
         }
       },
       "max_size": 1000,
-      "max_query_size": 2097152
+      "max_query_size": 2097152,
+      "extensions": ["async_api"]
     }
 
 **Query Repertoire Example**

From 6fe6633a62960e33698b2989b3481f08a09010e9 Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Mon, 15 Jan 2024 02:51:46 +0100
Subject: [PATCH 09/15] Add Contributor record, adapt Study object accordingly

---
 specs/airr-schema-openapi3.yaml | 177 +++++++++++++++++++-------------
 1 file changed, 103 insertions(+), 74 deletions(-)

diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml
index 52efed116..6d5148be3 100644
--- a/specs/airr-schema-openapi3.yaml
+++ b/specs/airr-schema-openapi3.yaml
@@ -486,34 +486,111 @@ TimePoint:
 # General objects
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe individuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            description: unique identifier of this contributor within the file
             x-airr:
                 identifier: true
                 miairr: important
             nullable: true
         name:
             type: string
+            nullable: false
+            description: Full name of contributor
+        orcid_id:
+            $ref: '#/Ontology'
             nullable: true
-            description: Full name of individual
-        institution_name:
-            type: string
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
             nullable: true
-            description: Individual's department and institution name
-        orcid_id:
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                for individual institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
             type: string
             nullable: true
-            description: Individual's ORCID identifier
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+        contributions:
+            type: array
+            nullable: true
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
+            type: string
+            nullable: false
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+        degree:
+            type: string
+            nullable: true
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+
 
 #
 # Germline gene schema
@@ -849,7 +926,7 @@ AlleleDescription:
             nullable: true
             description: List of individuals whose contribution to the gene description should be acknowledged
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         lab_address:
             type: string
             nullable: true
@@ -1192,7 +1269,7 @@ GermlineSet:
             nullable: true
             description: List of individuals whose contribution to the germline set should be acknowledged
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             nullable: true
@@ -1702,71 +1779,23 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
-        study_contact:
-            type: string
-            nullable: true
-            description: >
-                Full contact information of the contact persons for this study This should include an e-mail address
-                and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
-            x-airr:
-                adc-query-support: true
-                name: Contact information (study)
-        collected_by:
-            type: string
-            nullable: true
-            description: >
-                Full contact information of the data collector, i.e. the person who is legally responsible for data
-                collection and release. This should include an e-mail address and a persistent identifier such as an
-                ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
-            x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
-        lab_name:
-            type: string
-            nullable: true
-            description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
-            x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
-        lab_address:
-            type: string
-            nullable: true
-            description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
-            x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
-        submitted_by:
-            type: string
-            nullable: true
+        contributors:
+            type: array
+            nullable: false
             description: >
-                Full contact information of the data depositor, i.e., the person submitting the data to a repository.
-                This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
-                supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
+                List of individuals who contributed to the study. Note that these are not necessarily identical with the
+                authors on an associated manuscript or other scholarly communication. Further note that at least the
+                following three CRediT contributor roles "supervision", "investigation" and "data curation" should be
+                assigned.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
             x-airr:
-                miairr: important
+                miairr: essential
                 adc-query-support: true
                 set: 1
                 subset: study
-                name: Contact information (data deposition)
+                name: Contributors
         pub_ids:
             type: array
             items:

From 505a50f67677b61e8c5d16de03b572fd1a3458aa Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Wed, 21 Feb 2024 02:15:43 +0100
Subject: [PATCH 10/15] Update v3 Schema, sync to v2 Schema and its copies

---
 lang/R/inst/extdata/airr-schema.yaml    | 239 ++++++++++++++++--------
 lang/python/airr/specs/airr-schema.yaml | 239 ++++++++++++++++--------
 specs/airr-schema-openapi3.yaml         | 124 +++++++-----
 specs/airr-schema.yaml                  | 239 ++++++++++++++++--------
 4 files changed, 552 insertions(+), 289 deletions(-)

diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml
index fe21bbb75..87a25b5bd 100644
--- a/lang/R/inst/extdata/airr-schema.yaml
+++ b/lang/R/inst/extdata/airr-schema.yaml
@@ -493,30 +493,115 @@ TimePoint:
 # TODO: link to global schema with JSON-LD?
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe individuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            description: Unique identifier of this contributor within the file
             x-airr:
+                nullable: true
                 identifier: true
                 miairr: important
         name:
             type: string
-            description: Full name of individual
-        institution_name:
-            type: string
-            description: Individual's department and institution name
+            description: Full name of contributor
+            x-airr:
+                nullable: false
         orcid_id:
+            $ref: '#/Ontology'
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                for individual institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
+            type: string
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+            x-airr:
+                nullable: true
+        contributions:
+            type: array
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+            x-airr:
+                nullable: true
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
             type: string
-            description: Individual's ORCID identifier
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+            x-airr:
+                nullable: false
+        degree:
+            type: string
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+            x-airr:
+                nullable: true
 
 #
 # Germline gene schema
@@ -783,8 +868,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -810,21 +894,15 @@ AlleleDescription:
             example: OGRDB:Human_IGH:IGHV1-69*01.001
             x-airr:
                 miairr: important
-        maintainer:
-            type: string
-            description: Maintainer of this sequence record
-            x-airr:
-                miairr: defined
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
-                $ref: '#/Acknowledgement'
-        lab_address:
-            type: string
-            description: Institution and full address of corresponding author
-            x-airr:
-                miairr: defined
+                $ref: '#/Contributor'
         release_version:
             type: integer
             description: Version number of this record, updated whenever a revised version is published or released
@@ -1084,9 +1162,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1104,26 +1180,15 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            description: Corresponding author
-            x-airr:
-                miairr: important
-        lab_name:
-            type: string
-            description: Department of corresponding author
-            x-airr:
-                miairr: important
-        lab_address:
-            type: string
-            description: Institutional address of corresponding author
-            x-airr:
-                miairr: important
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The corresponding author should be listed last.
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             description: Version number of this record, allocated automatically
@@ -1593,71 +1658,81 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
+        contributors:
+            type: array
+            description: >
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The corresponding author should be listed last.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
+            x-airr:
+                nullable: false
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contributors
         study_contact:
             type: string
             description: >
                 Full contact information of the contact persons for this study This should include an e-mail address
                 and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                nullable: true
-                adc-query-support: true
-                name: Contact information (study)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         collected_by:
             type: string
             description: >
                 Full contact information of the data collector, i.e. the person who is legally responsible for data
                 collection and release. This should include an e-mail address and a persistent identifier such as an
                 ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_name:
             type: string
             description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_address:
             type: string
             description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         submitted_by:
             type: string
             description: >
                 Full contact information of the data depositor, i.e., the person submitting the data to a repository.
                 This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
                 supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data deposition)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:
diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml
index fe21bbb75..87a25b5bd 100644
--- a/lang/python/airr/specs/airr-schema.yaml
+++ b/lang/python/airr/specs/airr-schema.yaml
@@ -493,30 +493,115 @@ TimePoint:
 # TODO: link to global schema with JSON-LD?
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe individuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            description: Unique identifier of this contributor within the file
             x-airr:
+                nullable: true
                 identifier: true
                 miairr: important
         name:
             type: string
-            description: Full name of individual
-        institution_name:
-            type: string
-            description: Individual's department and institution name
+            description: Full name of contributor
+            x-airr:
+                 nullable: false
         orcid_id:
+            $ref: '#/Ontology'
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                for individual institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
+            type: string
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+            x-airr:
+                nullable: true
+        contributions:
+            type: array
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+            x-airr:
+                nullable: true
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
             type: string
-            description: Individual's ORCID identifier
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+            x-airr:
+                nullable: false
+        degree:
+            type: string
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+            x-airr:
+                nullable: true
 
 #
 # Germline gene schema
@@ -783,8 +868,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -810,21 +894,15 @@ AlleleDescription:
             example: OGRDB:Human_IGH:IGHV1-69*01.001
             x-airr:
                 miairr: important
-        maintainer:
-            type: string
-            description: Maintainer of this sequence record
-            x-airr:
-                miairr: defined
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
-                $ref: '#/Acknowledgement'
-        lab_address:
-            type: string
-            description: Institution and full address of corresponding author
-            x-airr:
-                miairr: defined
+                $ref: '#/Contributor'
         release_version:
             type: integer
             description: Version number of this record, updated whenever a revised version is published or released
@@ -1084,9 +1162,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1104,26 +1180,15 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            description: Corresponding author
-            x-airr:
-                miairr: important
-        lab_name:
-            type: string
-            description: Department of corresponding author
-            x-airr:
-                miairr: important
-        lab_address:
-            type: string
-            description: Institutional address of corresponding author
-            x-airr:
-                miairr: important
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The corresponding author should be listed last.
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             description: Version number of this record, allocated automatically
@@ -1593,71 +1658,81 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
+        contributors:
+            type: array
+            description: >
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The corresponding author should be listed last.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
+            x-airr:
+                nullable: false
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contributors
         study_contact:
             type: string
             description: >
                 Full contact information of the contact persons for this study This should include an e-mail address
                 and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                nullable: true
-                adc-query-support: true
-                name: Contact information (study)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         collected_by:
             type: string
             description: >
                 Full contact information of the data collector, i.e. the person who is legally responsible for data
                 collection and release. This should include an e-mail address and a persistent identifier such as an
                 ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_name:
             type: string
             description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_address:
             type: string
             description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         submitted_by:
             type: string
             description: >
                 Full contact information of the data depositor, i.e., the person submitting the data to a repository.
                 This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
                 supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data deposition)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:
diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml
index 6d5148be3..1ae5ad012 100644
--- a/specs/airr-schema-openapi3.yaml
+++ b/specs/airr-schema-openapi3.yaml
@@ -497,11 +497,11 @@ Contributor:
     properties:
         contributor_id:
             type: string
-            description: unique identifier of this contributor within the file
+            nullable: true
+            description: Unique identifier of this contributor within the file
             x-airr:
                 identifier: true
                 miairr: important
-            nullable: true
         name:
             type: string
             nullable: false
@@ -886,8 +886,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -915,24 +914,16 @@ AlleleDescription:
                 miairr: important
             description: Unique reference to the allele description, in standardized form (Repo:Label:Version)
             example: OGRDB:Human_IGH:IGHV1-69*01.001
-        maintainer:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Maintainer of this sequence record
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
                 $ref: '#/Contributor'
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Institution and full address of corresponding author
         release_version:
             type: integer
             nullable: true
@@ -1225,9 +1216,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1246,28 +1235,14 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Corresponding author
-        lab_name:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Department of corresponding author
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Institutional address of corresponding author
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The corresponding author should be listed last.
             items:
                 $ref: '#/Contributor'
         release_version:
@@ -1783,10 +1758,10 @@ Study:
             type: array
             nullable: false
             description: >
-                List of individuals who contributed to the study. Note that these are not necessarily identical with the
-                authors on an associated manuscript or other scholarly communication. Further note that at least the
-                following three CRediT contributor roles "supervision", "investigation" and "data curation" should be
-                assigned.
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The corresponding author should be listed last.
             title: Contributors
             items:
                 $ref: '#/Contributor'
@@ -1796,6 +1771,69 @@ Study:
                 set: 1
                 subset: study
                 name: Contributors
+        study_contact:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the contact persons for this study This should include an e-mail address
+                and a persistent identifier such as an ORCID ID.
+            x-airr:
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
+        collected_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data collector, i.e. the person who is legally responsible for data
+                collection and release. This should include an e-mail address and a persistent identifier such as an
+                ORCID ID.
+            x-airr:
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
+        lab_name:
+            type: string
+            nullable: true
+            description: Department of data collector
+            x-airr:
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
+        lab_address:
+            type: string
+            nullable: true
+            description: Institution and institutional address of data collector
+            x-airr:
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
+        submitted_by:
+            type: string
+            nullable: true
+            description: >
+                Full contact information of the data depositor, i.e., the person submitting the data to a repository.
+                This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
+                supposed to be a short-lived and technical role until the submission is relased.
+            x-airr:
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:
diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index fe21bbb75..87a25b5bd 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -493,30 +493,115 @@ TimePoint:
 # TODO: link to global schema with JSON-LD?
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe individuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            description: Unique identifier of this contributor within the file
             x-airr:
+                nullable: true
                 identifier: true
                 miairr: important
         name:
             type: string
-            description: Full name of individual
-        institution_name:
-            type: string
-            description: Individual's department and institution name
+            description: Full name of contributor
+            x-airr:
+                 nullable: false
         orcid_id:
+            $ref: '#/Ontology'
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                for individual institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                nullable: true
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
+            type: string
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+            x-airr:
+                nullable: true
+        contributions:
+            type: array
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+            x-airr:
+                nullable: true
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
             type: string
-            description: Individual's ORCID identifier
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+            x-airr:
+                nullable: false
+        degree:
+            type: string
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+            x-airr:
+                nullable: true
 
 #
 # Germline gene schema
@@ -783,8 +868,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -810,21 +894,15 @@ AlleleDescription:
             example: OGRDB:Human_IGH:IGHV1-69*01.001
             x-airr:
                 miairr: important
-        maintainer:
-            type: string
-            description: Maintainer of this sequence record
-            x-airr:
-                miairr: defined
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
-                $ref: '#/Acknowledgement'
-        lab_address:
-            type: string
-            description: Institution and full address of corresponding author
-            x-airr:
-                miairr: defined
+                $ref: '#/Contributor'
         release_version:
             type: integer
             description: Version number of this record, updated whenever a revised version is published or released
@@ -1084,9 +1162,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1104,26 +1180,15 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            description: Corresponding author
-            x-airr:
-                miairr: important
-        lab_name:
-            type: string
-            description: Department of corresponding author
-            x-airr:
-                miairr: important
-        lab_address:
-            type: string
-            description: Institutional address of corresponding author
-            x-airr:
-                miairr: important
         acknowledgements:
             type: array
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The corresponding author should be listed last.
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             description: Version number of this record, allocated automatically
@@ -1593,71 +1658,81 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
+        contributors:
+            type: array
+            description: >
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The corresponding author should be listed last.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
+            x-airr:
+                nullable: false
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contributors
         study_contact:
             type: string
             description: >
                 Full contact information of the contact persons for this study This should include an e-mail address
                 and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                nullable: true
-                adc-query-support: true
-                name: Contact information (study)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         collected_by:
             type: string
             description: >
                 Full contact information of the data collector, i.e. the person who is legally responsible for data
                 collection and release. This should include an e-mail address and a persistent identifier such as an
                 ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_name:
             type: string
             description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_address:
             type: string
             description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         submitted_by:
             type: string
             description: >
                 Full contact information of the data depositor, i.e., the person submitting the data to a repository.
                 This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
                 supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                nullable: true
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data deposition)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:

From e1dab4a04eb6bda6ba6927bb48abbe32e8650176 Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Wed, 21 Feb 2024 03:18:22 +0100
Subject: [PATCH 11/15] Update R and Python Schema files

---
 lang/R/R/Schema.R          | 2 +-
 lang/python/airr/schema.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/lang/R/R/Schema.R b/lang/R/R/Schema.R
index da7cb1056..c6925e67f 100644
--- a/lang/R/R/Schema.R
+++ b/lang/R/R/Schema.R
@@ -312,7 +312,7 @@ AIRRSchema <- list("Info"=load_schema("InfoObject"),
                    "SequencingData"=load_schema("SequencingData"),
                    "DataProcessing"=load_schema("DataProcessing"),
                    "GermlineSet"=load_schema("GermlineSet"),
-                   "Acknowledgement"=load_schema("Acknowledgement"),
+                   "Contributor"=load_schema("Contributor"),
                    "RearrangedSequence"=load_schema("RearrangedSequence"),
                    "UnrearrangedSequence"=load_schema("UnrearrangedSequence"),
                    "SequenceDelineationV"=load_schema("SequenceDelineationV"),
diff --git a/lang/python/airr/schema.py b/lang/python/airr/schema.py
index 28967d33c..28de0a859 100644
--- a/lang/python/airr/schema.py
+++ b/lang/python/airr/schema.py
@@ -544,7 +544,7 @@ def _default(spec):
     'SequencingData': Schema('SequencingData'),
     'DataProcessing': Schema('DataProcessing'),
     'GermlineSet': Schema('GermlineSet'),
-    'Acknowledgement': Schema('Acknowledgement'),
+    'Contributor': Schema('Contributor'),
     'RearrangedSequence': Schema('RearrangedSequence'),
     'UnrearrangedSequence': Schema('UnrearrangedSequence'),
     'SequenceDelineationV': Schema('SequenceDelineationV'),

From 44b04655add076901b109be65e5f925198cd4d05 Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Wed, 21 Feb 2024 03:11:27 +0100
Subject: [PATCH 12/15] Update example and test files for R and Python

---
 lang/R/inst/extdata/airr-schema.yaml          |   5 +-
 lang/R/inst/extdata/germline-example.json     | 182 ++--
 lang/R/inst/extdata/repertoire-example.yaml   | 182 +++-
 lang/R/tests/data-tests/bad_genotype_set.json |  86 +-
 lang/R/tests/data-tests/bad_germline_set.json | 102 ++-
 lang/R/tests/data-tests/bad_repertoire.yaml   |  96 +-
 .../tests/data-tests/good_combined_airr.json  | 449 +++++++--
 .../tests/data-tests/good_combined_airr.yaml  | 287 ++++--
 .../R/tests/data-tests/good_genotype_set.json |  74 +-
 .../R/tests/data-tests/good_germline_set.json | 108 ++-
 lang/R/tests/data-tests/good_repertoire.yaml  | 146 ++-
 lang/python/airr/specs/airr-schema.yaml       |   5 +-
 lang/python/tests/data/bad_genotype_set.json  |   2 +-
 lang/python/tests/data/bad_germline_set.json  |  94 +-
 lang/python/tests/data/bad_repertoire.yaml    |  90 +-
 .../python/tests/data/good_combined_airr.json | 349 +++++--
 .../python/tests/data/good_combined_airr.yaml | 856 ++++++++++--------
 lang/python/tests/data/good_genotype_set.json |   2 +-
 lang/python/tests/data/good_germline_set.json | 108 ++-
 lang/python/tests/data/good_repertoire.yaml   | 128 ++-
 specs/airr-schema-openapi3.yaml               |   5 +-
 specs/airr-schema.yaml                        |   5 +-
 22 files changed, 2364 insertions(+), 997 deletions(-)

diff --git a/lang/R/inst/extdata/airr-schema.yaml b/lang/R/inst/extdata/airr-schema.yaml
index 87a25b5bd..dd2c0c241 100644
--- a/lang/R/inst/extdata/airr-schema.yaml
+++ b/lang/R/inst/extdata/airr-schema.yaml
@@ -1571,10 +1571,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:
diff --git a/lang/R/inst/extdata/germline-example.json b/lang/R/inst/extdata/germline-example.json
index 926b6d428..9d41e5f38 100644
--- a/lang/R/inst/extdata/germline-example.json
+++ b/lang/R/inst/extdata/germline-example.json
@@ -1,17 +1,41 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
-        "pub_ids": "",
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "pub_ids": [""],
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -19,11 +43,33 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -36,7 +82,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -70,7 +119,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -187,11 +236,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -204,7 +275,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -238,7 +312,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -356,40 +430,40 @@
         "curation": null
     }],
 
-	"GenotypeSet": [{
-		"receptor_genotype_set_id": "1",
-		"genotype_class_list": [
-			{
-				"receptor_genotype_id": "1",
-				"locus": "IGH",
-				"documented_alleles": [
-					{
-						"label": "IGHV1-69*01",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 1
-					},
-					{
-						"label": "IGHV1-69*02",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 2
-					}
-				],
-				"undocumented_alleles": [
-					{
-						"allele_name": "IGHD3-1*01_S1234",
-						"sequence": "agtagtagtagt",
-						"phasing": 1
-					}
-				],
-				"deleted_genes": [
-					{
-						"label": "IGHV3-30-3",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 1
-					}
-				],
-				"inference_process": "repertoire_sequencing"
-			}
-		]
-	}]
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
 }
diff --git a/lang/R/inst/extdata/repertoire-example.yaml b/lang/R/inst/extdata/repertoire-example.yaml
index 5d6808bcc..6adaa2361 100644
--- a/lang/R/inst/extdata/repertoire-example.yaml
+++ b/lang/R/inst/extdata/repertoire-example.yaml
@@ -11,31 +11,58 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
-      pub_ids: "PMID:27005435"
-      collected_by: null
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
+      pub_ids: ["PMID:27005435"]
       grants: null
-      keywords_study: 
+      keywords_study:
         - "contains_ig"
         - "contains_tr"
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
-      ancestry_population: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
       ethnicity: null
       race: null
       strain_name: null
@@ -58,15 +85,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000788"
+          id: "CL:0000788"
           label: "naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -77,7 +104,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905656
+          sequencing_data_id: SRA:SRR2905656
           file_type: fastq
           filename: SRR2905656_R1.fastq.gz
           read_direction: forward
@@ -85,6 +112,8 @@ Repertoire:
           paired_filename: SRR2905656_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905656_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
@@ -92,6 +121,9 @@ Repertoire:
           id: null
           label: null
         collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
         biomaterial_provider: null
         cell_number: null
         cells_per_reaction: null
@@ -134,13 +166,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
-      pub_ids: "PMID:27005435"
-      collected_by: null
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
+      pub_ids: ["PMID:27005435"]
       grants: null
       keywords_study:
         - "contains_ig"
@@ -149,16 +203,21 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
-      ancestry_population: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
       ethnicity: null
       race: null
       strain_name: null
@@ -181,15 +240,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000787"
+          id: "CL:0000787"
           label: "memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -200,7 +259,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905655
+          sequencing_data_id: SRA:SRR2905655
           file_type: fastq
           filename: SRR2905655_R1.fastq.gz
           read_direction: forward
@@ -208,6 +267,8 @@ Repertoire:
           paired_filename: SRR2905655_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905655_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
@@ -215,6 +276,9 @@ Repertoire:
           id: null
           label: null
         collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
         biomaterial_provider: null
         cell_number: null
         cells_per_reaction: null
@@ -257,13 +321,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
-      pub_ids: "PMID:27005435"
-      collected_by: null
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
+      pub_ids: ["PMID:27005435"]
       grants: null
       keywords_study:
         - "contains_ig"
@@ -272,16 +358,21 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
-      ancestry_population: null
+      ancestry_population:
+        id: null
+        label: null
+      location_birth:
+        id: null
+        label: null
       ethnicity: null
       race: null
       strain_name: null
@@ -304,15 +395,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000895"
+          id: "CL:0000895"
           label: "naive thymus-derived CD4-positive, alpha-beta T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -323,7 +414,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905659
+          sequencing_data_id: SRA:SRR2905659
           file_type: fastq
           filename: SRR2905659_R1.fastq.gz
           read_direction: forward
@@ -331,6 +422,8 @@ Repertoire:
           paired_filename: SRR2905659_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905659_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
@@ -338,6 +431,9 @@ Repertoire:
           id: null
           label: null
         collection_time_point_reference: null
+        collection_location:
+          id: null
+          label: null
         biomaterial_provider: null
         cell_number: null
         cells_per_reaction: null
diff --git a/lang/R/tests/data-tests/bad_genotype_set.json b/lang/R/tests/data-tests/bad_genotype_set.json
index 48825e1f8..01709d60a 100644
--- a/lang/R/tests/data-tests/bad_genotype_set.json
+++ b/lang/R/tests/data-tests/bad_genotype_set.json
@@ -1,44 +1,44 @@
 {
-	"GenotypeSet": [{
-		"receptor_genotype_set_id": "1",
-		"genotype_class_list": [
-			{
-				"receptor_genotype_id": "1",
-				"locus": 1,
-				"documented_alleles": [
-					{
-						"label": "IGHV1-69*01",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 1
-					},
-					{
-						"label": "IGHV1-69*02",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 2
-					},
-					{
-						"label": "IGHV1-69*02",
-						"name": "1234",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 2
-					}
-				],
-				"undocumented_alleles": [
-					{
-						"allele_name": "IGHD3-1*01_S1234",
-						"sequence": "agtagtagtagt",
-						"phasing": 1
-					}
-				],
-				"deleted_genes": [
-					{
-						"label": "IGHV3-30-3",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": "1"
-					}
-				],
-				"inference_process": "repertoire_sequencing"
-			}
-		]
-	}]
-}
\ No newline at end of file
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "name": "1234",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": "1"
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
+}
diff --git a/lang/R/tests/data-tests/bad_germline_set.json b/lang/R/tests/data-tests/bad_germline_set.json
index 0aeea9a2f..28531aabb 100644
--- a/lang/R/tests/data-tests/bad_germline_set.json
+++ b/lang/R/tests/data-tests/bad_germline_set.json
@@ -1,27 +1,71 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species_typo": ["Mouse"],
+        "species": "Mouse",
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
-        "locus": 1,
+        "locus": "IGH",
         "allele_descriptions": [
             {
                 "allele_description_id": "OGRDB:A00301",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -29,7 +73,7 @@
                 "aliases": [
                     "watson_et_al:CAST_EiJ_IGHV5-3"
                 ],
-                "locus": 1,
+                "locus": "IGH",
                 "chromosome": null,
                 "sequence_type": "V",
                 "functional": true,
@@ -66,7 +110,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -177,16 +221,38 @@
                 "unrearranged_support": [],
                 "rearranged_support": [],
                 "paralogs": [],
-                "notes": "Imported to OGRDB with the following notes:\r\nwatson_et_al: CAST_EiJ_IGHV5-3",
+                "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3",
                 "curational_tags": null
             },
             {
                 "allele_description_id": "OGRDB:A00314",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -231,7 +297,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -342,7 +408,7 @@
                 "unrearranged_support": [],
                 "rearranged_support": [],
                 "paralogs": [],
-                "notes": "Imported to OGRDB with the following notes:\r\nwatson_et_al: CAST_EiJ_IGHV8-2",
+                "notes": "Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2",
                 "curational_tags": null
             }
         ],
diff --git a/lang/R/tests/data-tests/bad_repertoire.yaml b/lang/R/tests/data-tests/bad_repertoire.yaml
index 57b0b7312..f35355e98 100644
--- a/lang/R/tests/data-tests/bad_repertoire.yaml
+++ b/lang/R/tests/data-tests/bad_repertoire.yaml
@@ -8,21 +8,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -32,7 +50,7 @@ Repertoire:
         cell_subset: "Naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -56,21 +74,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -80,7 +116,7 @@ Repertoire:
         cell_subset: "Memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -104,21 +140,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -128,7 +182,7 @@ Repertoire:
         cell_subset: "Naive CD4+ T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
diff --git a/lang/R/tests/data-tests/good_combined_airr.json b/lang/R/tests/data-tests/good_combined_airr.json
index aa7d52ec1..0ef2106ae 100644
--- a/lang/R/tests/data-tests/good_combined_airr.json
+++ b/lang/R/tests/data-tests/good_combined_airr.json
@@ -10,13 +10,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -27,25 +66,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
-                "sex": "F",
+                "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -65,7 +104,65 @@
                         "intervention": null,
                         "medical_history": null
                     }
-                ]
+                ],
+                "genotype": {
+                    "receptor_genotype_set": {
+                        "receptor_genotype_set_id": "1",
+                        "genotype_class_list": [
+                            {
+                                "receptor_genotype_id": "1",
+                                "locus": "IGH",
+                                "documented_alleles": [
+                                    {
+                                        "label": "IGHV1-69*01",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 1
+                                    },
+                                    {
+                                        "label": "IGHV1-69*02",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 2
+                                    }
+                                ],
+                                "undocumented_alleles": [
+                                    {
+                                        "allele_name": "IGHD3-1*01_S1234",
+                                        "sequence": "agtagtagtagt",
+                                        "phasing": 1
+                                    }
+                                ],
+                                "deleted_genes": [
+                                    {
+                                        "label": "IGHV3-30-3",
+                                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                                        "phasing": 1
+                                    }
+                                ],
+                                "inference_process": "repertoire_sequencing"
+                            }
+                        ]
+                    },
+                    "mhc_genotype_set": {
+                        "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66",
+                        "mhc_genotype_list": [
+                            {
+                                "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7",
+                                "mhc_class": "MHC-I",
+                                "mhc_genotyping_method": "pcr_low_resolution",
+                                "mhc_alleles": [
+                                    {
+                                        "allele_designation": "01:01",
+                                        "gene": {
+                                            "id": "MRO-0000046",
+                                            "label": "HLA-A"
+                                        },
+                                        "reference_set_ref": null
+                                    }
+                                ]
+                            }
+                        ]
+                    }
+                }
             },
             "sample": [
                 {
@@ -73,17 +170,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000788",
+                        "id": "CL:0000788",
                         "label": "naive B cell"
                     },
                     "cell_phenotype": "expression of CD20 and the absence of CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -115,7 +212,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -169,13 +266,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -186,25 +322,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
-                "sex": "F",
+                "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -232,17 +368,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000787",
+                        "id": "CL:0000787",
                         "label": "memory B cell"
                     },
                     "cell_phenotype": "expression of CD20 and CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -274,7 +410,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -328,13 +464,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -345,25 +520,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
-                "sex": "F",
+                "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -391,17 +566,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000895",
+                        "id": "CL:0000895",
                         "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
                     },
                     "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -433,7 +608,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -479,19 +654,44 @@
         }
     ],
 
+
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "3",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department": null,
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -499,15 +699,37 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                 "aliases": [
                     "watson_et_al:CAST_EiJ_IGHV5-3"
                 ],
@@ -516,7 +738,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -537,18 +762,20 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                         "fwr1_start": 1,
-                        "fwr1_end": 78,
-                        "cdr1_start": 79,
-                        "cdr1_end": 114,
-                        "fwr2_start": 115,
-                        "fwr2_end": 165,
-                        "cdr2_start": 166,
-                        "cdr2_end": 195,
-                        "fwr3_start": 196,
-                        "fwr3_end": 312,
-                        "cdr3_start": 313,
-                        "alignment": [
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -665,15 +892,37 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
-                "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                "coding_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
                 "aliases": [
                     "watson_et_al:CAST_EiJ_IGHV8-2"
                 ],
@@ -682,7 +931,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -703,18 +955,20 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
+                        "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                        "aligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
                         "fwr1_start": 1,
-                        "fwr1_end": 78,
-                        "cdr1_start": 79,
-                        "cdr1_end": 114,
-                        "fwr2_start": 115,
-                        "fwr2_end": 165,
-                        "cdr2_start": 166,
-                        "cdr2_end": 195,
-                        "fwr3_start": 196,
-                        "fwr3_end": 312,
-                        "cdr3_start": 313,
-                        "alignment": [
+                        "fwr1_end": 75,
+                        "cdr1_start": 76,
+                        "cdr1_end": 110,
+                        "fwr2_start": 111,
+                        "fwr2_end": 150,
+                        "cdr2_start": 151,
+                        "cdr2_end": 160,
+                        "fwr3_start": 161,
+                        "fwr3_end": 294,
+                        "cdr3_start": 295,
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -831,7 +1085,6 @@
         ],
         "curation": null
     }],
-
     "GenotypeSet": [{
         "receptor_genotype_set_id": "1",
         "genotype_class_list": [
diff --git a/lang/R/tests/data-tests/good_combined_airr.yaml b/lang/R/tests/data-tests/good_combined_airr.yaml
index f4fdcb0ef..2c9ab547c 100644
--- a/lang/R/tests/data-tests/good_combined_airr.yaml
+++ b/lang/R/tests/data-tests/good_combined_airr.yaml
@@ -21,13 +21,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -36,13 +58,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -67,20 +89,54 @@ Repertoire:
           immunogen:
           intervention:
           medical_history:
+      genotype:
+        receptor_genotype_set:
+          receptor_genotype_set_id: "1"
+          genotype_class_list:
+            - receptor_genotype_id: "1"
+              locus: IGH
+              documented_alleles:
+                - label: IGHV1-69*01
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 1
+                - label: IGHV1-69*02
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 2
+              undocumented_alleles:
+                - allele_name: IGHD3-1*01_S1234
+                  sequence: agtagtagtagt
+                  phasing: 1
+              deleted_genes:
+                - label: IGHV3-30-3
+                  germline_set_ref: IMGT:Homo sapiens:2022.1.31
+                  phasing: 1
+              inference_process: repertoire_sequencing
+        mhc_genotype_set:
+          mhc_genotype_set_id: 01847298-d0c2-11ee-bc66
+          mhc_genotype_list:
+            - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7
+              mhc_class: MHC-I
+              mhc_genotyping_method: pcr_low_resolution
+              mhc_alleles:
+                - allele_designation: "01:01"
+                  gene:
+                    id: MRO-0000046
+                    label: HLA-A
+                  reference_set_ref:
     sample:
       - sample_id: TW01A_B_naive
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000788
+          id: CL:0000788
           label: naive B cell
         cell_phenotype: expression of CD20 and the absence of CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -164,13 +220,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -179,13 +257,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -215,15 +293,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000787
+          id: CL:0000787
           label: memory B cell
         cell_phenotype: expression of CD20 and CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -307,13 +385,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -322,13 +422,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -358,15 +458,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000895
+          id: CL:0000895
           label: naive thymus-derived CD4-positive, alpha-beta T cell
         cell_phenotype: expression of CD8 and absence of CD4 and CD45RO
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -431,16 +531,27 @@ Repertoire:
 
 GermlineSet:
   - germline_set_id: OGRDB:G00007
-    author: William Lees
-    lab_name: ''
-    lab_address: Birkbeck College, University of London, Malet Street, London
-    acknowledgements: []
+    acknowledgements:
+      - contributor_id: "3"
+        name: William Lees
+        orcid_id:
+          id: ORCID:0000-0001-9834-6840
+          label: William Lees
+        affiliation:
+          id: ROR:02mb95055
+          label: Birkbeck, University of London
+        affiliation_department:
+        contributions:
+          - role: investigation
+            degree: null
+          - role: data curation
+            degree: null
     release_version: 1
-    release_description: ''
-    release_date: '2021-11-24'
+    release_description: ""
+    release_date: "2021-11-24"
     germline_set_name: CAST IGH
     germline_set_ref: OGRDB:G00007.1
-    pub_ids: ['']
+    pub_ids: [""]
     species:
       id: NCBITAXON:10090
       label: Mus musculus
@@ -450,15 +561,27 @@ GermlineSet:
     allele_descriptions:
       - allele_description_id: OGRDB:A00301
         allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
-        maintainer: William Lees
-        acknowledgements: []
-        lab_address: Birkbeck College, University of London, Malet Street, London
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
         release_version: 1
-        release_date: 24-Nov-2021
+        release_date: "2021-11-24"
         release_description: First release
         label: IGHV-2DBF
         sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-        coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+        coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
         aliases:
           - watson_et_al:CAST_EiJ_IGHV5-3
         locus: IGH
@@ -488,18 +611,20 @@ GermlineSet:
         v_gene_delineations:
           - sequence_delineation_id: '1'
             delineation_scheme: IMGT
+            unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+            aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
             fwr1_start: 1
-            fwr1_end: 78
-            cdr1_start: 79
-            cdr1_end: 114
-            fwr2_start: 115
-            fwr2_end: 165
-            cdr2_start: 166
-            cdr2_end: 195
-            fwr3_start: 196
-            fwr3_end: 312
-            cdr3_start: 313
-            alignment:
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
               - '1'
               - '2'
               - '3'
@@ -611,15 +736,27 @@ GermlineSet:
         curational_tags:
       - allele_description_id: OGRDB:A00314
         allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
-        maintainer: William Lees
-        acknowledgements: []
-        lab_address: Birkbeck College, University of London, Malet Street, London
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
         release_version: 1
-        release_date: 24-Nov-2021
+        release_date: "2021-11-24"
         release_description: First release
         label: IGHV-2ETO
         sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
-        coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCT...GGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGC......ACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGAT.........GATGATAAGTACTATAACCCATCCCTGAAG...AGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+        coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
         aliases:
           - watson_et_al:CAST_EiJ_IGHV8-2
         locus: IGH
@@ -649,18 +786,20 @@ GermlineSet:
         v_gene_delineations:
           - sequence_delineation_id: '1'
             delineation_scheme: IMGT
+            unaligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+            aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
             fwr1_start: 1
-            fwr1_end: 78
-            cdr1_start: 79
-            cdr1_end: 114
-            fwr2_start: 115
-            fwr2_end: 165
-            cdr2_start: 166
-            cdr2_end: 195
-            fwr3_start: 196
-            fwr3_end: 312
-            cdr3_start: 313
-            alignment:
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
               - '1'
               - '2'
               - '3'
@@ -773,9 +912,9 @@ GermlineSet:
     curation:
 
 GenotypeSet:
-  - receptor_genotype_set_id: '1'
+  - receptor_genotype_set_id: "1"
     genotype_class_list:
-      - receptor_genotype_id: '1'
+      - receptor_genotype_id: "1"
         locus: IGH
         documented_alleles:
           - label: IGHV1-69*01
diff --git a/lang/R/tests/data-tests/good_genotype_set.json b/lang/R/tests/data-tests/good_genotype_set.json
index 4335b02e1..abd24646c 100644
--- a/lang/R/tests/data-tests/good_genotype_set.json
+++ b/lang/R/tests/data-tests/good_genotype_set.json
@@ -1,38 +1,38 @@
 {
-	"GenotypeSet": [{
-		"receptor_genotype_set_id": "1",
-		"genotype_class_list": [
-			{
-				"receptor_genotype_id": "1",
-				"locus": "IGH",
-				"documented_alleles": [
-					{
-						"label": "IGHV1-69*01",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 1
-					},
-					{
-						"label": "IGHV1-69*02",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 2
-					}
-				],
-				"undocumented_alleles": [
-					{
-						"allele_name": "IGHD3-1*01_S1234",
-						"sequence": "agtagtagtagt",
-						"phasing": 1
-					}
-				],
-				"deleted_genes": [
-					{
-						"label": "IGHV3-30-3",
-						"germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
-						"phasing": 1
-					}
-				],
-				"inference_process": "repertoire_sequencing"
-			}
-		]
-	}]
-}
\ No newline at end of file
+    "GenotypeSet": [{
+        "receptor_genotype_set_id": "1",
+        "genotype_class_list": [
+            {
+                "receptor_genotype_id": "1",
+                "locus": "IGH",
+                "documented_alleles": [
+                    {
+                        "label": "IGHV1-69*01",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    },
+                    {
+                        "label": "IGHV1-69*02",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 2
+                    }
+                ],
+                "undocumented_alleles": [
+                    {
+                        "allele_name": "IGHD3-1*01_S1234",
+                        "sequence": "agtagtagtagt",
+                        "phasing": 1
+                    }
+                ],
+                "deleted_genes": [
+                    {
+                        "label": "IGHV3-30-3",
+                        "germline_set_ref": "IMGT:Homo sapiens:2022.1.31",
+                        "phasing": 1
+                    }
+                ],
+                "inference_process": "repertoire_sequencing"
+            }
+        ]
+    }]
+}
diff --git a/lang/R/tests/data-tests/good_germline_set.json b/lang/R/tests/data-tests/good_germline_set.json
index 41ecf5f7d..e74c590dc 100644
--- a/lang/R/tests/data-tests/good_germline_set.json
+++ b/lang/R/tests/data-tests/good_germline_set.json
@@ -1,17 +1,41 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -19,11 +43,33 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -36,7 +82,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -70,7 +119,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -187,11 +236,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -204,7 +275,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -238,7 +312,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/lang/R/tests/data-tests/good_repertoire.yaml b/lang/R/tests/data-tests/good_repertoire.yaml
index c935c9b67..6adaa2361 100644
--- a/lang/R/tests/data-tests/good_repertoire.yaml
+++ b/lang/R/tests/data-tests/good_repertoire.yaml
@@ -11,28 +11,50 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
-      keywords_study: 
+      keywords_study:
         - "contains_ig"
         - "contains_tr"
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -63,15 +85,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000788"
+          id: "CL:0000788"
           label: "naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -82,7 +104,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905656
+          sequencing_data_id: SRA:SRR2905656
           file_type: fastq
           filename: SRR2905656_R1.fastq.gz
           read_direction: forward
@@ -90,6 +112,8 @@ Repertoire:
           paired_filename: SRR2905656_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905656_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
@@ -142,13 +166,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -157,13 +203,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -194,15 +240,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000787"
+          id: "CL:0000787"
           label: "memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -213,7 +259,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905655
+          sequencing_data_id: SRA:SRR2905655
           file_type: fastq
           filename: SRR2905655_R1.fastq.gz
           read_direction: forward
@@ -221,6 +267,8 @@ Repertoire:
           paired_filename: SRR2905655_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905655_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
@@ -273,13 +321,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -288,13 +358,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
-      sex: F
+      sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -325,15 +395,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000895"
+          id: "CL:0000895"
           label: "naive thymus-derived CD4-positive, alpha-beta T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -344,7 +414,7 @@ Repertoire:
             reverse_pcr_primer_target_location: null
         sequencing_platform: "Illumina MiSeq"
         sequencing_files:
-          sequencing_data_id: SRR2905659
+          sequencing_data_id: SRA:SRR2905659
           file_type: fastq
           filename: SRR2905659_R1.fastq.gz
           read_direction: forward
@@ -352,6 +422,8 @@ Repertoire:
           paired_filename: SRR2905659_R2.fastq.gz
           paired_read_direction: reverse
           paired_read_length: 300
+          index_filename: SRR2905659_R3.fastq.gz
+          index_length: 8
         anatomic_site: null
         disease_state_sample: null
         collection_time_point_relative: null
diff --git a/lang/python/airr/specs/airr-schema.yaml b/lang/python/airr/specs/airr-schema.yaml
index 87a25b5bd..dd2c0c241 100644
--- a/lang/python/airr/specs/airr-schema.yaml
+++ b/lang/python/airr/specs/airr-schema.yaml
@@ -1571,10 +1571,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:
diff --git a/lang/python/tests/data/bad_genotype_set.json b/lang/python/tests/data/bad_genotype_set.json
index c58a39027..01709d60a 100644
--- a/lang/python/tests/data/bad_genotype_set.json
+++ b/lang/python/tests/data/bad_genotype_set.json
@@ -41,4 +41,4 @@
             }
         ]
     }]
-}
\ No newline at end of file
+}
diff --git a/lang/python/tests/data/bad_germline_set.json b/lang/python/tests/data/bad_germline_set.json
index 168cc1fa5..28531aabb 100644
--- a/lang/python/tests/data/bad_germline_set.json
+++ b/lang/python/tests/data/bad_germline_set.json
@@ -1,27 +1,71 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": ["Mouse"],
+        "species": "Mouse",
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
         "allele_descriptions": [
             {
                 "allele_description_id": "OGRDB:A00301",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -66,7 +110,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -182,11 +226,33 @@
             },
             {
                 "allele_description_id": "OGRDB:A00314",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -231,7 +297,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/lang/python/tests/data/bad_repertoire.yaml b/lang/python/tests/data/bad_repertoire.yaml
index 2de377cb3..f35355e98 100644
--- a/lang/python/tests/data/bad_repertoire.yaml
+++ b/lang/python/tests/data/bad_repertoire.yaml
@@ -8,21 +8,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -32,7 +50,7 @@ Repertoire:
         cell_subset: "Naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -56,21 +74,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -80,7 +116,7 @@ Repertoire:
         cell_subset: "Memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -104,21 +140,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -128,7 +182,7 @@ Repertoire:
         cell_subset: "Naive CD4+ T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
diff --git a/lang/python/tests/data/good_combined_airr.json b/lang/python/tests/data/good_combined_airr.json
index 9101b24a9..0ef2106ae 100644
--- a/lang/python/tests/data/good_combined_airr.json
+++ b/lang/python/tests/data/good_combined_airr.json
@@ -10,13 +10,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -27,25 +66,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -104,10 +143,10 @@
                         ]
                     },
                     "mhc_genotype_set": {
-                        "mhc_genotype_set_id": "this is a unique identifier",
+                        "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66",
                         "mhc_genotype_list": [
                             {
-                                "mhc_genotype_id": "unique",
+                                "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7",
                                 "mhc_class": "MHC-I",
                                 "mhc_genotyping_method": "pcr_low_resolution",
                                 "mhc_alleles": [
@@ -117,7 +156,7 @@
                                             "id": "MRO-0000046",
                                             "label": "HLA-A"
                                         },
-                                        "reference_set_ref": "blah"
+                                        "reference_set_ref": null
                                     }
                                 ]
                             }
@@ -131,17 +170,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000788",
+                        "id": "CL:0000788",
                         "label": "naive B cell"
                     },
                     "cell_phenotype": "expression of CD20 and the absence of CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -173,7 +212,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -227,13 +266,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -244,25 +322,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -290,17 +368,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000787",
+                        "id": "CL:0000787",
                         "label": "memory B cell"
                     },
                     "cell_phenotype": "expression of CD20 and CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -332,7 +410,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -386,13 +464,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -403,25 +520,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -449,17 +566,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000895",
+                        "id": "CL:0000895",
                         "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
                     },
                     "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -491,7 +608,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -540,17 +657,41 @@
 
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "3",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department": null,
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -558,15 +699,37 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
-                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                 "aliases": [
                     "watson_et_al:CAST_EiJ_IGHV5-3"
                 ],
@@ -575,7 +738,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -596,8 +762,8 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                         "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                         "fwr1_start": 1,
                         "fwr1_end": 75,
                         "cdr1_start": 76,
@@ -609,7 +775,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -726,11 +892,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -743,7 +931,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -764,8 +955,8 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
-                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                        "aligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
                         "fwr1_start": 1,
                         "fwr1_end": 75,
                         "cdr1_start": 76,
@@ -777,7 +968,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/lang/python/tests/data/good_combined_airr.yaml b/lang/python/tests/data/good_combined_airr.yaml
index 80d0fe3a2..2c9ab547c 100644
--- a/lang/python/tests/data/good_combined_airr.yaml
+++ b/lang/python/tests/data/good_combined_airr.yaml
@@ -21,13 +21,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -36,13 +58,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -69,9 +91,9 @@ Repertoire:
           medical_history:
       genotype:
         receptor_genotype_set:
-          receptor_genotype_set_id: '1'
+          receptor_genotype_set_id: "1"
           genotype_class_list:
-            - receptor_genotype_id: '1'
+            - receptor_genotype_id: "1"
               locus: IGH
               documented_alleles:
                 - label: IGHV1-69*01
@@ -90,31 +112,31 @@ Repertoire:
                   phasing: 1
               inference_process: repertoire_sequencing
         mhc_genotype_set:
-          mhc_genotype_set_id: "this is a unique identifier"
+          mhc_genotype_set_id: 01847298-d0c2-11ee-bc66
           mhc_genotype_list:
-            - mhc_genotype_id: unique
+            - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7
               mhc_class: MHC-I
               mhc_genotyping_method: pcr_low_resolution
               mhc_alleles:
                 - allele_designation: "01:01"
                   gene:
-                    id: "MRO-0000046"
-                    label: "HLA-A"
-                  reference_set_ref: blah
+                    id: MRO-0000046
+                    label: HLA-A
+                  reference_set_ref:
     sample:
       - sample_id: TW01A_B_naive
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000788
+          id: CL:0000788
           label: naive B cell
         cell_phenotype: expression of CD20 and the absence of CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -198,13 +220,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -213,13 +257,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -249,15 +293,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000787
+          id: CL:0000787
           label: memory B cell
         cell_phenotype: expression of CD20 and CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -341,13 +385,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -356,13 +422,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -392,15 +458,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000895
+          id: CL:0000895
           label: naive thymus-derived CD4-positive, alpha-beta T cell
         cell_phenotype: expression of CD8 and absence of CD4 and CD45RO
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -464,357 +530,391 @@ Repertoire:
         analysis_provenance_id: 4625424004665971176-242ac11c-0001-012
 
 GermlineSet:
-- acknowledgements: []
-  allele_descriptions:
-  - acknowledgements: []
-    aliases:
-    - watson_et_al:CAST_EiJ_IGHV5-3
-    allele_description_id: OGRDB:A00301
-    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
-    allele_designation: null
-    chromosome: null
-    coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3'
-    curational_tags: null
-    functional: true
-    gene_designation: null
-    gene_end: null
-    gene_start: null
-    inference_type: rearranged_only
-    lab_address: Birkbeck College, University of London, Malet Street, London
-    label: IGHV-2DBF
-    leader_1_end: null
-    leader_1_start: null
-    leader_2_end: null
-    leader_2_start: null
-    locus: IGH
-    maintainer: William Lees
-    paralogs: []
-    rearranged_support: []
-    release_date: 24-Nov-2021
-    release_description: First release
+  - germline_set_id: OGRDB:G00007
+    acknowledgements:
+      - contributor_id: "3"
+        name: William Lees
+        orcid_id:
+          id: ORCID:0000-0001-9834-6840
+          label: William Lees
+        affiliation:
+          id: ROR:02mb95055
+          label: Birkbeck, University of London
+        affiliation_department:
+        contributions:
+          - role: investigation
+            degree: null
+          - role: data curation
+            degree: null
     release_version: 1
-    sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    sequence_type: V
+    release_description: ""
+    release_date: "2021-11-24"
+    germline_set_name: CAST IGH
+    germline_set_ref: OGRDB:G00007.1
+    pub_ids: [""]
     species:
       id: NCBITAXON:10090
       label: Mus musculus
     species_subgroup: CAST_EiJ
     species_subgroup_type: strain
-    status: active
-    subgroup_designation: null
-    unrearranged_support: []
-    utr_5_prime_end: null
-    utr_5_prime_start: null
-    v_gene_delineations:
-    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-      alignment:
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      - '6'
-      - '7'
-      - '8'
-      - '9'
-      - '10'
-      - '11'
-      - '12'
-      - '13'
-      - '14'
-      - '15'
-      - '16'
-      - '17'
-      - '18'
-      - '19'
-      - '20'
-      - '21'
-      - '22'
-      - '23'
-      - '24'
-      - '25'
-      - '26'
-      - '27'
-      - '28'
-      - '29'
-      - '30'
-      - '31'
-      - '32'
-      - '33'
-      - '34'
-      - '35'
-      - '36'
-      - '37'
-      - '38'
-      - '39'
-      - '40'
-      - '41'
-      - '42'
-      - '43'
-      - '44'
-      - '45'
-      - '46'
-      - '47'
-      - '48'
-      - '49'
-      - '50'
-      - '51'
-      - '52'
-      - '53'
-      - '54'
-      - '55'
-      - '56'
-      - '57'
-      - '58'
-      - '59'
-      - '60'
-      - '61'
-      - '62'
-      - '63'
-      - '64'
-      - '65'
-      - '66'
-      - '67'
-      - '68'
-      - '69'
-      - '70'
-      - '71'
-      - '72'
-      - '73'
-      - '74'
-      - '75'
-      - '76'
-      - '77'
-      - '78'
-      - '79'
-      - '80'
-      - '81'
-      - '82'
-      - '83'
-      - '84'
-      - '85'
-      - '86'
-      - '87'
-      - '88'
-      - '89'
-      - '90'
-      - '91'
-      - '92'
-      - '93'
-      - '94'
-      - '95'
-      - '96'
-      - '97'
-      - '98'
-      - '99'
-      - '100'
-      - '101'
-      - '102'
-      - '103'
-      - '104'
-      cdr1_end: 110
-      cdr1_start: 76
-      cdr2_end: 160
-      cdr2_start: 151
-      cdr3_start: 295
-      delineation_scheme: IMGT
-      fwr1_end: 75
-      fwr1_start: 1
-      fwr2_end: 150
-      fwr2_start: 111
-      fwr3_end: 294
-      fwr3_start: 161
-      sequence_delineation_id: '1'
-      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    v_rs_end: null
-    v_rs_start: null
-  - acknowledgements: []
-    aliases:
-    - watson_et_al:CAST_EiJ_IGHV8-2
-    allele_description_id: OGRDB:A00314
-    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
-    allele_designation: null
-    chromosome: null
-    coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
-    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2'
-    curational_tags: null
-    functional: true
-    gene_designation: null
-    gene_end: null
-    gene_start: null
-    inference_type: rearranged_only
-    lab_address: Birkbeck College, University of London, Malet Street, London
-    label: IGHV-2ETO
-    leader_1_end: null
-    leader_1_start: null
-    leader_2_end: null
-    leader_2_start: null
     locus: IGH
-    maintainer: William Lees
-    paralogs: []
-    rearranged_support: []
-    release_date: 24-Nov-2021
-    release_description: First release
-    release_version: 1
-    sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
-    sequence_type: V
-    species:
-      id: NCBITAXON:10090
-      label: Mus musculus
-    species_subgroup: CAST_EiJ
-    species_subgroup_type: strain
-    status: active
-    subgroup_designation: null
-    unrearranged_support: []
-    utr_5_prime_end: null
-    utr_5_prime_start: null
-    v_gene_delineations:
-    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-      alignment:
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      - '6'
-      - '7'
-      - '8'
-      - '9'
-      - '10'
-      - '11'
-      - '12'
-      - '13'
-      - '14'
-      - '15'
-      - '16'
-      - '17'
-      - '18'
-      - '19'
-      - '20'
-      - '21'
-      - '22'
-      - '23'
-      - '24'
-      - '25'
-      - '26'
-      - '27'
-      - '28'
-      - '29'
-      - '30'
-      - '31'
-      - '32'
-      - '33'
-      - '34'
-      - '35'
-      - '36'
-      - '37'
-      - '38'
-      - '39'
-      - '40'
-      - '41'
-      - '42'
-      - '43'
-      - '44'
-      - '45'
-      - '46'
-      - '47'
-      - '48'
-      - '49'
-      - '50'
-      - '51'
-      - '52'
-      - '53'
-      - '54'
-      - '55'
-      - '56'
-      - '57'
-      - '58'
-      - '59'
-      - '60'
-      - '61'
-      - '62'
-      - '63'
-      - '64'
-      - '65'
-      - '66'
-      - '67'
-      - '68'
-      - '69'
-      - '70'
-      - '71'
-      - '72'
-      - '73'
-      - '74'
-      - '75'
-      - '76'
-      - '77'
-      - '78'
-      - '79'
-      - '80'
-      - '81'
-      - '82'
-      - '83'
-      - '84'
-      - '85'
-      - '86'
-      - '87'
-      - '88'
-      - '89'
-      - '90'
-      - '91'
-      - '92'
-      - '93'
-      - '94'
-      - '95'
-      - '96'
-      - '97'
-      - '98'
-      - '99'
-      - '100'
-      - '101'
-      - '102'
-      - '103'
-      - '104'
-      cdr1_end: 110
-      cdr1_start: 76
-      cdr2_end: 160
-      cdr2_start: 151
-      cdr3_start: 295
-      delineation_scheme: IMGT
-      fwr1_end: 75
-      fwr1_start: 1
-      fwr2_end: 150
-      fwr2_start: 111
-      fwr3_end: 294
-      fwr3_start: 161
-      sequence_delineation_id: '1'
-      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    v_rs_end: null
-    v_rs_start: null
-  author: William Lees
-  curation: null
-  germline_set_id: OGRDB:G00007
-  germline_set_name: CAST IGH
-  germline_set_ref: OGRDB:G00007.1
-  lab_address: Birkbeck College, University of London, Malet Street, London
-  lab_name: ''
-  locus: IGH
-  pub_ids: ['']
-  release_date: '2021-11-24'
-  release_description: ''
-  release_version: 1
-  species:
-    id: NCBITAXON:10090
-    label: Mus musculus
-  species_subgroup: CAST_EiJ
-  species_subgroup_type: strain
-
+    allele_descriptions:
+      - allele_description_id: OGRDB:A00301
+        allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
+        release_version: 1
+        release_date: "2021-11-24"
+        release_description: First release
+        label: IGHV-2DBF
+        sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+        coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+        aliases:
+          - watson_et_al:CAST_EiJ_IGHV5-3
+        locus: IGH
+        chromosome:
+        sequence_type: V
+        functional: true
+        inference_type: rearranged_only
+        species:
+          id: NCBITAXON:10090
+          label: Mus musculus
+        species_subgroup: CAST_EiJ
+        species_subgroup_type: strain
+        status: active
+        gene_designation:
+        subgroup_designation:
+        allele_designation:
+        gene_start:
+        gene_end:
+        utr_5_prime_start:
+        utr_5_prime_end:
+        leader_1_start:
+        leader_1_end:
+        leader_2_start:
+        leader_2_end:
+        v_rs_start:
+        v_rs_end:
+        v_gene_delineations:
+          - sequence_delineation_id: '1'
+            delineation_scheme: IMGT
+            unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+            aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+            fwr1_start: 1
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
+              - '1'
+              - '2'
+              - '3'
+              - '4'
+              - '5'
+              - '6'
+              - '7'
+              - '8'
+              - '9'
+              - '10'
+              - '11'
+              - '12'
+              - '13'
+              - '14'
+              - '15'
+              - '16'
+              - '17'
+              - '18'
+              - '19'
+              - '20'
+              - '21'
+              - '22'
+              - '23'
+              - '24'
+              - '25'
+              - '26'
+              - '27'
+              - '28'
+              - '29'
+              - '30'
+              - '31'
+              - '32'
+              - '33'
+              - '34'
+              - '35'
+              - '36'
+              - '37'
+              - '38'
+              - '39'
+              - '40'
+              - '41'
+              - '42'
+              - '43'
+              - '44'
+              - '45'
+              - '46'
+              - '47'
+              - '48'
+              - '49'
+              - '50'
+              - '51'
+              - '52'
+              - '53'
+              - '54'
+              - '55'
+              - '56'
+              - '57'
+              - '58'
+              - '59'
+              - '60'
+              - '61'
+              - '62'
+              - '63'
+              - '64'
+              - '65'
+              - '66'
+              - '67'
+              - '68'
+              - '69'
+              - '70'
+              - '71'
+              - '72'
+              - '73'
+              - '74'
+              - '75'
+              - '76'
+              - '77'
+              - '78'
+              - '79'
+              - '80'
+              - '81'
+              - '82'
+              - '83'
+              - '84'
+              - '85'
+              - '86'
+              - '87'
+              - '88'
+              - '89'
+              - '90'
+              - '91'
+              - '92'
+              - '93'
+              - '94'
+              - '95'
+              - '96'
+              - '97'
+              - '98'
+              - '99'
+              - '100'
+              - '101'
+              - '102'
+              - '103'
+              - '104'
+        unrearranged_support: []
+        rearranged_support: []
+        paralogs: []
+        curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3'
+        curational_tags:
+      - allele_description_id: OGRDB:A00314
+        allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
+        release_version: 1
+        release_date: "2021-11-24"
+        release_description: First release
+        label: IGHV-2ETO
+        sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+        coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+        aliases:
+          - watson_et_al:CAST_EiJ_IGHV8-2
+        locus: IGH
+        chromosome:
+        sequence_type: V
+        functional: true
+        inference_type: rearranged_only
+        species:
+          id: NCBITAXON:10090
+          label: Mus musculus
+        species_subgroup: CAST_EiJ
+        species_subgroup_type: strain
+        status: active
+        gene_designation:
+        subgroup_designation:
+        allele_designation:
+        gene_start:
+        gene_end:
+        utr_5_prime_start:
+        utr_5_prime_end:
+        leader_1_start:
+        leader_1_end:
+        leader_2_start:
+        leader_2_end:
+        v_rs_start:
+        v_rs_end:
+        v_gene_delineations:
+          - sequence_delineation_id: '1'
+            delineation_scheme: IMGT
+            unaligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+            aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+            fwr1_start: 1
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
+              - '1'
+              - '2'
+              - '3'
+              - '4'
+              - '5'
+              - '6'
+              - '7'
+              - '8'
+              - '9'
+              - '10'
+              - '11'
+              - '12'
+              - '13'
+              - '14'
+              - '15'
+              - '16'
+              - '17'
+              - '18'
+              - '19'
+              - '20'
+              - '21'
+              - '22'
+              - '23'
+              - '24'
+              - '25'
+              - '26'
+              - '27'
+              - '28'
+              - '29'
+              - '30'
+              - '31'
+              - '32'
+              - '33'
+              - '34'
+              - '35'
+              - '36'
+              - '37'
+              - '38'
+              - '39'
+              - '40'
+              - '41'
+              - '42'
+              - '43'
+              - '44'
+              - '45'
+              - '46'
+              - '47'
+              - '48'
+              - '49'
+              - '50'
+              - '51'
+              - '52'
+              - '53'
+              - '54'
+              - '55'
+              - '56'
+              - '57'
+              - '58'
+              - '59'
+              - '60'
+              - '61'
+              - '62'
+              - '63'
+              - '64'
+              - '65'
+              - '66'
+              - '67'
+              - '68'
+              - '69'
+              - '70'
+              - '71'
+              - '72'
+              - '73'
+              - '74'
+              - '75'
+              - '76'
+              - '77'
+              - '78'
+              - '79'
+              - '80'
+              - '81'
+              - '82'
+              - '83'
+              - '84'
+              - '85'
+              - '86'
+              - '87'
+              - '88'
+              - '89'
+              - '90'
+              - '91'
+              - '92'
+              - '93'
+              - '94'
+              - '95'
+              - '96'
+              - '97'
+              - '98'
+              - '99'
+              - '100'
+              - '101'
+              - '102'
+              - '103'
+              - '104'
+        unrearranged_support: []
+        rearranged_support: []
+        paralogs: []
+        curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2'
+        curational_tags:
+    curation:
 
 GenotypeSet:
-  - receptor_genotype_set_id: '1'
+  - receptor_genotype_set_id: "1"
     genotype_class_list:
-      - receptor_genotype_id: '1'
+      - receptor_genotype_id: "1"
         locus: IGH
         documented_alleles:
           - label: IGHV1-69*01
diff --git a/lang/python/tests/data/good_genotype_set.json b/lang/python/tests/data/good_genotype_set.json
index ba10f56e9..abd24646c 100644
--- a/lang/python/tests/data/good_genotype_set.json
+++ b/lang/python/tests/data/good_genotype_set.json
@@ -35,4 +35,4 @@
             }
         ]
     }]
-}
\ No newline at end of file
+}
diff --git a/lang/python/tests/data/good_germline_set.json b/lang/python/tests/data/good_germline_set.json
index 41ecf5f7d..e74c590dc 100644
--- a/lang/python/tests/data/good_germline_set.json
+++ b/lang/python/tests/data/good_germline_set.json
@@ -1,17 +1,41 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -19,11 +43,33 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -36,7 +82,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -70,7 +119,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -187,11 +236,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -204,7 +275,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -238,7 +312,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/lang/python/tests/data/good_repertoire.yaml b/lang/python/tests/data/good_repertoire.yaml
index 9bf3a4653..6adaa2361 100644
--- a/lang/python/tests/data/good_repertoire.yaml
+++ b/lang/python/tests/data/good_repertoire.yaml
@@ -11,28 +11,50 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
-      keywords_study: 
+      keywords_study:
         - "contains_ig"
         - "contains_tr"
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -63,15 +85,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000788"
+          id: "CL:0000788"
           label: "naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -144,13 +166,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -159,13 +203,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -196,15 +240,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000787"
+          id: "CL:0000787"
           label: "memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -277,13 +321,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -292,13 +358,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -329,15 +395,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000895"
+          id: "CL:0000895"
           label: "naive thymus-derived CD4-positive, alpha-beta T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
diff --git a/specs/airr-schema-openapi3.yaml b/specs/airr-schema-openapi3.yaml
index 1ae5ad012..d6c6d48e2 100644
--- a/specs/airr-schema-openapi3.yaml
+++ b/specs/airr-schema-openapi3.yaml
@@ -1667,10 +1667,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:
diff --git a/specs/airr-schema.yaml b/specs/airr-schema.yaml
index 87a25b5bd..dd2c0c241 100644
--- a/specs/airr-schema.yaml
+++ b/specs/airr-schema.yaml
@@ -1571,10 +1571,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:

From 6a137b5b225fa47947090d2e37f678f78c488a88 Mon Sep 17 00:00:00 2001
From: Christian Busse <christian.busse@dkfz-heidelberg.de>
Date: Thu, 22 Feb 2024 04:43:50 +0100
Subject: [PATCH 13/15] Add routine to validate class of an ontology object

---
 lang/R/R/Interface.R | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/lang/R/R/Interface.R b/lang/R/R/Interface.R
index 2b88e3801..ed7756f3d 100644
--- a/lang/R/R/Interface.R
+++ b/lang/R/R/Interface.R
@@ -645,7 +645,7 @@ validate_airr <- function(data, model=TRUE, each=FALSE) {
 validate_entry <- function(entry, schema) {
     schema_name <- schema@definition
     valid <- TRUE
-    
+
     # Check all required fields exist
     missing_fields <- setdiff(schema@required, names(entry))
     
@@ -664,8 +664,15 @@ validate_entry <- function(entry, schema) {
         # in this case the type on the 1st level is NULL
         if (is.na(schema[f][["type"]]) || is.null(schema[f][["type"]])) {
             if (!is.null(reference_schemes)) {
-                v <- validate_entry(entry[[f]], schema=reference_schemes)
-                if (!v) { valid <- FALSE }
+                # An Ontology reference must be a list; check before recursing (inherits() is safe for multi-class objects).
+                if (reference_schemes@definition == "Ontology" && !inherits(entry[[f]], "list")) {
+                    valid <- FALSE
+                    warning(paste("Warning: Property", paste(schema_name, ".", f, sep=""),
+                                "should be an ontology but is of class", paste(class(entry[[f]]), collapse="/"), "\n"))
+                } else {
+                    v <- validate_entry(entry[[f]], schema=reference_schemes)
+                    if (!v) { valid <- FALSE }
+                }
             }
         # entry of array type with a list of on or several reference schemes
         } else if (schema[f][["type"]] == "array" & !is.null(reference_schemes)) {

From ca4a564fef59853f39f2a7b5504bf7fa4ea35fe3 Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Mon, 26 Feb 2024 16:13:12 -0600
Subject: [PATCH 14/15] update openapi3 spec in lang directories

---
 lang/R/inst/extdata/airr-schema-openapi3.yaml | 253 +++++++++++-------
 .../airr/specs/airr-schema-openapi3.yaml      | 253 +++++++++++-------
 2 files changed, 318 insertions(+), 188 deletions(-)

diff --git a/lang/R/inst/extdata/airr-schema-openapi3.yaml b/lang/R/inst/extdata/airr-schema-openapi3.yaml
index bba3a45d8..d6c6d48e2 100644
--- a/lang/R/inst/extdata/airr-schema-openapi3.yaml
+++ b/lang/R/inst/extdata/airr-schema-openapi3.yaml
@@ -368,7 +368,7 @@ DataFile:
         RepertoireGroup:
             type: array
             nullable: false
-            description: List of repertoire collections
+            description: List of repertoire groups
             items:
                 $ref: '#/RepertoireGroup'
         Rearrangement:
@@ -486,34 +486,111 @@ TimePoint:
 # General objects
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe invididuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            nullable: true
+            description: Unique identifier of this contributor within the file
             x-airr:
                 identifier: true
                 miairr: important
-            nullable: true
         name:
             type: string
+            nullable: false
+            description: Full name of contributor
+        orcid_id:
+            $ref: '#/Ontology'
             nullable: true
-            description: Full name of individual
-        institution_name:
-            type: string
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
             nullable: true
-            description: Individual's department and institution name
-        orcid_id:
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                from individuals institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
             type: string
             nullable: true
-            description: Individual's ORCID identifier
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+        contributions:
+            type: array
+            nullable: true
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
+            type: string
+            nullable: false
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+        degree:
+            type: string
+            nullable: true
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+
 
 #
 # Germline gene schema
@@ -809,8 +886,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -838,24 +914,16 @@ AlleleDescription:
                 miairr: important
             description: Unique reference to the allele description, in standardized form (Repo:Label:Version)
             example: OGRDB:Human_IGH:IGHV1-69*01.001
-        maintainer:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Maintainer of this sequence record
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
-                $ref: '#/Acknowledgement'
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Institution and full address of corresponding author
+                $ref: '#/Contributor'
         release_version:
             type: integer
             nullable: true
@@ -1148,9 +1216,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1169,30 +1235,16 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Corresponding author
-        lab_name:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Department of corresponding author
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Institutional address of corresponding author
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The coresponding author should be listed last.
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             nullable: true
@@ -1615,10 +1667,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:
@@ -1702,17 +1751,36 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
+        contributors:
+            type: array
+            nullable: false
+            description: >
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The coresponding author should be listed last.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contributors
         study_contact:
             type: string
             nullable: true
             description: >
                 Full contact information of the contact persons for this study This should include an e-mail address
                 and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                adc-query-support: true
-                name: Contact information (study)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         collected_by:
             type: string
             nullable: true
@@ -1720,38 +1788,35 @@ Study:
                 Full contact information of the data collector, i.e. the person who is legally responsible for data
                 collection and release. This should include an e-mail address and a persistent identifier such as an
                 ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_name:
             type: string
             nullable: true
             description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_address:
             type: string
             nullable: true
             description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         submitted_by:
             type: string
             nullable: true
@@ -1759,14 +1824,13 @@ Study:
                 Full contact information of the data depositor, i.e., the person submitting the data to a repository.
                 This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
                 supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data deposition)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information was re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:
@@ -3298,7 +3362,8 @@ Repertoire:
             x-airr:
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3308,22 +3373,22 @@ RepertoireGroup:
         repertoire_group_id:
             type: string
             nullable: true
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
             nullable: true
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
             nullable: true
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             nullable: true
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:
diff --git a/lang/python/airr/specs/airr-schema-openapi3.yaml b/lang/python/airr/specs/airr-schema-openapi3.yaml
index bba3a45d8..d6c6d48e2 100644
--- a/lang/python/airr/specs/airr-schema-openapi3.yaml
+++ b/lang/python/airr/specs/airr-schema-openapi3.yaml
@@ -368,7 +368,7 @@ DataFile:
         RepertoireGroup:
             type: array
             nullable: false
-            description: List of repertoire collections
+            description: List of repertoire groups
             items:
                 $ref: '#/RepertoireGroup'
         Rearrangement:
@@ -486,34 +486,111 @@ TimePoint:
 # General objects
 #
 
-# An individual
-Acknowledgement:
+# Contributor record to describe invididuals and their contribution to a data set
+#
+Contributor:
     description: Individual whose contribution to this work should be acknowledged
     type: object
     required:
-        - acknowledgement_id
+        - contributor_id
         - name
-        - institution_name
     properties:
-        acknowledgement_id:
+        contributor_id:
             type: string
-            description: unique identifier of this Acknowledgement within the file
+            nullable: true
+            description: Unique identifier of this contributor within the file
             x-airr:
                 identifier: true
                 miairr: important
-            nullable: true
         name:
             type: string
+            nullable: false
+            description: Full name of contributor
+        orcid_id:
+            $ref: '#/Ontology'
             nullable: true
-            description: Full name of individual
-        institution_name:
-            type: string
+            description: >
+                ORCID identifier of the contributor. Note that if present, the label of the ORCID record should take
+                precedence over the name reported in the `name` property.
+            title: ORCID iD
+            example:
+                id: ORCID:0000-0002-1825-0097
+                label: Josiah Carberry
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation:
+            $ref: '#/Ontology'
             nullable: true
-            description: Individual's department and institution name
-        orcid_id:
+            description: >
+                ROR of the contributor's primary affiliation. Note that ROR are only minted for institutions, not
+                from individuals institutes, divisions or departments.
+            title: ROR
+            example:
+                id: ROR:05h7xva58
+                label: Wesleyan University
+            x-airr:
+                adc-query-support: true
+                format: ontology
+                ontology:
+                    draft: false
+                    top_node:
+                        id: null
+                        label: null
+        affiliation_department:
             type: string
             nullable: true
-            description: Individual's ORCID identifier
+            description: >
+                Additional information regarding the contributor's primary affiliation. Can be used to specify
+                individual institutes, divisions or departments.
+            example: Department for Psychoceramics
+        contributions:
+            type: array
+            nullable: true
+            description: List of all roles the contributor had in a project
+            items:
+                $ref: '#/ContributorContribution'
+
+ContributorContribution:
+    type: object
+    required:
+        - role
+    properties:
+        role:
+            type: string
+            nullable: false
+            description: Role according to CRediT taxonomy
+            enum:
+                - conceptualization
+                - data curation
+                - formal analysis
+                - funding acquisition
+                - investigation
+                - methodology
+                - project administration
+                - resources
+                - software
+                - supervision
+                - validation
+                - visualization
+                - writing - original draft
+                - writing - review & editing
+        degree:
+            type: string
+            nullable: true
+            description: >
+                Optional specification of the degree of contribution, should be used if multiple individuals serve
+                the same role.
+            enum:
+                - lead
+                - equal
+                - supporting
+
 
 #
 # Germline gene schema
@@ -809,8 +886,7 @@ AlleleDescription:
     type: object
     required:
         - allele_description_id
-        - maintainer
-        - lab_address
+        - acknowledgements
         - release_version
         - release_date
         - release_description
@@ -838,24 +914,16 @@ AlleleDescription:
                 miairr: important
             description: Unique reference to the allele description, in standardized form (Repo:Label:Version)
             example: OGRDB:Human_IGH:IGHV1-69*01.001
-        maintainer:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Maintainer of this sequence record
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the gene description should be acknowledged
+            description: >
+                List of individuals whose contribution to the gene description should be acknowledged. Note that these
+                are not necessarily identical with the authors on an associated manuscript or other scholarly
+                communication. Further note that typically at least the three CRediT contributor roles "supervision",
+                "investigation" and "data curation" should be assigned. The current maintainer should be listed first.
             items:
-                $ref: '#/Acknowledgement'
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: defined
-            description: Institution and full address of corresponding author
+                $ref: '#/Contributor'
         release_version:
             type: integer
             nullable: true
@@ -1148,9 +1216,7 @@ GermlineSet:
         All genes in a GermlineSet should be from a single locus.
     required:
         - germline_set_id
-        - author
-        - lab_name
-        - lab_address
+        - acknowledgements
         - release_version
         - release_description
         - release_date
@@ -1169,30 +1235,16 @@ GermlineSet:
             x-airr:
                 identifier: true
                 miairr: important
-        author:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Corresponding author
-        lab_name:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Department of corresponding author
-        lab_address:
-            type: string
-            nullable: true
-            x-airr:
-                miairr: important
-            description: Institutional address of corresponding author
         acknowledgements:
             type: array
             nullable: true
-            description: List of individuals whose contribution to the germline set should be acknowledged
+            description: >
+                List of individuals whose contribution to the germline set should be acknowledged. Note that these are
+                not necessarily identical with the authors on an associated manuscript or other scholarly communication.
+                Further note that typically at least the three CRediT contributor roles "supervision", "investigation"
+                and "data curation" should be assigned. The coresponding author should be listed last.
             items:
-                $ref: '#/Acknowledgement'
+                $ref: '#/Contributor'
         release_version:
             type: number
             nullable: true
@@ -1615,10 +1667,7 @@ Study:
         - study_type
         - inclusion_exclusion_criteria
         - grants
-        - collected_by
-        - lab_name
-        - lab_address
-        - submitted_by
+        - contributors
         - pub_ids
         - keywords_study
     properties:
@@ -1702,17 +1751,36 @@ Study:
                 set: 1
                 subset: study
                 name: Grant funding agency
+        contributors:
+            type: array
+            nullable: false
+            description: >
+                List of individuals who contributed to the study. Note that these are not necessarily identical with
+                the authors on an associated manuscript or other scholarly communication. Further note that typically
+                at least the three CRediT contributor roles "supervision", "investigation" and "data curation" should
+                be assigned. The corresponding author should be listed last.
+            title: Contributors
+            items:
+                $ref: '#/Contributor'
+            x-airr:
+                miairr: essential
+                adc-query-support: true
+                set: 1
+                subset: study
+                name: Contributors
         study_contact:
             type: string
             nullable: true
             description: >
                 Full contact information of the contact persons for this study This should include an e-mail address
                 and a persistent identifier such as an ORCID ID.
-            title: Contact information (study)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                adc-query-support: true
-                name: Contact information (study)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         collected_by:
             type: string
             nullable: true
@@ -1720,38 +1788,35 @@ Study:
                 Full contact information of the data collector, i.e. the person who is legally responsible for data
                 collection and release. This should include an e-mail address and a persistent identifier such as an
                 ORCID ID.
-            title: Contact information (data collection)
-            example: Dr. P. Stibbons, p.stibbons@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data collection)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_name:
             type: string
             nullable: true
             description: Department of data collector
-            title: Lab name
-            example: Department for Planar Immunology
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab name
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         lab_address:
             type: string
             nullable: true
             description: Institution and institutional address of data collector
-            title: Lab address
-            example: School of Medicine, Unseen University, Ankh-Morpork, Disk World
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Lab address
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         submitted_by:
             type: string
             nullable: true
@@ -1759,14 +1824,13 @@ Study:
                 Full contact information of the data depositor, i.e., the person submitting the data to a repository.
                 This should include an e-mail address and a persistent identifier such as an ORCID ID. This is
                 supposed to be a short-lived and technical role until the submission is relased.
-            title: Contact information (data deposition)
-            example: Adrian Turnipseed, a.turnipseed@unseenu.edu, https://orcid.org/0000-0002-1825-0097
             x-airr:
-                miairr: important
-                adc-query-support: true
-                set: 1
-                subset: study
-                name: Contact information (data deposition)
+                deprecated: true
+                deprecated-description: >
+                    Acknowledgements and contact information were re-organized into the contributors property, which
+                    is an array of Contributor objects.
+                deprecated-replaced-by:
+                    - contributors
         pub_ids:
             type: array
             items:
@@ -3298,7 +3362,8 @@ Repertoire:
             x-airr:
                 adc-query-support: true
 
-# A collection of repertoires for analysis purposes, includes optional time course
+# An ordered group of repertoires for analysis purposes, includes optional time course
+# Can be treated as a set if all repertoire_group_id are unique
 RepertoireGroup:
     type: object
     required:
@@ -3308,22 +3373,22 @@ RepertoireGroup:
         repertoire_group_id:
             type: string
             nullable: true
-            description: Identifier for this repertoire collection
+            description: Identifier for this repertoire group
             x-airr:
                 identifier: true
         repertoire_group_name:
             type: string
             nullable: true
-            description: Short display name for this repertoire collection
+            description: Short display name for this repertoire group
         repertoire_group_description:
             type: string
             nullable: true
-            description: Repertoire collection description
+            description: Repertoire group description
         repertoires:
             type: array
             nullable: true
             description: >
-                List of repertoires in this collection with an associated description and time point designation
+                List of repertoires in this group with an associated description and time point designation
             items:
                 type: object
                 properties:

From caff1123914a0e68d8c4a179e85c3434105e8c2b Mon Sep 17 00:00:00 2001
From: Scott Christley <scott.christley@utsouthwestern.edu>
Date: Mon, 26 Feb 2024 16:17:37 -0600
Subject: [PATCH 15/15] update tests

---
 Makefile                           |  13 +-
 tests/data/bad_genotype_set.json   |   2 +-
 tests/data/bad_germline_set.json   |  94 +++-
 tests/data/bad_repertoire.yaml     |  90 ++-
 tests/data/good_combined_airr.json | 349 +++++++++---
 tests/data/good_combined_airr.yaml | 856 ++++++++++++++++-------------
 tests/data/good_genotype_set.json  |   2 +-
 tests/data/good_germline_set.json  | 108 +++-
 tests/data/good_repertoire.yaml    | 128 +++--
 9 files changed, 1099 insertions(+), 543 deletions(-)

diff --git a/Makefile b/Makefile
index 151bf2dd4..a18207a95 100644
--- a/Makefile
+++ b/Makefile
@@ -7,8 +7,8 @@ help:
 	@echo "Helper commands for AIRR Standards repository"
 	@echo ""
 	@echo "make gen-v2       -- Generate OpenAPI V2 spec from the V3 spec"
-	@echo "make docs         -- Build documentation"
-	@echo "make lang-copy    -- Copy spec files to language directories"
+	@echo "make build-docs   -- Build documentation"
+	@echo "make spec-copy    -- Copy spec files to language directories"
 	@echo "make data-copy    -- Copy test data files to language directories"
 	@echo "make checks       -- Run consistency checks on spec files"
 	@echo "make tests        -- Run all language test suites"
@@ -20,7 +20,10 @@ help:
 gen-v2:
 	@echo "Not implemented"
 
-lang-copy:
+build-docs:
+	sphinx-build -a -E -b html docs docs/_build/html
+
+spec-copy:
 	@echo "Copying specs to language directories"
 	cp specs/airr-schema.yaml lang/python/airr/specs
 	cp specs/airr-schema-openapi3.yaml lang/python/airr/specs
@@ -30,7 +33,9 @@ lang-copy:
 #	cp specs/airr-schema-openapi3.yaml lang/js/
 
 data-copy:
-	@echo "Not implemented"
+	@echo "Copying test data to language directories"
+	cp tests/data/* lang/python/tests/data
+	cp tests/data/* lang/R/tests/data-tests
 
 checks:
 	@echo "Running consistency checks on spec files"
diff --git a/tests/data/bad_genotype_set.json b/tests/data/bad_genotype_set.json
index c58a39027..01709d60a 100644
--- a/tests/data/bad_genotype_set.json
+++ b/tests/data/bad_genotype_set.json
@@ -41,4 +41,4 @@
             }
         ]
     }]
-}
\ No newline at end of file
+}
diff --git a/tests/data/bad_germline_set.json b/tests/data/bad_germline_set.json
index 168cc1fa5..28531aabb 100644
--- a/tests/data/bad_germline_set.json
+++ b/tests/data/bad_germline_set.json
@@ -1,27 +1,71 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": ["Mouse"],
+        "species": "Mouse",
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
         "allele_descriptions": [
             {
                 "allele_description_id": "OGRDB:A00301",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -66,7 +110,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -182,11 +226,33 @@
             },
             {
                 "allele_description_id": "OGRDB:A00314",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -231,7 +297,7 @@
                         "fwr3_start": 196,
                         "fwr3_end": 312,
                         "cdr3_start": 313,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/tests/data/bad_repertoire.yaml b/tests/data/bad_repertoire.yaml
index 2de377cb3..f35355e98 100644
--- a/tests/data/bad_repertoire.yaml
+++ b/tests/data/bad_repertoire.yaml
@@ -8,21 +8,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -32,7 +50,7 @@ Repertoire:
         cell_subset: "Naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -56,21 +74,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -80,7 +116,7 @@ Repertoire:
         cell_subset: "Memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -104,21 +140,39 @@ Repertoire:
       study_id: PRJNA300878
       study_title: "Homo sapiens B and T cell repertoire - MZ twins"
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+            - role: "data curation"
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
       pub_ids: ["PMID:27005435"]
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         value: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         value: year
       linked_subjects: TW01B
       link_type: twin
@@ -128,7 +182,7 @@ Repertoire:
         cell_subset: "Naive CD4+ T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           value: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
diff --git a/tests/data/good_combined_airr.json b/tests/data/good_combined_airr.json
index 9101b24a9..0ef2106ae 100644
--- a/tests/data/good_combined_airr.json
+++ b/tests/data/good_combined_airr.json
@@ -10,13 +10,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -27,25 +66,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -104,10 +143,10 @@
                         ]
                     },
                     "mhc_genotype_set": {
-                        "mhc_genotype_set_id": "this is a unique identifier",
+                        "mhc_genotype_set_id": "01847298-d0c2-11ee-bc66",
                         "mhc_genotype_list": [
                             {
-                                "mhc_genotype_id": "unique",
+                                "mhc_genotype_id": "00be1c2e-d0c2-11ee-bfe7",
                                 "mhc_class": "MHC-I",
                                 "mhc_genotyping_method": "pcr_low_resolution",
                                 "mhc_alleles": [
@@ -117,7 +156,7 @@
                                             "id": "MRO-0000046",
                                             "label": "HLA-A"
                                         },
-                                        "reference_set_ref": "blah"
+                                        "reference_set_ref": null
                                     }
                                 ]
                             }
@@ -131,17 +170,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000788",
+                        "id": "CL:0000788",
                         "label": "naive B cell"
                     },
                     "cell_phenotype": "expression of CD20 and the absence of CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -173,7 +212,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -227,13 +266,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -244,25 +322,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -290,17 +368,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000787",
+                        "id": "CL:0000787",
                         "label": "memory B cell"
                     },
                     "cell_phenotype": "expression of CD20 and CD27",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -332,7 +410,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -386,13 +464,52 @@
                     "label": null
                 },
                 "study_description": "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level.",
-                "study_contact": "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X",
                 "inclusion_exclusion_criteria": null,
-                "lab_name": "Mark M. Davis",
-                "lab_address": "Stanford University",
-                "submitted_by": "Florian Rubelt",
+                "contributors": [
+                    {
+                        "contributor_id": "1",
+                        "name": "Florian Rubelt",
+                        "orcid_id": {
+                            "id": null,
+                            "label": null
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    },
+                    {
+                        "contributor_id": "2",
+                        "name": "Mark M. Davis",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-6868-657X",
+                            "label": "Mark Davis"
+                        },
+                        "affiliation": {
+                            "id": "ROR:00f54p054",
+                            "label": "Stanford University"
+                         },
+                        "affiliation_department": "Department of Microbiology and Immunology, Stanford University School of Medicine",
+                        "contributions": [
+                            {
+                                "role": "supervision",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "pub_ids": ["PMID:27005435"],
-                "collected_by": null,
                 "grants": null,
                 "keywords_study": [
                     "contains_ig",
@@ -403,25 +520,25 @@
                 "subject_id": "TW01A",
                 "synthetic": false,
                 "species": {
-                    "id": "NCBITaxon_9606",
+                    "id": "NCBITAXON:9606",
                     "label": "Homo sapiens"
                 },
                 "sex": "female",
                 "age_min": 27,
                 "age_max": 27,
                 "age_unit": {
-                    "id": "UO_0000036",
+                    "id": "UO:0000036",
                     "label": "year"
                 },
                 "age_event": null,
                 "ancestry_population": {
-				    "id": null,
-					"label": null
-				},
-				"location_birth": {
-				    "id": null,
-					"label": null
-				},
+                    "id": null,
+                    "label": null
+                },
+                "location_birth": {
+                    "id": null,
+                    "label": null
+                },
                 "ethnicity": null,
                 "race": null,
                 "strain_name": null,
@@ -449,17 +566,17 @@
                     "sample_processing_id": null,
                     "sample_type": "peripheral venous puncture",
                     "tissue": {
-                        "id": "UBERON_0000178",
+                        "id": "UBERON:0000178",
                         "label": "blood"
                     },
                     "tissue_processing": "Ficoll gradient",
                     "cell_subset": {
-                        "id": "CL_0000895",
+                        "id": "CL:0000895",
                         "label": "naive thymus-derived CD4-positive, alpha-beta T cell"
                     },
                     "cell_phenotype": "expression of CD8 and absence of CD4 and CD45RO",
                     "cell_species": {
-                        "id": "NCBITaxon_9606",
+                        "id": "NCBITAXON:9606",
                         "label": "Homo sapiens"
                     },
                     "single_cell": false,
@@ -491,7 +608,7 @@
                         "label": null
                     },
                     "collection_time_point_reference": null,
-					"collection_location": {
+                    "collection_location": {
                         "id": null,
                         "label": null
                     },
@@ -540,17 +657,41 @@
 
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "3",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department": null,
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -558,15 +699,37 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
-                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                "coding_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                 "aliases": [
                     "watson_et_al:CAST_EiJ_IGHV5-3"
                 ],
@@ -575,7 +738,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -596,8 +762,8 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
+                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                         "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
                         "fwr1_start": 1,
                         "fwr1_end": 75,
                         "cdr1_start": 76,
@@ -609,7 +775,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -726,11 +892,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements": [
+                    {
+                        "contributor_id": "3",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department": null,
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -743,7 +931,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -764,8 +955,8 @@
                     {
                         "sequence_delineation_id": "1",
                         "delineation_scheme": "IMGT",
-                        "aligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
-                        "unaligned_sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
+                        "unaligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
+                        "aligned_sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
                         "fwr1_start": 1,
                         "fwr1_end": 75,
                         "cdr1_start": 76,
@@ -777,7 +968,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/tests/data/good_combined_airr.yaml b/tests/data/good_combined_airr.yaml
index 80d0fe3a2..2c9ab547c 100644
--- a/tests/data/good_combined_airr.yaml
+++ b/tests/data/good_combined_airr.yaml
@@ -21,13 +21,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -36,13 +58,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -69,9 +91,9 @@ Repertoire:
           medical_history:
       genotype:
         receptor_genotype_set:
-          receptor_genotype_set_id: '1'
+          receptor_genotype_set_id: "1"
           genotype_class_list:
-            - receptor_genotype_id: '1'
+            - receptor_genotype_id: "1"
               locus: IGH
               documented_alleles:
                 - label: IGHV1-69*01
@@ -90,31 +112,31 @@ Repertoire:
                   phasing: 1
               inference_process: repertoire_sequencing
         mhc_genotype_set:
-          mhc_genotype_set_id: "this is a unique identifier"
+          mhc_genotype_set_id: 01847298-d0c2-11ee-bc66
           mhc_genotype_list:
-            - mhc_genotype_id: unique
+            - mhc_genotype_id: 00be1c2e-d0c2-11ee-bfe7
               mhc_class: MHC-I
               mhc_genotyping_method: pcr_low_resolution
               mhc_alleles:
                 - allele_designation: "01:01"
                   gene:
-                    id: "MRO-0000046"
-                    label: "HLA-A"
-                  reference_set_ref: blah
+                    id: MRO-0000046
+                    label: HLA-A
+                  reference_set_ref:
     sample:
       - sample_id: TW01A_B_naive
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000788
+          id: CL:0000788
           label: naive B cell
         cell_phenotype: expression of CD20 and the absence of CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -198,13 +220,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -213,13 +257,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -249,15 +293,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000787
+          id: CL:0000787
           label: memory B cell
         cell_phenotype: expression of CD20 and CD27
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -341,13 +385,35 @@ Repertoire:
         a single chromosome. These data refine our understanding of the heritable mechanisms
         affecting the repertoire, and show that biases are evident on a chromosome-wide
         level.
-      study_contact: Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X
       inclusion_exclusion_criteria:
-      lab_name: Mark M. Davis
-      lab_address: Stanford University
-      submitted_by: Florian Rubelt
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id:
+            label:
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree:
+            - role: "data curation"
+              degree:
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree:
       pub_ids: ["PMID:27005435"]
-      collected_by:
       grants:
       keywords_study:
         - contains_ig
@@ -356,13 +422,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: NCBITaxon_9606
+        id: NCBITAXON:9606
         label: Homo sapiens
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event:
       ancestry_population:
@@ -392,15 +458,15 @@ Repertoire:
         sample_processing_id:
         sample_type: peripheral venous puncture
         tissue:
-          id: UBERON_0000178
+          id: UBERON:0000178
           label: blood
         tissue_processing: Ficoll gradient
         cell_subset:
-          id: CL_0000895
+          id: CL:0000895
           label: naive thymus-derived CD4-positive, alpha-beta T cell
         cell_phenotype: expression of CD8 and absence of CD4 and CD45RO
         cell_species:
-          id: NCBITaxon_9606
+          id: NCBITAXON:9606
           label: Homo sapiens
         single_cell: false
         cell_isolation: FACS
@@ -464,357 +530,391 @@ Repertoire:
         analysis_provenance_id: 4625424004665971176-242ac11c-0001-012
 
 GermlineSet:
-- acknowledgements: []
-  allele_descriptions:
-  - acknowledgements: []
-    aliases:
-    - watson_et_al:CAST_EiJ_IGHV5-3
-    allele_description_id: OGRDB:A00301
-    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
-    allele_designation: null
-    chromosome: null
-    coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3'
-    curational_tags: null
-    functional: true
-    gene_designation: null
-    gene_end: null
-    gene_start: null
-    inference_type: rearranged_only
-    lab_address: Birkbeck College, University of London, Malet Street, London
-    label: IGHV-2DBF
-    leader_1_end: null
-    leader_1_start: null
-    leader_2_end: null
-    leader_2_start: null
-    locus: IGH
-    maintainer: William Lees
-    paralogs: []
-    rearranged_support: []
-    release_date: 24-Nov-2021
-    release_description: First release
+  - germline_set_id: OGRDB:G00007
+    acknowledgements:
+      - contributor_id: "3"
+        name: William Lees
+        orcid_id:
+          id: ORCID:0000-0001-9834-6840
+          label: William Lees
+        affiliation:
+          id: ROR:02mb95055
+          label: Birkbeck, University of London
+        affiliation_department:
+        contributions:
+          - role: investigation
+            degree: null
+          - role: data curation
+            degree: null
     release_version: 1
-    sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    sequence_type: V
+    release_description: ""
+    release_date: "2021-11-24"
+    germline_set_name: CAST IGH
+    germline_set_ref: OGRDB:G00007.1
+    pub_ids: [""]
     species:
       id: NCBITAXON:10090
       label: Mus musculus
     species_subgroup: CAST_EiJ
     species_subgroup_type: strain
-    status: active
-    subgroup_designation: null
-    unrearranged_support: []
-    utr_5_prime_end: null
-    utr_5_prime_start: null
-    v_gene_delineations:
-    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-      alignment:
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      - '6'
-      - '7'
-      - '8'
-      - '9'
-      - '10'
-      - '11'
-      - '12'
-      - '13'
-      - '14'
-      - '15'
-      - '16'
-      - '17'
-      - '18'
-      - '19'
-      - '20'
-      - '21'
-      - '22'
-      - '23'
-      - '24'
-      - '25'
-      - '26'
-      - '27'
-      - '28'
-      - '29'
-      - '30'
-      - '31'
-      - '32'
-      - '33'
-      - '34'
-      - '35'
-      - '36'
-      - '37'
-      - '38'
-      - '39'
-      - '40'
-      - '41'
-      - '42'
-      - '43'
-      - '44'
-      - '45'
-      - '46'
-      - '47'
-      - '48'
-      - '49'
-      - '50'
-      - '51'
-      - '52'
-      - '53'
-      - '54'
-      - '55'
-      - '56'
-      - '57'
-      - '58'
-      - '59'
-      - '60'
-      - '61'
-      - '62'
-      - '63'
-      - '64'
-      - '65'
-      - '66'
-      - '67'
-      - '68'
-      - '69'
-      - '70'
-      - '71'
-      - '72'
-      - '73'
-      - '74'
-      - '75'
-      - '76'
-      - '77'
-      - '78'
-      - '79'
-      - '80'
-      - '81'
-      - '82'
-      - '83'
-      - '84'
-      - '85'
-      - '86'
-      - '87'
-      - '88'
-      - '89'
-      - '90'
-      - '91'
-      - '92'
-      - '93'
-      - '94'
-      - '95'
-      - '96'
-      - '97'
-      - '98'
-      - '99'
-      - '100'
-      - '101'
-      - '102'
-      - '103'
-      - '104'
-      cdr1_end: 110
-      cdr1_start: 76
-      cdr2_end: 160
-      cdr2_start: 151
-      cdr3_start: 295
-      delineation_scheme: IMGT
-      fwr1_end: 75
-      fwr1_start: 1
-      fwr2_end: 150
-      fwr2_start: 111
-      fwr3_end: 294
-      fwr3_start: 161
-      sequence_delineation_id: '1'
-      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    v_rs_end: null
-    v_rs_start: null
-  - acknowledgements: []
-    aliases:
-    - watson_et_al:CAST_EiJ_IGHV8-2
-    allele_description_id: OGRDB:A00314
-    allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
-    allele_designation: null
-    chromosome: null
-    coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
-    curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2'
-    curational_tags: null
-    functional: true
-    gene_designation: null
-    gene_end: null
-    gene_start: null
-    inference_type: rearranged_only
-    lab_address: Birkbeck College, University of London, Malet Street, London
-    label: IGHV-2ETO
-    leader_1_end: null
-    leader_1_start: null
-    leader_2_end: null
-    leader_2_start: null
     locus: IGH
-    maintainer: William Lees
-    paralogs: []
-    rearranged_support: []
-    release_date: 24-Nov-2021
-    release_description: First release
-    release_version: 1
-    sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
-    sequence_type: V
-    species:
-      id: NCBITAXON:10090
-      label: Mus musculus
-    species_subgroup: CAST_EiJ
-    species_subgroup_type: strain
-    status: active
-    subgroup_designation: null
-    unrearranged_support: []
-    utr_5_prime_end: null
-    utr_5_prime_start: null
-    v_gene_delineations:
-    - aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-      alignment:
-      - '1'
-      - '2'
-      - '3'
-      - '4'
-      - '5'
-      - '6'
-      - '7'
-      - '8'
-      - '9'
-      - '10'
-      - '11'
-      - '12'
-      - '13'
-      - '14'
-      - '15'
-      - '16'
-      - '17'
-      - '18'
-      - '19'
-      - '20'
-      - '21'
-      - '22'
-      - '23'
-      - '24'
-      - '25'
-      - '26'
-      - '27'
-      - '28'
-      - '29'
-      - '30'
-      - '31'
-      - '32'
-      - '33'
-      - '34'
-      - '35'
-      - '36'
-      - '37'
-      - '38'
-      - '39'
-      - '40'
-      - '41'
-      - '42'
-      - '43'
-      - '44'
-      - '45'
-      - '46'
-      - '47'
-      - '48'
-      - '49'
-      - '50'
-      - '51'
-      - '52'
-      - '53'
-      - '54'
-      - '55'
-      - '56'
-      - '57'
-      - '58'
-      - '59'
-      - '60'
-      - '61'
-      - '62'
-      - '63'
-      - '64'
-      - '65'
-      - '66'
-      - '67'
-      - '68'
-      - '69'
-      - '70'
-      - '71'
-      - '72'
-      - '73'
-      - '74'
-      - '75'
-      - '76'
-      - '77'
-      - '78'
-      - '79'
-      - '80'
-      - '81'
-      - '82'
-      - '83'
-      - '84'
-      - '85'
-      - '86'
-      - '87'
-      - '88'
-      - '89'
-      - '90'
-      - '91'
-      - '92'
-      - '93'
-      - '94'
-      - '95'
-      - '96'
-      - '97'
-      - '98'
-      - '99'
-      - '100'
-      - '101'
-      - '102'
-      - '103'
-      - '104'
-      cdr1_end: 110
-      cdr1_start: 76
-      cdr2_end: 160
-      cdr2_start: 151
-      cdr3_start: 295
-      delineation_scheme: IMGT
-      fwr1_end: 75
-      fwr1_start: 1
-      fwr2_end: 150
-      fwr2_start: 111
-      fwr3_end: 294
-      fwr3_start: 161
-      sequence_delineation_id: '1'
-      unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
-    v_rs_end: null
-    v_rs_start: null
-  author: William Lees
-  curation: null
-  germline_set_id: OGRDB:G00007
-  germline_set_name: CAST IGH
-  germline_set_ref: OGRDB:G00007.1
-  lab_address: Birkbeck College, University of London, Malet Street, London
-  lab_name: ''
-  locus: IGH
-  pub_ids: ['']
-  release_date: '2021-11-24'
-  release_description: ''
-  release_version: 1
-  species:
-    id: NCBITAXON:10090
-    label: Mus musculus
-  species_subgroup: CAST_EiJ
-  species_subgroup_type: strain
-
+    allele_descriptions:
+      - allele_description_id: OGRDB:A00301
+        allele_description_ref: OGRDB:Mouse_IGH:IGHV-2DBF
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
+        release_version: 1
+        release_date: "2021-11-24"
+        release_description: First release
+        label: IGHV-2DBF
+        sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+        coding_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+        aliases:
+          - watson_et_al:CAST_EiJ_IGHV5-3
+        locus: IGH
+        chromosome:
+        sequence_type: V
+        functional: true
+        inference_type: rearranged_only
+        species:
+          id: NCBITAXON:10090
+          label: Mus musculus
+        species_subgroup: CAST_EiJ
+        species_subgroup_type: strain
+        status: active
+        gene_designation:
+        subgroup_designation:
+        allele_designation:
+        gene_start:
+        gene_end:
+        utr_5_prime_start:
+        utr_5_prime_end:
+        leader_1_start:
+        leader_1_end:
+        leader_2_start:
+        leader_2_end:
+        v_rs_start:
+        v_rs_end:
+        v_gene_delineations:
+          - sequence_delineation_id: '1'
+            delineation_scheme: IMGT
+            unaligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGATGGTAGTGGCACCTACTATCTGGACTCCTTGAAGAGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+            aligned_sequence: GAAGTGAAGCTGGTGGAGTCTGAGGGA...GGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTC............AGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA
+            fwr1_start: 1
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
+              - '1'
+              - '2'
+              - '3'
+              - '4'
+              - '5'
+              - '6'
+              - '7'
+              - '8'
+              - '9'
+              - '10'
+              - '11'
+              - '12'
+              - '13'
+              - '14'
+              - '15'
+              - '16'
+              - '17'
+              - '18'
+              - '19'
+              - '20'
+              - '21'
+              - '22'
+              - '23'
+              - '24'
+              - '25'
+              - '26'
+              - '27'
+              - '28'
+              - '29'
+              - '30'
+              - '31'
+              - '32'
+              - '33'
+              - '34'
+              - '35'
+              - '36'
+              - '37'
+              - '38'
+              - '39'
+              - '40'
+              - '41'
+              - '42'
+              - '43'
+              - '44'
+              - '45'
+              - '46'
+              - '47'
+              - '48'
+              - '49'
+              - '50'
+              - '51'
+              - '52'
+              - '53'
+              - '54'
+              - '55'
+              - '56'
+              - '57'
+              - '58'
+              - '59'
+              - '60'
+              - '61'
+              - '62'
+              - '63'
+              - '64'
+              - '65'
+              - '66'
+              - '67'
+              - '68'
+              - '69'
+              - '70'
+              - '71'
+              - '72'
+              - '73'
+              - '74'
+              - '75'
+              - '76'
+              - '77'
+              - '78'
+              - '79'
+              - '80'
+              - '81'
+              - '82'
+              - '83'
+              - '84'
+              - '85'
+              - '86'
+              - '87'
+              - '88'
+              - '89'
+              - '90'
+              - '91'
+              - '92'
+              - '93'
+              - '94'
+              - '95'
+              - '96'
+              - '97'
+              - '98'
+              - '99'
+              - '100'
+              - '101'
+              - '102'
+              - '103'
+              - '104'
+        unrearranged_support: []
+        rearranged_support: []
+        paralogs: []
+        curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV5-3'
+        curational_tags:
+      - allele_description_id: OGRDB:A00314
+        allele_description_ref: OGRDB:Mouse_IGH:IGHV-2ETO
+        acknowledgements:
+          - contributor_id: "3"
+            name: William Lees
+            orcid_id:
+              id: ORCID:0000-0001-9834-6840
+              label: William Lees
+            affiliation:
+              id: ROR:02mb95055
+              label: Birkbeck, University of London
+            affiliation_department:
+            contributions:
+              - role: investigation
+                degree:
+              - role: data curation
+                degree:
+        release_version: 1
+        release_date: "2021-11-24"
+        release_description: First release
+        label: IGHV-2ETO
+        sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+        coding_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+        aliases:
+          - watson_et_al:CAST_EiJ_IGHV8-2
+        locus: IGH
+        chromosome:
+        sequence_type: V
+        functional: true
+        inference_type: rearranged_only
+        species:
+          id: NCBITAXON:10090
+          label: Mus musculus
+        species_subgroup: CAST_EiJ
+        species_subgroup_type: strain
+        status: active
+        gene_designation:
+        subgroup_designation:
+        allele_designation:
+        gene_start:
+        gene_end:
+        utr_5_prime_start:
+        utr_5_prime_end:
+        leader_1_start:
+        leader_1_end:
+        leader_2_start:
+        leader_2_end:
+        v_rs_start:
+        v_rs_end:
+        v_gene_delineations:
+          - sequence_delineation_id: '1'
+            delineation_scheme: IMGT
+            unaligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+            aligned_sequence: CAAGTTACTCTAAAAGAGTCTGGCCCTG...GGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGA............GCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGT......GGGATGATGATAAGTACTATAACCCATCCCTGA...AGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC
+            fwr1_start: 1
+            fwr1_end: 75
+            cdr1_start: 76
+            cdr1_end: 110
+            fwr2_start: 111
+            fwr2_end: 150
+            cdr2_start: 151
+            cdr2_end: 160
+            fwr3_start: 161
+            fwr3_end: 294
+            cdr3_start: 295
+            alignment_labels:
+              - '1'
+              - '2'
+              - '3'
+              - '4'
+              - '5'
+              - '6'
+              - '7'
+              - '8'
+              - '9'
+              - '10'
+              - '11'
+              - '12'
+              - '13'
+              - '14'
+              - '15'
+              - '16'
+              - '17'
+              - '18'
+              - '19'
+              - '20'
+              - '21'
+              - '22'
+              - '23'
+              - '24'
+              - '25'
+              - '26'
+              - '27'
+              - '28'
+              - '29'
+              - '30'
+              - '31'
+              - '32'
+              - '33'
+              - '34'
+              - '35'
+              - '36'
+              - '37'
+              - '38'
+              - '39'
+              - '40'
+              - '41'
+              - '42'
+              - '43'
+              - '44'
+              - '45'
+              - '46'
+              - '47'
+              - '48'
+              - '49'
+              - '50'
+              - '51'
+              - '52'
+              - '53'
+              - '54'
+              - '55'
+              - '56'
+              - '57'
+              - '58'
+              - '59'
+              - '60'
+              - '61'
+              - '62'
+              - '63'
+              - '64'
+              - '65'
+              - '66'
+              - '67'
+              - '68'
+              - '69'
+              - '70'
+              - '71'
+              - '72'
+              - '73'
+              - '74'
+              - '75'
+              - '76'
+              - '77'
+              - '78'
+              - '79'
+              - '80'
+              - '81'
+              - '82'
+              - '83'
+              - '84'
+              - '85'
+              - '86'
+              - '87'
+              - '88'
+              - '89'
+              - '90'
+              - '91'
+              - '92'
+              - '93'
+              - '94'
+              - '95'
+              - '96'
+              - '97'
+              - '98'
+              - '99'
+              - '100'
+              - '101'
+              - '102'
+              - '103'
+              - '104'
+        unrearranged_support: []
+        rearranged_support: []
+        paralogs: []
+        curation: 'Imported to OGRDB with the following notes: watson_et_al: CAST_EiJ_IGHV8-2'
+        curational_tags:
+    curation:
 
 GenotypeSet:
-  - receptor_genotype_set_id: '1'
+  - receptor_genotype_set_id: "1"
     genotype_class_list:
-      - receptor_genotype_id: '1'
+      - receptor_genotype_id: "1"
         locus: IGH
         documented_alleles:
           - label: IGHV1-69*01
diff --git a/tests/data/good_genotype_set.json b/tests/data/good_genotype_set.json
index ba10f56e9..abd24646c 100644
--- a/tests/data/good_genotype_set.json
+++ b/tests/data/good_genotype_set.json
@@ -35,4 +35,4 @@
             }
         ]
     }]
-}
\ No newline at end of file
+}
diff --git a/tests/data/good_germline_set.json b/tests/data/good_germline_set.json
index 41ecf5f7d..e74c590dc 100644
--- a/tests/data/good_germline_set.json
+++ b/tests/data/good_germline_set.json
@@ -1,17 +1,41 @@
 {
     "GermlineSet": [{
         "germline_set_id": "OGRDB:G00007",
-        "author": "William Lees",
-        "lab_name": "",
-        "lab_address": "Birkbeck College, University of London, Malet Street, London",
-        "acknowledgements": [],
+        "acknowledgements": [
+            {
+                "contributor_id": "1",
+                "name": "William Lees",
+                "orcid_id": {
+                    "id": "ORCID:0000-0001-9834-6840",
+                    "label": "William Lees"
+                },
+                "affiliation": {
+                    "id": "ROR:02mb95055",
+                    "label": "Birkbeck, University of London"
+                },
+                "affiliation_department":"",
+                "contributions": [
+                    {
+                        "role": "investigation",
+                        "degree": null
+                    },
+                    {
+                        "role": "data curation",
+                        "degree": null
+                    }
+                ]
+            }
+        ],
         "release_version": 1,
         "release_description": "",
         "release_date": "2021-11-24",
         "germline_set_name": "CAST IGH",
         "germline_set_ref": "OGRDB:G00007.1",
         "pub_ids": [""],
-        "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+        "species": {
+            "id": "NCBITAXON:10090",
+            "label": "Mus musculus"
+        },
         "species_subgroup": "CAST_EiJ",
         "species_subgroup_type": "strain",
         "locus": "IGH",
@@ -19,11 +43,33 @@
             {
                 "allele_description_id": "OGRDB:A00301",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2DBF",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2DBF",
                 "sequence": "GAAGTGAAGCTGGTGGAGTCTGAGGGAGGCTTAGTGCAGCCTGGAAGTTCCATGAAACTCTCCTGCACAGCCTCTGGATTCACTTTCAGTGACTATTACATGGCTTGGGTCCGCCAGGTTCCAGAAAAGGGTCTAGAATGGGTTGCAAACATTAATTATGAT......GGTAGTGGCACCTACTATCTGGACTCCTTGAAG...AGCCGTTTCATCATCTCGAGAGACAATGCAAAGAACATTCTATACCTGCAAATGAGCAGTCTGAAGTCTGAGGACACAGCCACGTATTACTGTGCAA",
@@ -36,7 +82,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -70,7 +119,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
@@ -187,11 +236,33 @@
             {
                 "allele_description_id": "OGRDB:A00314",
                 "allele_description_ref": "OGRDB:Mouse_IGH:IGHV-2ETO",
-                "maintainer": "William Lees",
-                "acknowledgements": [],
-                "lab_address": "Birkbeck College, University of London, Malet Street, London",
+                "acknowledgements":  [
+                    {
+                        "contributor_id": "1",
+                        "name": "William Lees",
+                        "orcid_id": {
+                            "id": "ORCID:0000-0001-9834-6840",
+                            "label": "William Lees"
+                        },
+                        "affiliation": {
+                            "id": "ROR:02mb95055",
+                            "label": "Birkbeck, University of London"
+                        },
+                        "affiliation_department":"",
+                        "contributions": [
+                            {
+                                "role": "investigation",
+                                "degree": null
+                            },
+                            {
+                                "role": "data curation",
+                                "degree": null
+                            }
+                        ]
+                    }
+                ],
                 "release_version": 1,
-                "release_date": "24-Nov-2021",
+                "release_date": "2021-11-24",
                 "release_description": "First release",
                 "label": "IGHV-2ETO",
                 "sequence": "CAAGTTACTCTAAAAGAGTCTGGCCCTGGGATATTGAAGCCCTCACAGACCCTCAGTCTGACTTGTTCTTTCTCTGGGTTTTCACTGAGCACTACTAATATGGGTGTAGGCTGGATTCGTCAGCCTTCAGGGAAGGGTCTGGAGTGGCTGGCACACATTTGGTGGGATGATGATAAGTACTATAACCCATCCCTGAAGAGCCGGCTAACAATCTCCAAGGATACCTCCAGAAACCAGGTATTCCTCAAGATCACCAGTGTGGACACTGCAGATACTGCCACTTACTACTGTGCTC",
@@ -204,7 +275,10 @@
                 "sequence_type": "V",
                 "functional": true,
                 "inference_type": "rearranged_only",
-                "species": { "id": "NCBITAXON:10090", "label": "Mus musculus" },
+                "species": {
+                    "id": "NCBITAXON:10090",
+                    "label": "Mus musculus"
+                },
                 "species_subgroup": "CAST_EiJ",
                 "species_subgroup_type": "strain",
                 "status": "active",
@@ -238,7 +312,7 @@
                         "fwr3_start": 161,
                         "fwr3_end": 294,
                         "cdr3_start": 295,
-                        "alignment": [
+                        "alignment_labels": [
                             "1",
                             "2",
                             "3",
diff --git a/tests/data/good_repertoire.yaml b/tests/data/good_repertoire.yaml
index 9bf3a4653..6adaa2361 100644
--- a/tests/data/good_repertoire.yaml
+++ b/tests/data/good_repertoire.yaml
@@ -11,28 +11,50 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
-      keywords_study: 
+      keywords_study:
         - "contains_ig"
         - "contains_tr"
     subject:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -63,15 +85,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000788"
+          id: "CL:0000788"
           label: "naive B cell"
         cell_phenotype: "expression of CD20 and the absence of CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -144,13 +166,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -159,13 +203,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -196,15 +240,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000787"
+          id: "CL:0000787"
           label: "memory B cell"
         cell_phenotype: "expression of CD20 and CD27"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS
@@ -277,13 +321,35 @@ Repertoire:
         id: null
         label: null
       study_description: "The adaptive immune system's capability to protect the body requires a highly diverse lymphocyte antigen receptor repertoire. However, the influence of individual genetic and epigenetic differences on these repertoires is not typically measured. By leveraging the unique characteristics of B, CD4+ T, and CD8+ T lymphocyte subsets isolated from monozygotic twins, we have quantified the impact of heritable factors on both the V(D)J recombination process and thymic selection in the case of T cell receptors, and show that the repertoires of both naive and antigen experienced cells are subject to biases resulting from differences in recombination. We show that biases in V(D)J usage, as well as biased N/P additions, contribute to significant variation in the CDR3 region. Moreover, we show that the relative usage of V and J gene segments is chromosomally biased, with approximately 1.5 times as many rearrangements originating from a single chromosome. These data refine our understanding of the heritable mechanisms affecting the repertoire, and show that biases are evident on a chromosome-wide level."
-      study_contact: "Mark M. Davis,  mmdavis@stanford.edu, ORCID:0000-0001-6868-657X"
       inclusion_exclusion_criteria: null
-      lab_name: "Mark M. Davis"
-      lab_address: "Stanford University"
-      submitted_by: "Florian Rubelt"
+      contributors:
+        - contributor_id: "1"
+          name: "Florian Rubelt"
+          orcid_id:
+            id: null
+            label: null
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "investigation"
+              degree: null
+            - role: "data curation"
+              degree: null
+        - contributor_id: "2"
+          name: "Mark M. Davis"
+          orcid_id:
+            id: "ORCID:0000-0001-6868-657X"
+            label: "Mark Davis"
+          affiliation:
+            id: "ROR:00f54p054"
+            label: "Stanford University"
+          affiliation_department: "Department of Microbiology and Immunology, Stanford University School of Medicine"
+          contributions:
+            - role: "supervision"
+              degree: null
       pub_ids: ["PMID:27005435"]
-      collected_by: null
       grants: null
       keywords_study:
         - "contains_ig"
@@ -292,13 +358,13 @@ Repertoire:
       subject_id: TW01A
       synthetic: false
       species:
-        id: "NCBITaxon_9606"
+        id: "NCBITAXON:9606"
         label: "Homo sapiens"
       sex: female
       age_min: 27
       age_max: 27
       age_unit:
-        id: UO_0000036
+        id: UO:0000036
         label: year
       age_event: null
       ancestry_population:
@@ -329,15 +395,15 @@ Repertoire:
         sample_processing_id: null
         sample_type: "peripheral venous puncture"
         tissue:
-          id: "UBERON_0000178"
+          id: "UBERON:0000178"
           label: "blood"
         tissue_processing: "Ficoll gradient"
         cell_subset:
-          id: "CL_0000895"
+          id: "CL:0000895"
           label: "naive thymus-derived CD4-positive, alpha-beta T cell"
         cell_phenotype: "expression of CD8 and absence of CD4 and CD45RO"
         cell_species:
-          id: "NCBITaxon_9606"
+          id: "NCBITAXON:9606"
           label: "Homo sapiens"
         single_cell: false
         cell_isolation: FACS