From f421d6e0c9bd9ee9ae2da9a74c1c38a10589a6ca Mon Sep 17 00:00:00 2001 From: CH Albach Date: Fri, 15 Apr 2016 15:22:56 -0700 Subject: [PATCH 1/3] mv Avro files to proto equivalent locations, no content change. --- .../ga4gh/allele_annotation_service.proto} | 0 .../ga4gh/allele_annotations.proto} | 0 .../common.avdl => proto/ga4gh/common.proto} | 11 +++++++++++ .../metadata.avdl => proto/ga4gh/metadata.proto} | 0 .../ga4gh/metadata_service.proto} | 0 .../ga4gh/read_service.proto} | 0 .../avro/reads.avdl => proto/ga4gh/reads.proto} | 0 .../ga4gh/reference_service.proto} | 0 .../ga4gh/references.proto} | 0 .../ga4gh/sequence_annotation_service.proto} | 0 .../ga4gh/sequence_annotations.proto} | 0 .../ga4gh/variant_service.proto} | 0 .../variants.avdl => proto/ga4gh/variants.proto} | 0 src/main/resources/avro/methods.avdl | 16 ---------------- 14 files changed, 11 insertions(+), 16 deletions(-) rename src/main/{resources/avro/alleleAnnotationmethods.avdl => proto/ga4gh/allele_annotation_service.proto} (100%) rename src/main/{resources/avro/alleleAnnotations.avdl => proto/ga4gh/allele_annotations.proto} (100%) rename src/main/{resources/avro/common.avdl => proto/ga4gh/common.proto} (96%) rename src/main/{resources/avro/metadata.avdl => proto/ga4gh/metadata.proto} (100%) rename src/main/{resources/avro/metadatamethods.avdl => proto/ga4gh/metadata_service.proto} (100%) rename src/main/{resources/avro/readmethods.avdl => proto/ga4gh/read_service.proto} (100%) rename src/main/{resources/avro/reads.avdl => proto/ga4gh/reads.proto} (100%) rename src/main/{resources/avro/referencemethods.avdl => proto/ga4gh/reference_service.proto} (100%) rename src/main/{resources/avro/references.avdl => proto/ga4gh/references.proto} (100%) rename src/main/{resources/avro/sequenceAnnotationmethods.avdl => proto/ga4gh/sequence_annotation_service.proto} (100%) rename src/main/{resources/avro/sequenceAnnotations.avdl => proto/ga4gh/sequence_annotations.proto} (100%) rename 
src/main/{resources/avro/variantmethods.avdl => proto/ga4gh/variant_service.proto} (100%) rename src/main/{resources/avro/variants.avdl => proto/ga4gh/variants.proto} (100%) delete mode 100644 src/main/resources/avro/methods.avdl diff --git a/src/main/resources/avro/alleleAnnotationmethods.avdl b/src/main/proto/ga4gh/allele_annotation_service.proto similarity index 100% rename from src/main/resources/avro/alleleAnnotationmethods.avdl rename to src/main/proto/ga4gh/allele_annotation_service.proto diff --git a/src/main/resources/avro/alleleAnnotations.avdl b/src/main/proto/ga4gh/allele_annotations.proto similarity index 100% rename from src/main/resources/avro/alleleAnnotations.avdl rename to src/main/proto/ga4gh/allele_annotations.proto diff --git a/src/main/resources/avro/common.avdl b/src/main/proto/ga4gh/common.proto similarity index 96% rename from src/main/resources/avro/common.avdl rename to src/main/proto/ga4gh/common.proto index 8f9bba87..8f9f2e4e 100644 --- a/src/main/resources/avro/common.avdl +++ b/src/main/proto/ga4gh/common.proto @@ -5,6 +5,17 @@ There are no directly associated methods. */ protocol Common { +/** +A general exception type. +*/ +error GAException { + /** The error message */ + string message; + + /** The numerical error code */ + int errorCode = -1; +} + /** Indicates the DNA strand associate for some data item. * `NEG_STRAND`: The negative (-) strand. 
diff --git a/src/main/resources/avro/metadata.avdl b/src/main/proto/ga4gh/metadata.proto similarity index 100% rename from src/main/resources/avro/metadata.avdl rename to src/main/proto/ga4gh/metadata.proto diff --git a/src/main/resources/avro/metadatamethods.avdl b/src/main/proto/ga4gh/metadata_service.proto similarity index 100% rename from src/main/resources/avro/metadatamethods.avdl rename to src/main/proto/ga4gh/metadata_service.proto diff --git a/src/main/resources/avro/readmethods.avdl b/src/main/proto/ga4gh/read_service.proto similarity index 100% rename from src/main/resources/avro/readmethods.avdl rename to src/main/proto/ga4gh/read_service.proto diff --git a/src/main/resources/avro/reads.avdl b/src/main/proto/ga4gh/reads.proto similarity index 100% rename from src/main/resources/avro/reads.avdl rename to src/main/proto/ga4gh/reads.proto diff --git a/src/main/resources/avro/referencemethods.avdl b/src/main/proto/ga4gh/reference_service.proto similarity index 100% rename from src/main/resources/avro/referencemethods.avdl rename to src/main/proto/ga4gh/reference_service.proto diff --git a/src/main/resources/avro/references.avdl b/src/main/proto/ga4gh/references.proto similarity index 100% rename from src/main/resources/avro/references.avdl rename to src/main/proto/ga4gh/references.proto diff --git a/src/main/resources/avro/sequenceAnnotationmethods.avdl b/src/main/proto/ga4gh/sequence_annotation_service.proto similarity index 100% rename from src/main/resources/avro/sequenceAnnotationmethods.avdl rename to src/main/proto/ga4gh/sequence_annotation_service.proto diff --git a/src/main/resources/avro/sequenceAnnotations.avdl b/src/main/proto/ga4gh/sequence_annotations.proto similarity index 100% rename from src/main/resources/avro/sequenceAnnotations.avdl rename to src/main/proto/ga4gh/sequence_annotations.proto diff --git a/src/main/resources/avro/variantmethods.avdl b/src/main/proto/ga4gh/variant_service.proto similarity index 100% rename from 
src/main/resources/avro/variantmethods.avdl rename to src/main/proto/ga4gh/variant_service.proto diff --git a/src/main/resources/avro/variants.avdl b/src/main/proto/ga4gh/variants.proto similarity index 100% rename from src/main/resources/avro/variants.avdl rename to src/main/proto/ga4gh/variants.proto diff --git a/src/main/resources/avro/methods.avdl b/src/main/resources/avro/methods.avdl deleted file mode 100644 index ead63134..00000000 --- a/src/main/resources/avro/methods.avdl +++ /dev/null @@ -1,16 +0,0 @@ -@namespace("org.ga4gh.methods") - -protocol RPC { - -/** -A general exception type. -*/ -error GAException { - /** The error message */ - string message; - - /** The numerical error code */ - int errorCode = -1; -} - -} From 0ff95ff42e9436522a7387266742832ed29d77a6 Mon Sep 17 00:00:00 2001 From: CH Albach Date: Fri, 15 Apr 2016 17:39:32 -0700 Subject: [PATCH 2/3] Convert Avro -> proto3. --- CONTRIBUTING.rst | 50 +-- README.rst | 4 +- pom.xml | 84 ++-- .../ga4gh/allele_annotation_service.proto | 323 ++++++------- src/main/proto/ga4gh/allele_annotations.proto | 234 ++++------ src/main/proto/ga4gh/common.proto | 242 +++++----- src/main/proto/ga4gh/metadata.proto | 309 ++++++------- src/main/proto/ga4gh/metadata_service.proto | 99 ++-- src/main/proto/ga4gh/read_service.proto | 296 +++++------- src/main/proto/ga4gh/reads.proto | 411 ++++++++--------- src/main/proto/ga4gh/reference_service.proto | 369 +++++++-------- src/main/proto/ga4gh/references.proto | 211 ++++----- .../ga4gh/sequence_annotation_service.proto | 288 +++++------- .../proto/ga4gh/sequence_annotations.proto | 213 ++++----- src/main/proto/ga4gh/variant_service.proto | 355 +++++++-------- src/main/proto/ga4gh/variants.proto | 423 ++++++++---------- tests/compile_schemas.py | 126 ------ tests/test_protocol.py | 62 --- 18 files changed, 1667 insertions(+), 2432 deletions(-) delete mode 100644 tests/compile_schemas.py delete mode 100644 tests/test_protocol.py diff --git a/CONTRIBUTING.rst 
b/CONTRIBUTING.rst index a1834992..fe745158 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -118,54 +118,12 @@ Syntax Style and Conventions The current code conventions for the source files are as follows: +- Follow the `protocol buffers style guide + `__ - Use two-space indentation, and no tabs. - Hard-wrap code to 80 characters per line. -- Use ``UpperCamelCase`` for object or record names. -- Use ``lowerCamelCase`` for attribute or method names. -- Use ``CONSTANT_CASE`` for global and constant values. -- Comments: - - - Comments should be indented at the same level as the surrounding - code. - - Comments should precede the code that they make a comment on. - Documentation comments will not work otherwise. - - Documentation comments, which are intended to be processed by - avrodoc and displayed in the user-facing API documentation, must use - the ``/** ... */`` style, and must not have a leading ``*`` on each - internal line: - - :: - - /** - This documentation comment will be - processed correctly by avrodoc. - */ - - :: - - /** - * This documentation comment will have a - * bullet point at the start of every line - * when processed by avrodoc. - */ - - - Block and multi-line non-documentation comments, intended for schema - developers only, must use the ``/* ... */`` style. - - :: - - /* - This multi-line comment will not appear in the - avrodoc documentation and is intended for - schema developers. - */ - - - All multi-line comments should have the comment text at the same - indent level as the comment delimeters. - - One-line non-documentation comments, intended for schema developers - only, must use the ``// ...`` style. - - Comments may use `reStructuredText - `__ mark up. +- Comments may use `reStructuredText + `__ mark up. Documentation @@@@@@@@@@@@@ diff --git a/README.rst b/README.rst index 9e087310..4cf5007e 100644 --- a/README.rst +++ b/README.rst @@ -31,8 +31,8 @@ primary data collected from sequencing machines. The team will deliver: #. 
Data model. An abstract, mathematically complete and precise model of - the data that is manipulated by the API. See the `Avro - directory `__ for our in-progress work on + the data that is manipulated by the API. See the `Proto + directory `__ for our in-progress work on defining v0.5 of the data model. #. API Specification. A human-readable document introducing and diff --git a/pom.xml b/pom.xml index 1ac68de0..4ff836bb 100644 --- a/pom.xml +++ b/pom.xml @@ -35,21 +35,44 @@ - 1.7.7 + 3.0.0-beta-2 [1.6,) [3.0.4,) UTF-8 UTF-8 + + + + + never + + + false + + central + Central Repository + https://repo.maven.apache.org/maven2 + + + protoc-plugin + https://dl.bintray.com/sergei-ivanov/maven/ + + + + + + kr.motd.maven + os-maven-plugin + 1.4.0.Final + + - - org.apache.avro - avro-maven-plugin - ${avro.version} - org.apache.maven.plugins maven-clean-plugin @@ -125,21 +148,18 @@ - org.apache.avro - avro-maven-plugin + com.google.protobuf.tools + maven-protoc-plugin + 0.4.4 + + com.google.protobuf:protoc:${proto.version}:exe:${os.detected.classifier} + - schemas - generate-sources - schema - protocol - idl-protocol + compile + compile-python - - ${project.basedir}/src/main/resources/avro - String - @@ -165,7 +185,8 @@ - + - - - - org.apache.avro - avro - ${avro.version} - - - org.apache.avro - avro-ipc - ${avro.version} - - - - org.apache.avro - avro - compile - - - org.apache.avro - avro-ipc - compile + com.google.protobuf + protobuf-java + ${proto.version} diff --git a/src/main/proto/ga4gh/allele_annotation_service.proto b/src/main/proto/ga4gh/allele_annotation_service.proto index a32e3ac4..708d62da 100644 --- a/src/main/proto/ga4gh/allele_annotation_service.proto +++ b/src/main/proto/ga4gh/allele_annotation_service.proto @@ -1,200 +1,147 @@ -@namespace("org.ga4gh.methods") -protocol AlleleAnnotationMethods { - -/* -This protocol defines methods use to mine pre-calculated variant allele -annotations. 
-*/ - -import idl "methods.avdl"; -import idl "variants.avdl"; -import idl "alleleAnnotations.avdl"; -import idl "metadata.avdl"; -/* -variantannotations/search returns annotation for the alleles of Variants - -This allows the mining of allele-specific annotations on a VariantSet by -either a region or by a set of genomic features. Where a region is supplied -annotation of all alleles vs all features in the region is returned. Where a -set of features is supplied, only annotations against these features (matching -on featuretype and id) are returned and other overlapping features are ignored. - -variantannotationsets/search returns information on the input to the -annotation. This will be a VariantSet and the reference data and software -versions used in calculating the annotation. -It is essential this information is exhaustive. -*/ - -/****************** /variantannotations/search *********************/ -/** -This request maps to the body of `POST /variantannotations/search` as JSON -*/ -record SearchVariantAnnotationsRequest { - /** Required. The ID of the variant annotation set to search over. */ - string variantAnnotationSetId; - - /** - Only return variants with reference alleles on the reference with this - name. One of this field or `referenceId` or `features` is required. - (case-sensitive, exact match) - */ - union { null, string } referenceName = null; - - /** - Only return variants with reference alleles on the reference with this - ID. One of this field or `referenceName` or `features` is required. - */ - union { null, string } referenceId = null; - - /** - Required if referenceName or referenceId supplied. - The beginning of the window (0-based, inclusive) for which variants with - overlapping reference alleles should be returned. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). 
- */ - long start; - - /** - Required if referenceName or referenceId supplied. - The end of the window (0-based, exclusive) for which variants with - overlapping reference alleles should be returned. - */ - long end; - -// This section will be re-instated when features are available in the API -// /** -// Only return variant annotations for any of these features. -// Features may include specific transcripts or genes. A search by gene will -// return information for all transcripts associated with the gene in the -// variant annotation set. -// This or a location (referenceName/referenceId plus optional start and end) -// must be supplied. -// If null, return all variant annotations in specified window. -// */ -// union { null, array } featureIds = null; - - /** - This filter allows variant, transcript combinations to be extracted by effect - type(s). - Only return variant annotations including any of these effects and only return - transcript effects including any of these effects. Exact matching across all - fields of the Sequence Ontology OntologyTerm is required. - (A transcript effect may have multiple SO effects which will all be reported.) - If null, return all variant annotations. - */ - union { null, array } effects = null; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/allele_annotations.proto"; +import "ga4gh/metadata.proto"; + +service AlleleAnnotationService { + // Gets a list of `VariantAnnotation`s matching the search criteria. 
+ // + // This allows the mining of allele-specific annotations on a VariantSet by + // either a region or by a set of genomic features. Where a region is supplied + // annotation of all alleles vs all features in the region is returned. Where + // a set of features is supplied, only annotations against these features + // (matching on featuretype and id) are returned and other overlapping + // features are ignored. + // + // variantannotationsets/search returns information on the input to the + // annotation. This will be a VariantSet and the reference data and software + // versions used in calculating the annotation. It is essential this + // information is exhaustive. + // + // `POST /variantannotations/search` must accept a JSON version of + // `SearchVariantAnnotationsRequest` as the post body and will return a JSON + // version of `SearchVariantAnnotationsResponse`. + rpc SearchVariantAnnotations(SearchVariantAnnotationsRequest) + returns (SearchVariantAnnotationsResponse); + + // Returns a list of available variant annotation sets. + // + // `POST /variantannotationsets/search` must accept a JSON version of + // `SearchVariantAnnotationSetsRequest` as the post body and will return a + // JSON version of `SearchVariantAnnotationSetsResponse`. + rpc SearchVariantAnnotationSets(SearchVariantAnnotationSetsRequest) + returns (SearchVariantAnnotationSetsResponse); + + // Gets an `VariantAnnotationSet` by ID. + // + // `GET /variantannotationsets/{variant_annotation_set_id}` will return a JSON + // version of `VariantAnnotationSet`. + rpc GetVariantAnnotationSet(GetVariantAnnotationSetRequest) + returns (VariantAnnotationSet); } -/** -This is the response from `POST /variantannotations/search` expressed as JSON. -*/ -record SearchVariantAnnotationsResponse { - /** - The list of matching variant annotations. - */ - array variantAnnotations = []; - - /** - The continuation token, which is used to page through large result sets. 
- Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +/****************** /variantannotations *********************/ +// This request maps to the body of `POST /variantannotations/search` as JSON. +message SearchVariantAnnotationsRequest { + // Required. The ID of the variant annotation set to search over. + string variant_annotation_set_id = 1; + + // Only return variants with reference alleles on the reference with this + // name. One of this field or `reference_id` is required. + string reference_name = 2; + + // Only return variants with reference alleles on the reference with this + // ID. One of this field or `reference_name` is required. + string reference_id = 3; + + // Required if reference_name or reference_id supplied. The beginning of the + // window (0-based, inclusive) for which variants with overlapping reference + // alleles should be returned. Genomic positions are non-negative integers + // less than reference length. Requests spanning the join of circular + // genomes are represented as two requests one on each side of the join + // (position 0). + int64 start = 4; + + // Required if reference_name or reference_id supplied. The end of the window + // (0-based, exclusive) for which variants with overlapping reference + // alleles should be returned. + int64 end = 5; + + // This section will be re-instated when features are available in the API + // + // Only return variant annotations for any of these features. + // Features may include specific transcripts or genes. A search by gene will + // return information for all transcripts associated with the gene in the + // variant annotation set. + // This or a location (referenceName/referenceId plus optional start and end) + // must be supplied. + // If empty, return all variant annotations in specified window. 
+ // repeated string feature_ids; + + // This filter allows variant, transcript combinations to be extracted by + // effect type(s). Only return variant annotations including any of these + // effects and only return transcript effects including any of these + // effects. Exact matching across all fields of the Sequence Ontology + // OntologyTerm is required. (A transcript effect may have multiple SO + // effects which will all be reported.) If empty, return all variant + // annotations. + repeated OntologyTerm effects = 6; + + // Specifies the maximum number of results to return in a single page. If + // unspecified, a system default will be used. + int32 page_size = 7; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 8; } -/** -Gets a list of `VariantAnnotations` matching the search criteria. - -`POST /variantannotations/search` must accept a JSON version of -`SearchVariantAnnotationsRequest` as the post body and will return a -JSON version of `SearchVariantAnnotationsResponse`. -*/ -SearchVariantAnnotationsResponse searchVariantAnnotations( - /** - This request maps to the body of `POST /variantannotations/search` as JSON. - */ - SearchVariantAnnotationsRequest request) throws GAException; - -/** -This request maps to the body of `POST /variantannotationsets/search` as JSON -*/ -record SearchVariantAnnotationSetsRequest { - - /** - Required. The `VariantSet` to search. - */ - string variantSetId; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. 
- */ - union { null, string } pageToken = null; +// This is the response from `POST /variantannotations/search` expressed as +// JSON. +message SearchVariantAnnotationsResponse { + // The list of matching variant annotations. + repeated VariantAnnotation variant_annotations = 1; + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; } -/** -This is the response from `POST /variantannotationsets/search` expressed -as JSON. -*/ -record SearchVariantAnnotationSetsResponse { - - /** The list of matching variant annotation sets. */ - array variantAnnotationSets = []; +/****************** /variantannotationsets *********************/ +// This request maps to the body of `POST /variantannotationsets/search` as +// JSON. +message SearchVariantAnnotationSetsRequest { + // Required. The `VariantSet` to search. + string variant_set_id = 1; + + // Specifies the maximum number of results to return in a single page. If + // unspecified, a system default will be used. + int32 page_size = 2; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 3; +} - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// This is the response from `POST /variantannotationsets/search` expressed as +// JSON. +message SearchVariantAnnotationSetsResponse { + // The list of matching variant annotation sets. 
+ repeated VariantAnnotationSet variant_annotation_sets = 1; + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; } -/** -Returns a list of available variant annotation sets -`POST /variantannotationsets/search` must accept a JSON version of -`SearchVariantAnnotationSetsRequest` as the post body and will return a JSON -version of `SearchVariantAnnotationSetsResponse`. -*/ -SearchVariantAnnotationSetsResponse searchVariantAnnotationSets( - /** - This request maps to the body of `POST /variantannotationsets/search` as JSON - */ - SearchVariantAnnotationSetsRequest request) throws GAException; - -/**************** /variantannotationsets/{id} *******************/ -/** -Gets an `VariantAnnotationSet` by ID. -`GET /variantannotationsets/{id}` will return a JSON version of -`VariantAnnotationSet`. -*/ -org.ga4gh.models.VariantAnnotationSet getVariantAnnotationSet( - /** - The ID of the `VariantAnnotationSet`. - */ - string id) throws GAException; - +// This request maps to the URL `GET /variantannotationsets/{id}`. +message GetVariantAnnotationSetRequest { + // The ID of the `VariantAnnotationSet` to be retrieved. + string variant_annotation_set_id = 1; } diff --git a/src/main/proto/ga4gh/allele_annotations.proto b/src/main/proto/ga4gh/allele_annotations.proto index 511f5279..d53282cd 100644 --- a/src/main/proto/ga4gh/allele_annotations.proto +++ b/src/main/proto/ga4gh/allele_annotations.proto @@ -1,180 +1,144 @@ -@namespace("org.ga4gh.models") +syntax = "proto3"; -/** -This protocol defines types used by the GA4GH Allele Annotation API. 
-*/ - -protocol AlleleAnnotations{ +package ga4gh; -import idl "common.avdl"; -import idl "metadata.avdl"; -import idl "variants.avdl"; +import "ga4gh/metadata.proto"; +import "google/protobuf/struct.proto"; /* - The VariantAnnotation record groups different types of annotation records by Variant. The TranscriptEffect sub record holds information on the effect of a specific -allele on a specific transcript. -As Variants may overlap multiple transcripts, they may have multiple -TranscriptEffect records. Variants with multiple alternate alleles will have -multiple TranscriptEffect records per transcript. -(2 alternate alleles x 3 transcripts = 6 TranscriptEffect records) - -VariantAnnotation records belong to VariantAnnotationSets. -VariantAnnotationSets are created by comparing a number of Variants from a -VariantSet to a specific set of reference data using specific software tools. -A VariantAnnotationSet contains information on reference data and software -versions used in calculating the annotation; it is essential this information -is exhaustive. - -*/ - -/** -An AnalysisResult record holds the output of a prediction package such -as SIFT on a specific allele. +allele on a specific transcript. As Variants may overlap multiple transcripts, +they may have multiple TranscriptEffect records. Variants with multiple +alternate alleles will have multiple TranscriptEffect records per transcript. (2 +alternate alleles x 3 transcripts = 6 TranscriptEffect records) + +VariantAnnotation records belong to VariantAnnotationSets. VariantAnnotationSets +are created by comparing a number of Variants from a VariantSet to a specific +set of reference data using specific software tools. A VariantAnnotationSet +contains information on reference data and software versions used in calculating +the annotation; it is essential this information is exhaustive. 
*/ -record AnalysisResult { - - /** The ID of the analysis record for this result */ - string analysisId; - /** The text-based result for this analysis */ - union { null, string} result; +// An AnalysisResult record holds the output of a prediction package such as +// SIFT on a specific allele. +message AnalysisResult { + // The ID of the analysis record for this result + string analysis_id = 1; - /** The numeric score for this analysis */ - union { null, int} score; + // The text-based result for this analysis + string result = 2; + // The numeric score for this analysis + int32 score = 3; } -/** -An allele location record holds the location of an allele relative to a -non-genomic coordinate system such as a CDS or protein and holds the -reference and alternate sequence where appropriate -*/ -record AlleleLocation { +// An allele location record holds the location of an allele relative to a non - +// genomic coordinate system such as a CDS or protein and holds the reference +// and alternate sequence where appropriate +message AlleleLocation { + // Relative start position of the allele in this coordinate system + int32 start = 1; - /** Relative start position of the allele in this coordinate system */ - int start ; + // Relative end position of the allele in this coordinate system + int32 end = 2; - /** Relative end position of the allele in this coordinate system */ - union { null, int } end = null; + // Reference sequence in feature (this should be the codon at CDS level) + string reference_sequence = 3; - /** Reference sequence in feature (this should be the codon at CDS level) */ - union { null, string } referenceSequence = null; - - /** Alternate sequence in feature (this should be the codon at CDS level) */ - union { null, string } alternateSequence = null; + // Alternate sequence in feature (this should be the codon at CDS level) + string alternate_sequence = 4; } -/** -A VariantAnnotationSet record groups VariantAnnotation records. 
It is derived -from a VariantSet and holds information describing the software and reference -data used in the annotation. -*/ -record VariantAnnotationSet { +// A VariantAnnotationSet record groups VariantAnnotation records. It is derived +// from a VariantSet and holds information describing the software and reference +// data used in the annotation. - /** The ID of the variant annotation set record */ - string id; +message VariantAnnotationSet { + // The ID of the variant annotation set record + string id = 1; - /** The ID of the variant set to which this annotation set belongs */ - string variantSetId; + // The ID of the variant set to which this annotation set belongs + string variantSet_id = 2; - /** The variant annotation set name. */ - union { null, string } name = null; + // The variant annotation set name. + string name = 3; - /** - Analysis details. It is essential to supply versions for all software and - reference data used. - */ - Analysis analysis; + // Analysis details. It is essential to supply versions for all software and + // reference data used. + Analysis analysis = 4; } -/** -A HGVSAnnotation record holds Human Genome Variation Society descriptions -of the sequence change with respect to genomic, transcript and protein -sequences. See: http://www.hgvs.org/mutnomen/recs.html. -Descriptions should be provided at genomic level. Descriptions at transcript -level should be provided when the allele lies within a transcript. Descriptions -at protein level should be provided when the allele lies within the translated -sequence or stop codon. -*/ -record HGVSAnnotation { - - union { null, string } genomic = null; - - union { null, string } transcript = null; - - union { null, string } protein = null; +// A HGVSAnnotation record holds Human Genome Variation Society descriptions of +// the sequence change with respect to genomic, transcript and protein +// sequences. See: http:// www.hgvs.org/mutnomen/recs.html. 
+// Descriptions should be provided at genomic level +// - Descriptions at transcript level should be provided when the allele +// lies within a transcript +// - Descriptions at protein level should be provided when the allele +// lies within the translated sequence or stop codon. +message HGVSAnnotation { + string genomic = 1; + string transcript = 2; + string protein = 3; } -/** -A transcript effect record is a set of information describing the -effect of an allele on a transcript -*/ -record TranscriptEffect { +// A transcript effect record is a set of information describing the effect of +// an allele on a transcript - /** The ID of the transcript effect record - */ - string id; +message TranscriptEffect { + // The ID of the transcript effect record + string id = 1; - //TODO: derive unique id from digest of data [location, allele, transcript?] + // TODO: derive unique id from digest of data [location, allele, transcript?] - /** The id of the transcript feature the annotation is relative to - */ - string featureId; + // The id of the transcript feature the annotation is relative to. + string feature_id = 2; - /** Alternate allele - a variant may have more than one alternate allele, - each of which will have distinct annotation. - */ - union { null, string} alternateBases = null; + // Alternate allele - a variant may have more than one alternate allele, + // each of which will have distinct annotation. + string alternate_bases = 3; - /** Effect of variant on this feature */ - array effects; + // Effect of variant on this feature. + repeated OntologyTerm effects = 4; - /** Human Genome Variation Society variant descriptions */ - HGVSAnnotation hgvsAnnotation; + // Human Genome Variation Society variant descriptions. + HGVSAnnotation hgvs_annotation = 5; - /** Change relative to cDNA */ - union {null, AlleleLocation} cDNALocation =null; + // Change relative to cDNA. 
+ AlleleLocation cdna_location = 6; - /* Change relative to coding sequence */ - union { null, AlleleLocation} CDSLocation = null; + // Change relative to coding sequence. + AlleleLocation cds_location = 7; - /** Change relative to protein */ - union { null, AlleleLocation} proteinLocation = null; + // Change relative to protein. + AlleleLocation protein_location = 8; - /** Output from prediction packages such as SIFT */ - array analysisResults; + // Output from prediction packages such as SIFT. + repeated AnalysisResult analysis_results = 9; } -/** -A `VariantAnnotation` record represents the result of comparing a variant -to a set of reference data. -*/ -record VariantAnnotation { - - /** The ID of this VariantAnnotation. */ - string id; +// A `VariantAnnotation` record represents the result of comparing a variant to +// a set of reference data. +message VariantAnnotation { + // The ID of this VariantAnnotation. + string id = 1; - /** The variant ID. */ - string variantId; + // The variant ID. + string variant_id = 2; - /** The ID of the variant annotation set this record belongs to. */ - string variantAnnotationSetId; + // The ID of the variant annotation set this record belongs to. + string variant_annotation_set_id = 3; - /** The :ref:`ISO 8601 ` time at which this record was created. */ - union { null, string } createDateTime = null; + // The time at which this record was created, in ISO 8601 format. + string create_date_time = 4; - /** - The transcript effect annotation for the alleles of this variant. Each one - represents the effect of a single allele on a single transcript. - */ - array transcriptEffects = []; - - /** Additional annotation data in key-value pairs. */ - map> info = {}; -} + // The transcript effect annotation for the alleles of this variant. Each + // one represents the effect of a single allele on a single transcript. + repeated TranscriptEffect transcript_effects = 5; + // Additional annotation data in key-value pairs. 
+ map info = 6; } diff --git a/src/main/proto/ga4gh/common.proto b/src/main/proto/ga4gh/common.proto index 8f9f2e4e..182672b3 100644 --- a/src/main/proto/ga4gh/common.proto +++ b/src/main/proto/ga4gh/common.proto @@ -1,147 +1,119 @@ -@namespace("org.ga4gh.models") -/** -This file defines common types used in other parts of the schema. -There are no directly associated methods. -*/ -protocol Common { - -/** -A general exception type. -*/ -error GAException { - /** The error message */ - string message; - - /** The numerical error code */ - int errorCode = -1; +// This file defines common types used in other parts of the schema. +// There are no directly associated methods. + +syntax = "proto3"; + +package ga4gh; + +// When returning an HTTP error response, a server may also return a JSON formatted GAException +// to better describe the error. +message GAException { + // Numerical error code + int32 error_code = 1; + + // The error message. + string message = 2; } -/** -Indicates the DNA strand associate for some data item. -* `NEG_STRAND`: The negative (-) strand. -* `POS_STRAND`: The postive (+) strand. -*/ +// Indicates the associated DNA strand for some data item. enum Strand { - NEG_STRAND, - POS_STRAND -} + STRAND_UNSPECIFIED = 0; -/** -A `Position` is an unoriented base in some `Reference`. A `Position` is -represented by a `Reference` name, and a base number on that `Reference` -(0-based). -*/ -record Position { - /** - The name of the `Reference` on which the `Position` is located. - */ - string referenceName; - - /** - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - */ - long position; - - /** - Strand the position is associated with. - */ - Strand strand; -} + // The negative (-) strand. + NEG_STRAND = 1; -/** -Identifier from a public database -*/ -record ExternalIdentifier { - /** - The source of the identifier. - (e.g. 
`Ensembl`) - */ - string database; - - /** - The ID defined by the external database. - (e.g. `ENST00000000000`) - */ - string identifier; - - /** - The version of the object or the database - (e.g. `78`) - */ - string version; + // The positive (+) strand. + POS_STRAND = 2; } -/** -An enum for the different types of CIGAR alignment operations that exist. -Used wherever CIGAR alignments are used. The different enumerated values -have the following usage: - -* `ALIGNMENT_MATCH`: An alignment match indicates that a sequence can be - aligned to the reference without evidence of an INDEL. Unlike the - `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, the `ALIGNMENT_MATCH` - operator does not indicate whether the reference and read sequences are an - exact match. This operator is equivalent to SAM's `M`. -* `INSERT`: The insert operator indicates that the read contains evidence of - bases being inserted into the reference. This operator is equivalent to - SAM's `I`. -* `DELETE`: The delete operator indicates that the read contains evidence of - bases being deleted from the reference. This operator is equivalent to - SAM's `D`. -* `SKIP`: The skip operator indicates that this read skips a long segment of - the reference, but the bases have not been deleted. This operator is - commonly used when working with RNA-seq data, where reads may skip long - segments of the reference between exons. This operator is equivalent to - SAM's 'N'. -* `CLIP_SOFT`: The soft clip operator indicates that bases at the start/end - of a read have not been considered during alignment. This may occur if the - majority of a read maps, except for low quality bases at the start/end of - a read. This operator is equivalent to SAM's 'S'. Bases that are soft clipped - will still be stored in the read. -* `CLIP_HARD`: The hard clip operator indicates that bases at the start/end of - a read have been omitted from this alignment.
This may occur if this linear - alignment is part of a chimeric alignment, or if the read has been trimmed - (e.g., during error correction, or to trim poly-A tails for RNA-seq). This - operator is equivalent to SAM's 'H'. -* `PAD`: The pad operator indicates that there is padding in an alignment. - This operator is equivalent to SAM's 'P'. -* `SEQUENCE_MATCH`: This operator indicates that this portion of the aligned - sequence exactly matches the reference (e.g., all bases are equal to the - reference bases). This operator is equivalent to SAM's '='. -* `SEQUENCE_MISMATCH`: This operator indicates that this portion of the - aligned sequence is an alignment match to the reference, but a sequence - mismatch (e.g., the bases are not equal to the reference). This can - indicate a SNP or a read error. This operator is equivalent to SAM's 'X'. -*/ -enum CigarOperation { - ALIGNMENT_MATCH, - INSERT, - DELETE, - SKIP, - CLIP_SOFT, - CLIP_HARD, - PAD, - SEQUENCE_MATCH, - SEQUENCE_MISMATCH +// A `Position` is an unoriented base in some `Reference`. A `Position` is +// represented by a `Reference` name, and a base number on that `Reference` +// (0-based). +message Position { + // The name of the `Reference` on which the `Position` is located. + string reference_name = 1; + + // The 0-based offset from the start of the forward strand for that + // `Reference`. Genomic positions are non-negative integers less than + // `Reference` length. + int64 position = 2; + + // Strand the position is associated with. + Strand strand = 3; } -/** -A structure for an instance of a CIGAR operation. -`FIXME: This belongs under Reads (only readAlignment refers to this)` -*/ -record CigarUnit { - /** The operation type. */ - CigarOperation operation; - - /** The number of bases that the operation runs for. */ - long operationLength; - - /** - `referenceSequence` is only used at mismatches (`SEQUENCE_MISMATCH`) - and deletions (`DELETE`). Filling this field replaces the MD tag. 
- If the relevant information is not available, leave this field as `null`. - */ - union { null, string } referenceSequence = null; +// Identifier from a public database +message ExternalIdentifier { + // The source of the identifier, e.g. `Ensembl`. + string database = 1; + + // The ID defined by the external database, e.g. `ENST00000000000`. + string identifier = 2; + + // The version of the object or the database, e.g. `78`. + string version = 3; } +// A single CIGAR operation. +// FIXME: This belongs under Reads (only readAlignment refers to this). +message CigarUnit { + // Describes the different types of CIGAR alignment operations that exist. + // Used wherever CIGAR alignments are used. + enum Operation { + OPERATION_UNSPECIFIED = 0; + // An alignment match indicates that a sequence can be aligned to the + // reference without evidence of an INDEL. Unlike the + // `SEQUENCE_MATCH` and `SEQUENCE_MISMATCH` operators, + // the `ALIGNMENT_MATCH` operator does not indicate whether the + // reference and read sequences are an exact match. This operator is + // equivalent to SAM's `M`. + ALIGNMENT_MATCH = 1; + // The insert operator indicates that the read contains evidence of bases + // being inserted into the reference. This operator is equivalent to SAM's + // `I`. + INSERT = 2; + // The delete operator indicates that the read contains evidence of bases + // being deleted from the reference. This operator is equivalent to SAM's + // `D`. + DELETE = 3; + // The skip operator indicates that this read skips a long segment of the + // reference, but the bases have not been deleted. This operator is commonly + // used when working with RNA-seq data, where reads may skip long segments + // of the reference between exons. This operator is equivalent to SAM's + // `N`. + SKIP = 4; + // The soft clip operator indicates that bases at the start/end of a read + // have not been considered during alignment. 
This may occur if the majority + // of a read maps, except for low quality bases at the start/end of a read. + // This operator is equivalent to SAM's `S`. Bases that are soft + // clipped will still be stored in the read. + CLIP_SOFT = 5; + // The hard clip operator indicates that bases at the start/end of a read + // have been omitted from this alignment. This may occur if this linear + // alignment is part of a chimeric alignment, or if the read has been + // trimmed (for example, during error correction or to trim poly-A tails for + // RNA-seq). This operator is equivalent to SAM's `H`. + CLIP_HARD = 6; + // The pad operator indicates that there is padding in an alignment. This + // operator is equivalent to SAM's `P`. + PAD = 7; + // This operator indicates that this portion of the aligned sequence exactly + // matches the reference. This operator is equivalent to SAM's `=`. + SEQUENCE_MATCH = 8; + // This operator indicates that this portion of the aligned sequence is an + // alignment match to the reference, but a sequence mismatch. This can + // indicate a SNP or a read error. This operator is equivalent to SAM's + // `X`. + SEQUENCE_MISMATCH = 9; + } + Operation operation = 1; + + // The number of genomic bases that the operation runs for. Required. + int64 operation_length = 2; + + // `referenceSequence` is only used at mismatches + // (`SEQUENCE_MISMATCH`) and deletions (`DELETE`). + // Filling this field replaces SAM's MD tag. If the relevant information is + // not available, this field is unset. + string reference_sequence = 3; } diff --git a/src/main/proto/ga4gh/metadata.proto b/src/main/proto/ga4gh/metadata.proto index fa2258a7..5283c504 100644 --- a/src/main/proto/ga4gh/metadata.proto +++ b/src/main/proto/ga4gh/metadata.proto @@ -1,192 +1,133 @@ -@namespace("org.ga4gh.models") - -/** -This protocol defines metadata used in the other GA4GH protocols. 
-*/ - -protocol Metadata { - -import idl "common.avdl"; - -/** - An ontology term describing an attribute. (e.g. the phenotype attribute - 'polydactyly' from HPO) - */ - -record OntologyTerm { - /** - Ontology source identifier - the identifier, a CURIE (preferred) or - PURL for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo - It differs from the standard GA4GH schema's :ref:`id ` - in that it is a URI pointing to an information resource outside of the scope - of the schema or its resource implementation. - */ - string id; - - /** - Ontology term - the representation the id is pointing to. - */ - union { null, string } term = null; - - /** - Ontology source name - the name of ontology from which the term is obtained - e.g. 'Human Phenotype Ontology' - */ - union { null, string } sourceName = null; - - /** - Ontology source version - the version of the ontology from which the - OntologyTerm is obtained; e.g. 2.6.1. - There is no standard for ontology versioning and some frequently - released ontologies may use a datestamp, or build number. - */ - union { null, string } sourceVersion = null; +syntax = "proto3"; + +package ga4gh; + +import "google/protobuf/struct.proto"; + +// An ontology term describing an attribute. (e.g. the phenotype attribute +// 'polydactyly' from HPO) +message OntologyTerm { + // Ontology source identifier - the identifier, a CURIE (preferred) or PURL + // for an ontology source e.g. http://purl.obolibrary.org/obo/hp.obo It + // differs from the standard GA4GH schema's :ref:`id ` + // in that it is a URI pointing to an information resource outside of the + // scope of the schema or its resource implementation. + string id = 1; + + // Ontology term - the representation the id is pointing to. + string term = 2; + + // Ontology source name - the name of ontology from which the term is obtained + // e.g. 
'Human Phenotype Ontology' + string source_name = 3; + + // Ontology source version - the version of the ontology from which the + // OntologyTerm is obtained; e.g. 2.6.1. There is no standard for ontology + // versioning and some frequently released ontologies may use a datestamp, or + // build number. + string source_version = 4; } -/** -An experimental preparation of a sample. -*/ -record Experiment { - /** The experiment UUID. This is globally unique. */ - string id; - - /** The name of the experiment. */ - union { null, string } name = null; - - /** A description of the experiment. */ - union { null, string } description = null; - - /** - The time at which this record was created. - Format: :ref:`ISO 8601 ` - */ - string createDateTime; - - /** - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - */ - string updateDateTime; - - /** - The time at which this experiment was performed. - Granularity here is variable (e.g. date only). - Format: :ref:`ISO 8601 ` - */ - union { null, string } runTime = null; - - /** - The molecule examined in this experiment. (e.g. genomics DNA, total RNA) - */ - union { null, string } molecule = null; - - /** - The experiment technique or strategy applied to the sample. - (e.g. whole genome sequencing, RNA-seq, RIP-seq) - */ - union { null, string } strategy = null; - - /** - The method used to enrich the target. (e.g. immunoprecipitation, size - fractionation, MNase digestion) - */ - union { null, string } selection = null; - - /** The name of the library used as part of this experiment. */ - union { null, string } library = null; - - /** The configuration of sequenced reads. (e.g. Single or Paired) */ - union { null, string } libraryLayout = null; - - /** - The instrument model used as part of this experiment. - This maps to sequencing technology in BAM. - */ - union { null, string } instrumentModel; - - /** - The data file generated by the instrument. - TODO: This isn't actually a file is it? 
- Should this be `instrumentData` instead? - */ - union { null, string } instrumentDataFile = null; - - /** The sequencing center used as part of this experiment. */ - union { null, string } sequencingCenter; - - /** - The platform unit used as part of this experiment. This is a flowcell-barcode - or slide unique identifier. - */ - union { null, string } platformUnit = null; - - /** - A map of additional experiment information. - */ - map> info = {}; -} +// An experimental preparation of a sample. +message Experiment { + // The experiment UUID. This is globally unique. + string id = 1; + + // The name of the experiment. + string name = 2; + + // A description of the experiment. + string description = 3; + + // The time at which this message was created. + // Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z) + string message_create_time = 4; + + // The time at which this message was last updated. + // Format: ISO 8601, YYYY-MM-DDTHH:MM:SS.SSS (e.g. 2015-02-10T00:03:42.123Z) + string message_update_time = 5; + + // The time at which this experiment was performed. + // Granularity here is variable (e.g. date only). + // Format: ISO 8601, YYYY-MM-DDTHH:MM:SS (e.g. 2015-02-10T00:03:42) + string run_time = 6; + + // The molecule examined in this experiment. (e.g. genomics DNA, total RNA) + string molecule = 7; + + // The experiment technique or strategy applied to the sample. + // (e.g. whole genome sequencing, RNA-seq, RIP-seq) + string strategy = 8; + + // The method used to enrich the target. (e.g. immunoprecipitation, size + // fractionation, MNase digestion) + string selection = 9; + + // The name of the library used as part of this experiment. + string library = 10; + + // The configuration of sequenced reads. (e.g. Single or Paired). + string library_layout = 11; + + // The instrument model used as part of this experiment. + // This maps to sequencing technology in BAM. 
+ string instrument_model = 12; -/** -A Dataset is a collection of related data of multiple types. -Data providers decide how to group data into datasets. -See [Metadata API](../api/metadata.html) for a more detailed discussion. -*/ -record Dataset { - /** - The dataset's id, locally unique to the server instance. - */ - string id; - - /** - The name of the dataset. - */ - union { null, string } name = null; - - /** - Additional, human-readable information on the dataset. - */ - union { null, string } description = null; + // The data file generated by the instrument. + // TODO: This isn't actually a file is it? + // Should this be `instrumentData` instead? + string instrument_data_file = 13; + // The sequencing center used as part of this experiment. + string sequencing_center = 14; + + // The platform unit used as part of this experiment. This is a + // flowcell-barcode + // or slide unique identifier. + string platform_unit = 15; + + // A map of additional experiment information. + map info = 16; } -/** -An analysis contains an interpretation of one or several experiments. -(e.g. SNVs, copy number variations, methylation status) together with -information about the methodology used. -*/ -record Analysis { - - /** - Formats of id | name | description | accessions are described in the - documentation on general attributes and formats. - */ - string id; - - union { null, string } name = null; - - union { null, string } description = null; - - /** - The time at which this record was created. - Format: :ref:`ISO 8601 ` - */ - union { null, string } createDateTime = null; - - /** - The time at which this record was last updated. - Format: :ref:`ISO 8601 ` - */ - string updateDateTime; - - /** The type of analysis. */ - union { null, string } type = null; - - /** The software run to generate this analysis. */ - array software = []; - - /** - A map of additional analysis information. - */ - map> info = {}; +// A Dataset is a collection of related data of multiple types. 
+// Data providers decide how to group data into datasets. +// See [Metadata API](../api/metadata.html) for a more detailed discussion. +message Dataset { + // The dataset's id, locally unique to the server instance. + string id = 1; + + // The name of the dataset. + string name = 2; + + // Additional, human-readable information on the dataset. + string description = 3; } + +// An analysis contains an interpretation of one or several experiments. (e.g. +// SNVs, copy number variations, methylation status) together with information +// about the methodology used. +message Analysis { + // Formats of id | name | description | accessions are described in the + // documentation on general attributes and formats. + string id = 1; + + string name = 2; + + string description = 3; + + // The time at which this record was created, in ISO 8601 format. + string create_date_time = 4; + + // The time at which this record was last updated, in ISO 8601 format. + string update_date_time = 5; + + // The type of analysis. + string type = 6; + + // The software run to generate this analysis. + repeated string software = 7; + + // A map of additional analysis information. + map info = 8; } diff --git a/src/main/proto/ga4gh/metadata_service.proto b/src/main/proto/ga4gh/metadata_service.proto index 1d4bee36..c3b65514 100644 --- a/src/main/proto/ga4gh/metadata_service.proto +++ b/src/main/proto/ga4gh/metadata_service.proto @@ -1,70 +1,51 @@ -@namespace("org.ga4gh.methods") +syntax = "proto3"; -protocol MetadataMethods { +package ga4gh; -import idl "metadata.avdl"; -import idl "common.avdl"; -import idl "methods.avdl"; +import "ga4gh/metadata.proto"; -/****************** /datasets/search *********************/ -/** -This request maps to the body of `POST /datasets/search` as JSON. -*/ -record SearchDatasetsRequest { +service MetadataService { + // Gets a list of `Dataset` matching the search criteria. 
+ // + // `POST /datasets/search` must accept a JSON version of + // `SearchDatasetsRequest` as the post body and will return a JSON + // version of `SearchDatasetsResponse`. + rpc SearchDatasets(SearchDatasetsRequest) returns (SearchDatasetsResponse); - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; + // Gets a `Dataset` by ID. + // + // `GET /datasets/{dataset_id}` will return a JSON version of + // `Dataset`. + rpc GetDataset(GetDatasetRequest) returns (Dataset); } -/** -This is the response from `POST /datasets/search` expressed as JSON. -*/ -record SearchDatasetsResponse { - /** The list of datasets. */ - array datasets = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; -} +// **************** /datasets ******************* -/** -Gets a list of datasets accessible through the API. +// This request maps to the body of `POST /datasets/search` as JSON. +message SearchDatasetsRequest { + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 1; -TODO: Reads and variants both want to have datasets. Are they the same object? + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. 
+ string page_token = 2; +} -`POST /datasets/search` must accept a JSON version of -`SearchDatasetsRequest` as the post body and will return a JSON version -of `SearchDatasetsResponse`. -*/ -SearchDatasetsResponse searchDatasets( - /** - This request maps to the body of `POST /datasets/search` as JSON. - */ - SearchDatasetsRequest request) throws GAException; +// This is the response from `POST /datasets/search` expressed as JSON. +message SearchDatasetsResponse { + // The list of datasets. + repeated Dataset datasets = 1; -/**************** /datasets/{id} *******************/ -/** -Gets a `Dataset` by ID. -`GET /datasets/{id}` will return a JSON version of `Dataset`. -*/ -org.ga4gh.models.Dataset getDataset( - /** - The ID of the `Dataset`. - */ - string id) throws GAException; + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} -} \ No newline at end of file +// This request maps to the URL `GET /datasets/{dataset_id}`. +message GetDatasetRequest { + // The ID of the `Dataset` to be retrieved. + string dataset_id = 1; +} diff --git a/src/main/proto/ga4gh/read_service.proto b/src/main/proto/ga4gh/read_service.proto index 3405205a..d8d348fe 100644 --- a/src/main/proto/ga4gh/read_service.proto +++ b/src/main/proto/ga4gh/read_service.proto @@ -1,182 +1,134 @@ -@namespace("org.ga4gh.methods") -protocol ReadMethods { - -import idl "common.avdl"; -import idl "methods.avdl"; -import idl "reads.avdl"; - -/****************** /reads/search *********************/ -/** -This request maps to the body of `POST /reads/search` as JSON. - -If a reference is specified, all queried `ReadGroup`s must be aligned -to `ReferenceSet`s containing that same `Reference`. If no reference is -specified, all queried `ReadGroup`s must be aligned to the same `ReferenceSet`. 
-*/ -record SearchReadsRequest { - /** - The ReadGroups to search. At least one id must be specified. - */ - array readGroupIds; - - /** - The reference to query. Leaving blank returns results from all - references, including unmapped reads - this could be very large. - */ - union { null, string } referenceId = null; - - /** - The start position (0-based) of this query. - If a reference is specified, this defaults to 0. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - */ - union { null, long } start = null; - - /** - The end position (0-based, exclusive) of this query. - If a reference is specified, this defaults to the - reference's length. - */ - union { null, long } end = null; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/reads.proto"; + +service ReadService { + // Gets a list of `ReadGroupSet` matching the search criteria. + // + // `POST /readgroupsets/search` must accept a JSON version of + // `SearchReadGroupSetsRequest` as the post body and will return a JSON + // version of `SearchReadGroupSetsResponse`. + rpc SearchReadGroupSets(SearchReadGroupSetsRequest) + returns (SearchReadGroupSetsResponse); + + // Gets a `ReadGroupSet` by ID. + // + // `GET /readgroupsets/{read_group_set_id}` will return a JSON version of + // `ReadGroupSet`. + rpc GetReadGroupSet(GetReadGroupSetRequest) returns (ReadGroupSet); + + // Gets a list of `ReadAlignment`s for one or more `ReadGroup`s. 
+ // + // `searchReads` operates over a genomic coordinate space of reference sequence + // and position defined by the `Reference`s to which the requested `ReadGroup`s are + // aligned. + // + // If a target positional range is specified, search returns all reads whose + // alignment to the reference genome *overlap* the range. A query which specifies + // only read group IDs yields all reads in those read groups, including unmapped + // reads. + // + // All reads returned (including reads on subsequent pages) are ordered by genomic + // coordinate (by reference sequence, then position). Reads with equivalent genomic + // coordinates are returned in an unspecified order. This order must be consistent + // for a given repository, such that two queries for the same content (regardless + // of page size) yield reads in the same order across their respective streams of + // paginated responses. + // + // `POST /reads/search` must accept a JSON version of `SearchReadsRequest` as + // the post body and will return a JSON version of `SearchReadsResponse`. + rpc SearchReads(SearchReadsRequest) returns (SearchReadsResponse); } -/** This is the response from `POST /reads/search` expressed as JSON. */ -record SearchReadsResponse { - /** - The list of matching alignment records, sorted by position. - Unmapped reads, which have no position, are returned last. - */ - array alignments = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// ****************** /readgroupsets *********************/ + +// This request maps to the body of `POST /readgroupsets/search` as JSON. +// +// TODO: Factor this out to a common API patterns section. 
+// - If searching by a resource ID, and that resource is not found, the method +// will return a `404` HTTP status code (`NOT_FOUND`). +// - If searching by other attributes, e.g. `name`, and no matches are found, the +// method will return a `200` HTTP status code (`OK`) with an empty result list. +message SearchReadGroupSetsRequest { + // The dataset to search. + string dataset_id = 1; + + // Only return read group sets with this name (case-sensitive, exact match). + string name = 2; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 4; } -/** -Gets a list of `ReadAlignment`s for one or more `ReadGroup`s. - -`searchReads` operates over a genomic coordinate space of reference sequence -and position defined by the `Reference`s to which the requested `ReadGroup`s are -aligned. - -If a target positional range is specified, search returns all reads whose -alignment to the reference genome *overlap* the range. A query which specifies -only read group IDs yields all reads in those read groups, including unmapped -reads. - -All reads returned (including reads on subsequent pages) are ordered by genomic -coordinate (by reference sequence, then position). Reads with equivalent genomic -coordinates are returned in an unspecified order. This order must be consistent -for a given repository, such that two queries for the same content (regardless -of page size) yield reads in the same order across their respective streams of -paginated responses. - -`POST /reads/search` must accept a JSON version of `SearchReadsRequest` as -the post body and will return a JSON version of `SearchReadsResponse`. 
-*/ -SearchReadsResponse searchReads( - /** This request maps to the body of `POST /reads/search` as JSON. */ - SearchReadsRequest request) throws GAException; - -/****************** /readgroupsets/search *********************/ -/** This request maps to the body of `POST /readgroupsets/search` as JSON. - -TODO: Factor this out to a common API patterns section. -- If searching by a resource ID, and that resource is not found, the method -will return a `404` HTTP status code (`NOT_FOUND`). -- If searching by other attributes, e.g. `name`, and no matches are found, the -method will return a `200` HTTP status code (`OK`) with an empty result list. -*/ -record SearchReadGroupSetsRequest { - /** - The dataset to search. - */ - string datasetId; - - /** - Only return read group sets with this name (case-sensitive, exact match). - */ - union { null, string } name = null; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +// This is the response from `POST /readgroupsets/search` expressed as JSON. +message SearchReadGroupSetsResponse { + // The list of matching read group sets. + repeated ReadGroupSet read_group_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// This request maps to the URL `GET /readgroupsets/{read_group_set_id}`. +message GetReadGroupSetRequest { + // The ID of the `ReadGroupSet` to be retrieved. 
+ string read_group_set_id = 1; } -/** This is the response from `POST /readgroupsets/search` expressed as JSON. */ -record SearchReadGroupSetsResponse { - /** The list of matching read group sets. */ - array readGroupSets = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// ****************** /reads ********************* +// This request maps to the body of `POST /reads/search` as JSON. +// +// If a reference is specified, all queried `ReadGroup`s must be aligned +// to `ReferenceSet`s containing that same `Reference`. If no reference is +// specified, all `ReadGroup`s must be aligned to the same `ReferenceSet`. +message SearchReadsRequest { + // The ReadGroups to search. At least one id must be specified. + repeated string read_group_ids = 1; + + // The reference to query. Leaving blank returns results from all + // references, including unmapped reads - this could be very large. + string reference_id = 2; + + // The start position (0-based) of this query. + // If a reference is specified, this defaults to 0. + // Genomic positions are non-negative integers less than reference length. + // Requests spanning the join of circular genomes are represented as + // two requests one on each side of the join (position 0). + int64 start = 3; + + // The end position (0-based, exclusive) of this query. + // If a reference is specified, this defaults to the + // reference's length. + int64 end = 4; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 5; + + // The continuation token, which is used to page through large result sets. 
+ // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 6; } -/** -Gets a list of `ReadGroupSet` matching the search criteria. - -`POST /readgroupsets/search` must accept a JSON version of -`SearchReadGroupSetsRequest` as the post body and will return a JSON -version of `SearchReadGroupSetsResponse`. -*/ -SearchReadGroupSetsResponse searchReadGroupSets( - /** - This request maps to the body of `POST /readgroupsets/search` as JSON. - */ - SearchReadGroupSetsRequest request) throws GAException; - -/**************** /readgroupsets/{id} *******************/ -/** -Gets a `org.ga4gh.models.ReadGroupSet` by ID. -`GET /readgroupsets/{id}` will return a JSON version of `ReadGroupSet`. -*/ -org.ga4gh.models.ReadGroupSet getReadGroupSet( - /** - The ID of the `ReadGroupSet`. - */ - string id) throws GAException; - -/**************** /readgroups/{id} *******************/ -/** -Gets a `org.ga4gh.models.ReadGroup` by ID. -`GET /readgroups/{id}` will return a JSON version of `ReadGroup`. -*/ -org.ga4gh.models.ReadGroup getReadGroup( - /** - The ID of the `ReadGroup`. - */ - string id) throws GAException; +// This is the response from `POST /reads/search` expressed as JSON. +message SearchReadsResponse { + // The list of matching alignment messages, sorted by position. + // Unmapped reads, which have no position, are returned last. + repeated ReadAlignment alignments = 1; + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. 
+ string next_page_token = 2; } diff --git a/src/main/proto/ga4gh/reads.proto b/src/main/proto/ga4gh/reads.proto index 3cd18b53..cf7f55bb 100644 --- a/src/main/proto/ga4gh/reads.proto +++ b/src/main/proto/ga4gh/reads.proto @@ -1,278 +1,229 @@ -@namespace("org.ga4gh.models") +// This file defines the objects used to represent reads and alignments, most +// importantly +// ReadGroupSet, ReadGroup, and ReadAlignment. +// See {TODO: LINK TO READS OVERVIEW} for more information. -/** -This file defines the objects used to represent a reads and alignments, most importantly -ReadGroupSet, ReadGroup, and ReadAlignment. -See {TODO: LINK TO READS OVERVIEW} for more information. -*/ -protocol Reads { +syntax = "proto3"; -import idl "common.avdl"; -import idl "metadata.avdl"; +package ga4gh; -/** -Program can be used to track the provenance of how read data was generated. -*/ -record Program { - /** The command line used to run this program. */ - union { null, string } commandLine = null; +import "ga4gh/common.proto"; +import "ga4gh/metadata.proto"; +import "google/protobuf/struct.proto"; - /** The user specified ID of the program. */ - union { null, string } id = null; +// ReadStats can be used to provide summary statistics about read data. +message ReadStats { + // The number of aligned reads. + int64 aligned_read_count = 1; - /** The name of the program. */ - union { null, string } name = null; + // The number of unaligned reads. + int64 unaligned_read_count = 2; - /** The ID of the program run before this one. */ - union { null, string } prevProgramId = null; - - /** The version of the program run. */ - union { null, string } version = null; + // The total number of bases. + // This is equivalent to the sum of `alignedSequence.length` for all reads. + int64 base_count = 3; } -/** ReadStats can be used to provide summary statistics about read data. */ -record ReadStats { - /** The number of aligned reads.
*/ - union { null, long } alignedReadCount = null; +// A ReadGroup is a set of reads derived from one physical sequencing process. +message ReadGroup { + // The read group ID. + string id = 1; - /** The number of unaligned reads. */ - union { null, long } unalignedReadCount = null; + // The ID of the dataset this read group belongs to. + string dataset_id = 2; - /** - The total number of bases. - This is equivalent to the sum of `alignedSequence.length` for all reads. - */ - union { null, long } baseCount = null; -} + // The read group name. + string name = 3; -/** -A ReadGroup is a set of reads derived from one physical sequencing process. -*/ -record ReadGroup { + // The read group description. + string description = 4; - /** The read group ID. */ - string id; + // The sample this read group's data was generated from. + // Note: the current API does not have a rigorous definition of sample. + // Therefore, this field actually contains an arbitrary string, typically + // corresponding to the SM tag in a BAM file. + string sample_id = 5; - /** The ID of the dataset this read group belongs to. */ - union { null, string } datasetId = null; + // The experiment used to generate this read group. + Experiment experiment = 6; - /** The read group name. */ - union { null, string } name = null; + // The predicted insert size of this read group. + int32 predicted_insert_size = 7; - /** The read group description. */ - union { null, string } description = null; + // The time at which this read group was created in milliseconds from the + // epoch. + int64 created = 8; - /** - The sample this read group's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the SM tag in a - BAM file. - */ - union { null, string } sampleId; + // The time at which this read group was last updated in milliseconds + // from the epoch. 
+ int64 updated = 9; - /** The experiment used to generate this read group. */ - union { null, Experiment } experiment; + // Statistical data on reads in this read group. + ReadStats stats = 10; - /** The predicted insert size of this read group. */ - union { null, int } predictedInsertSize = null; + // Program can be used to track the provenance of how read data was generated. + message Program { + // The command line used to run this program. + string command_line = 1; - /** - The time at which this read group was created in milliseconds from the epoch. - */ - union { null, long } created = null; + // The user specified ID of the program. + string id = 2; - /** - The time at which this read group was last updated in milliseconds - from the epoch. - */ - union { null, long } updated = null; + // The name of the program. + string name = 3; - /** Statistical data on reads in this read group. */ - union { null, ReadStats } stats = null; + // The ID of the program run before this one. + string prev_program_id = 4; - /** The programs used to generate this read group. */ - array programs = []; + // The version of the program run. + string version = 5; + } + repeated Program programs = 11; // The programs used to generate this read group. - /** - The ID of the reference set to which the reads in this read group are aligned. - Required if there are any read alignments. - */ - union {null, string } referenceSetId = null; + // The ID of the reference set to which the reads in this read group are + // aligned. Required if there are any read alignments. + string reference_set_id = 12; - /** - A map of additional read group information. - */ - map> info = {}; + // A map of additional read group information. + map<string, google.protobuf.ListValue> info = 13; } -/** -A ReadGroupSet is a logical collection of ReadGroups. Typically one ReadGroupSet -represents all the reads from one experimental sample. -*/ -record ReadGroupSet { - /** The read group set ID. */ - string id; +// A ReadGroupSet is a logical collection of ReadGroups.
Typically one +// ReadGroupSet represents all the reads from one experimental sample. +message ReadGroupSet { + // The read group set ID. + string id = 1; - /** The ID of the dataset this read group set belongs to. */ - union { null, string } datasetId = null; + // The ID of the dataset this read group set belongs to. + string dataset_id = 2; - /** The read group set name. */ - union { null, string } name = null; + // The read group set name. + string name = 3; - /** Statistical data on reads in this read group set. */ - union { null, ReadStats } stats = null; + // Statistical data on reads in this read group set. + ReadStats stats = 4; - /** The read groups in this set. */ - array readGroups = []; + // The read groups in this set. + repeated ReadGroup read_groups = 5; // NB: we require that all readgroups in the set are mapped to the same // referenceSet. } -/** -A linear alignment describes the alignment of a read to a Reference, using a -position and CIGAR array. -*/ -record LinearAlignment { - /** The position of this alignment. */ - Position position; - - /** - The mapping quality of this alignment, meaning the likelihood that the read - maps to this position. - - Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to the - nearest integer. - */ - union { null, int } mappingQuality = null; - - /** - Represents the local alignment of this sequence (alignment matches, indels, etc) - versus the reference. - */ - array cigar = []; +// A linear alignment describes the alignment of a read to a Reference, using a +// position and CIGAR array. +message LinearAlignment { + // The position of this alignment. + Position position = 1; + + // The mapping quality of this alignment, meaning the likelihood that the read + // maps to this position. + // + // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to + // the nearest integer. 
+ int32 mapping_quality = 2; + + // Represents the local alignment of this sequence (alignment matches, indels, + // etc) + // versus the reference. + repeated CigarUnit cigar = 3; } -/** -Each read alignment describes an alignment with additional information -about the fragment and the read. A read alignment object is equivalent to a -line in a SAM file. -*/ -record ReadAlignment { - - /** - The read alignment ID. This ID is unique within the read group this - alignment belongs to. - - For performance reasons, this field may be omitted by a backend. - If provided, its intended use is to make caching and UI display easier for - genome browsers and other lightweight clients. - */ - union { null, string } id; - - /** - The ID of the read group this read belongs to. - (Every read must belong to exactly one read group.) - */ - string readGroupId; +// Each read alignment describes an alignment with additional information +// about the fragment and the read. A read alignment object is equivalent to a +// line in a SAM file. +message ReadAlignment { + // The read alignment ID. This ID is unique within the read group this + // alignment belongs to. + // + // For performance reasons, this field may be omitted by a backend. + // If provided, its intended use is to make caching and UI display easier for + // genome browsers and other lightweight clients. + string id = 1; + + // The ID of the read group this read belongs to. + // (Every read must belong to exactly one read group.) + string read_group_id = 2; // fragment attributes - /** The fragment name. Equivalent to QNAME (query template name) in SAM. */ - string fragmentName; + // The fragment name. Equivalent to QNAME (query template name) in SAM. 
+ string fragment_name = 3; - /** - The orientation and the distance between reads from the fragment are - inconsistent with the sequencing protocol (inverse of SAM flag 0x2) - */ - union { null, boolean } improperPlacement = null; + // The orientation and the distance between reads from the fragment are + // inconsistent with the sequencing protocol (inverse of SAM flag 0x2). + bool improper_placement = 4; - /** The fragment is a PCR or optical duplicate (SAM flag 0x400). */ - union { null, boolean } duplicateFragment = null; + // The fragment is a PCR or optical duplicate (SAM flag 0x400). + bool duplicate_fragment = 5; - /** The number of reads in the fragment (extension to SAM flag 0x1) */ - union { null, int } numberReads = null; + // The number of reads in the fragment (extension to SAM flag 0x1). + int32 number_reads = 6; - /** The observed length of the fragment, equivalent to TLEN in SAM. */ - union { null, int } fragmentLength = null; + // The observed length of the fragment, equivalent to TLEN in SAM. + int32 fragment_length = 7; // read attributes - /** - The read ordinal in the fragment, 0-based and less than numberReads. This - field replaces SAM flag 0x40 and 0x80 and is intended to more cleanly - represent multiple reads per fragment. - */ - union { null, int } readNumber = null; - - /** The read fails platform or vendor quality checks (SAM flag 0x200). */ - union { null, boolean } failedVendorQualityChecks = null; - - /** - The alignment for this alignment record. This field will be null if the read - is unmapped. - */ - union { null, LinearAlignment } alignment = null; - - /** - Whether this alignment is secondary. Equivalent to SAM flag 0x100. - A secondary alignment represents an alternative to the primary alignment - for this read. Aligners may return secondary alignments if a read can map - ambiguously to multiple coordinates in the genome. 
- - By convention, each read has one and only one alignment where both - secondaryAlignment and supplementaryAlignment are false. - */ - union { null, boolean } secondaryAlignment = null; - - /** - Whether this alignment is supplementary. Equivalent to SAM flag 0x800. - Supplementary alignments are used in the representation of a chimeric - alignment. In a chimeric alignment, a read is split into multiple - linear alignments that map to different reference contigs. The first - linear alignment in the read will be designated as the representative alignment; - the remaining linear alignments will be designated as supplementary alignments. - These alignments may have different mapping quality scores. - - In each linear alignment in a chimeric alignment, the read will be hard clipped. - The `alignedSequence` and `alignedQuality` fields in the alignment record will - only represent the bases for its respective linear alignment. - */ - union { null, boolean } supplementaryAlignment = null; - - /** - The bases of the read sequence contained in this alignment record (equivalent - to SEQ in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the - excised sequence. - */ - union { null, string } alignedSequence = null; - - /** - The quality of the read sequence contained in this alignment record - (equivalent to QUAL in SAM). - - `alignedSequence` and `alignedQuality` may be shorter than the full read sequence - and quality. This will occur if the alignment is part of a chimeric alignment, - or if the read was trimmed. When this occurs, the CIGAR for this read will - begin/end with a hard clip operator that will indicate the length of the excised sequence. 
- */ - array alignedQuality = []; - - /** - The mapping of the primary alignment of the `(readNumber+1)%numberReads` - read in the fragment. It replaces mate position and mate strand in SAM. - */ - union { null, Position } nextMatePosition = null; - - /** - A map of additional read alignment information. - */ - map> info = {}; -} - + // The read ordinal in the fragment, 0-based and less than numberReads. This + // field replaces SAM flag 0x40 and 0x80 and is intended to more cleanly + // represent multiple reads per fragment. + int32 read_number = 8; + + // The read fails platform or vendor quality checks (SAM flag 0x200). + bool failed_vendor_quality_checks = 9; + + // The alignment for this alignment message. This field will be null if the + // read is unmapped. + LinearAlignment alignment = 10; + + // Whether this alignment is secondary. Equivalent to SAM flag 0x100. + // A secondary alignment represents an alternative to the primary alignment + // for this read. Aligners may return secondary alignments if a read can map + // ambiguously to multiple coordinates in the genome. + // + // By convention, each read has one and only one alignment where both + // secondaryAlignment and supplementaryAlignment are false. + bool secondary_alignment = 11; + + // Whether this alignment is supplementary. Equivalent to SAM flag 0x800. + // Supplementary alignments are used in the representation of a chimeric + // alignment. In a chimeric alignment, a read is split into multiple + // linear alignments that map to different reference contigs. The first + // linear alignment in the read will be designated as the representative + // alignment; the remaining linear alignments will be designated as + // supplementary alignments. These alignments may have different mapping + // quality scores. + // + // In each linear alignment in a chimeric alignment, the read will be hard + // clipped. 
The `alignedSequence` and `alignedQuality` fields in the alignment + // message will only represent the bases for its respective linear alignment. + bool supplementary_alignment = 12; + + // The bases of the read sequence contained in this alignment record + // (equivalent to SEQ in SAM). + // + // `alignedSequence` and `alignedQuality` may be shorter than the full read + // sequence and quality. This will occur if the alignment is part of a + // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR + // for this read will begin/end with a hard clip operator that will indicate + // the length of the excised sequence. + string aligned_sequence = 13; + + // The quality of the read sequence contained in this alignment message + // (equivalent to QUAL in SAM). + // + // `alignedSequence` and `alignedQuality` may be shorter than the full read + // sequence and quality. This will occur if the alignment is part of a + // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR + // for this read will begin/end with a hard clip operator that will indicate + // the length of the excised sequence. + repeated int32 aligned_quality = 14; + + // The mapping of the primary alignment of the `(readNumber+1)%numberReads` + // read in the fragment. It replaces mate position and mate strand in SAM. + Position next_mate_position = 15; + + // A map of additional read alignment information. 
+ map<string, google.protobuf.ListValue> info = 16; } diff --git a/src/main/proto/ga4gh/reference_service.proto b/src/main/proto/ga4gh/reference_service.proto index 4d9236e9..4a340579 100644 --- a/src/main/proto/ga4gh/reference_service.proto +++ b/src/main/proto/ga4gh/reference_service.proto @@ -1,229 +1,168 @@ -@namespace("org.ga4gh.methods") -protocol ReferenceMethods { - -import idl "common.avdl"; -import idl "methods.avdl"; -import idl "references.avdl"; - -/**************** /referencesets/search *******************/ -/** -This request maps to the body of `POST /referencesets/search` -as JSON. -*/ -record SearchReferenceSetsRequest { - /** - If not null, return the reference sets for which the - `md5checksum` matches this string (case-sensitive, exact match). - See `ReferenceSet::md5checksum` for details. - */ - union { null, string } md5checksum = null; - - /** - If not null, return the reference sets for which the `accession` - matches this string (case-sensitive, exact match). - */ - union { null, string } accession = null; - - /** - If not null, return the reference sets for which the `assemblyId` - matches this string (case-sensitive, exact match). - */ - union { null, string } assemblyId = null; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/references.proto"; + +service ReferenceService { + // Gets a list of `ReferenceSet` matching the search criteria. + // + // `POST /referencesets/search` must accept a JSON version of + // `SearchReferenceSetsRequest` as the post body and will return a JSON + // version of `SearchReferenceSetsResponse`.
+ rpc SearchReferenceSets(SearchReferenceSetsRequest) + returns (SearchReferenceSetsResponse); + + // Gets a `ReferenceSet` by ID. + // + // `GET /referencesets/{reference_set_id}` will return a JSON version of + // `ReferenceSet`. + rpc GetReferenceSet(GetReferenceSetRequest) returns (ReferenceSet); + + // Gets a list of `Reference` matching the search criteria. + // + // `POST /references/search` must accept a JSON version of + // `SearchReferencesRequest` as the post body and will return a JSON + // version of `SearchReferencesResponse`. + rpc SearchReferences(SearchReferencesRequest) + returns (SearchReferencesResponse); + + // Gets a `Reference` by ID. + // + // `GET /references/{reference_id}` will return a JSON version of + // `Reference`. + rpc GetReference(GetReferenceRequest) returns (Reference); + + // Lists `Reference` bases by ID and optional range. + // + // `GET /references/{reference_id}/bases` will return a JSON version of + // `ListReferenceBasesResponse`. + rpc ListReferenceBases(ListReferenceBasesRequest) + returns (ListReferenceBasesResponse); } -/** -This is the response from `POST /referencesets/search` -expressed as JSON. -*/ -record SearchReferenceSetsResponse { - /** The list of matching reference sets. */ - array referenceSets = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// **************** /referencesets ******************* +// This request maps to the body of `POST /referencesets/search` as JSON. +message SearchReferenceSetsRequest { + // If specified, return the reference sets for which the + // `md5checksum` matches this string (case-sensitive, exact match). + // See `ReferenceSet::md5checksum` for details.
+ string md5checksum = 1; + + // If specified, return the reference sets for which the `accession` + // matches this string (case-sensitive, exact match). + string accession = 2; + + // If specified, return the reference sets for which the `assemblyId` + // matches this string (case-sensitive, exact match). + string assembly_id = 3; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 4; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 5; } -/** -Gets a list of `ReferenceSet` matching the search criteria. - -`POST /referencesets/search` must accept a JSON version of -`SearchReferenceSetsRequest` as the post body and will return a JSON -version of `SearchReferenceSetsResponse`. -*/ -SearchReferenceSetsResponse searchReferenceSets( - /** - This request maps to the body of `POST /referencesets/search` - as JSON. - */ - SearchReferenceSetsRequest request) throws GAException; - -/**************** /referencesets/{id} *******************/ -/** -Gets a `ReferenceSet` by ID. -`GET /referencesets/{id}` will return a JSON version of `ReferenceSet`. -*/ -org.ga4gh.models.ReferenceSet getReferenceSet( - /** - The ID of the `ReferenceSet`. - */ - string id) throws GAException; - -/**************** /references/search *******************/ -/** -This request maps to the body of `POST /references/search` -as JSON. -*/ -record SearchReferencesRequest { - /** - The `ReferenceSet` to search. - */ - string referenceSetId; - - /** - If not null, return the references for which the - `md5checksum` matches this string (case-sensitive, exact match). - See `ReferenceSet::md5checksum` for details.
- */ - union { null, string } md5checksum = null; - - /** - If not null, return the references for which the `accession` - matches this string (case-sensitive, exact match). - */ - union { null, string } accession = null; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +// This is the response from `POST /referencesets/search` expressed as JSON. +message SearchReferenceSetsResponse { + // The list of matching reference sets. + repeated ReferenceSet reference_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// This request maps to the URL `GET /referencesets/{reference_set_id}`. +message GetReferenceSetRequest { + // The ID of the `ReferenceSet` to be retrieved. + string reference_set_id = 1; } -/** -This is the response from `POST /references/search` expressed as JSON. -*/ -record SearchReferencesResponse { - /** The list of matching references. */ - array references = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// **************** /references ******************* +// This request maps to the body of `POST /references/search` as JSON. +message SearchReferencesRequest { + // The `ReferenceSet` to search. 
+ string reference_set_id = 1; + + // If specified, return the references for which the + // `md5checksum` matches this string (case-sensitive, exact match). + // See `ReferenceSet::md5checksum` for details. + string md5checksum = 2; + + // If specified, return the references for which the `accession` + // matches this string (case-sensitive, exact match). + string accession = 3; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 4; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 5; } -/** -Gets a list of `Reference` matching the search criteria. - -`POST /references/search` must accept a JSON version of -`SearchReferencesRequest` as the post body and will return a JSON -version of `SearchReferencesResponse`. -*/ -SearchReferencesResponse searchReferences( - /** - This request maps to the body of `POST /references/search` - as JSON. - */ - SearchReferencesRequest request) throws GAException; - -/**************** /references/{id} *******************/ -/** -Gets a `Reference` by ID. -`GET /references/{id}` will return a JSON version of `Reference`. -*/ -org.ga4gh.models.Reference getReference( - /** - The ID of the `Reference`. - */ - string id) throws GAException; - -/**************** /references/{id}/bases *******************/ -/** -The query parameters for a request to `GET /references/{id}/bases`, for -example: - -`GET /references/{id}/bases?start=100&end=200` -*/ -record ListReferenceBasesRequest { - /** - The start position (0-based) of this query. Defaults to 0. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). 
- */ - long start = 0; - - /** - The end position (0-based, exclusive) of this query. Defaults - to the length of this `Reference`. - */ - union { null, long } end = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +// This is the response from `POST /references/search` expressed as JSON. +message SearchReferencesResponse { + // The list of matching references. + repeated Reference references = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; } -/** The response from `GET /references/{id}/bases` expressed as JSON. */ -record ListReferenceBasesResponse { - /** - The offset position (0-based) of the given sequence from the start of this - `Reference`. This value will differ for each page in a paginated request. - */ - long offset = 0; - - /** - A substring of the bases that make up this reference. Bases are represented - as IUPAC-IUB codes; this string matches the regexp `[ACGTMRWSYKVHDBN]*`. - */ - string sequence; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// This request maps to the URL `GET /references/{reference_id}`. +message GetReferenceRequest { + // The ID of the `Reference` to be retrieved. + string reference_id = 1; } -/** -Lists `Reference` bases by ID and optional range. -`GET /references/{id}/bases` will return a JSON version of -`ListReferenceBasesResponse`. 
-*/ -ListReferenceBasesResponse getReferenceBases( - /** The ID of the `Reference`. */ - string id, - /** Additional request parameters to restrict the query. */ - ListReferenceBasesRequest request) throws GAException; +// The parameters for a `GET /references/{reference_id}/bases` request. When +// sending a request, the `reference_id` should be encoded in the URL path, +// while all other parameters should be encoded as URL query parameters. +// +// For example: `GET /references/{reference_id}/bases?start=100&end=200` +message ListReferenceBasesRequest { + // The ID of the `Reference` to be retrieved. + string reference_id = 1; + + // The start position (0-based) of this query. Defaults to 0. + // Genomic positions are non-negative integers less than reference length. + // Requests spanning the join of circular genomes are represented as + // two requests one on each side of the join (position 0). + int64 start = 2; + + // The end position (0-based, exclusive) of this query. Defaults + // to the length of this `Reference`. + int64 end = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 4; +} + +// The response from `GET /references/{reference_id}/bases` expressed as JSON. +message ListReferenceBasesResponse { + // The offset position (0-based) of the given sequence from the start of this + // `Reference`. This value will differ for each page in a paginated request. + int64 offset = 1; + + // A substring of the bases that make up this reference. Bases are represented + // as IUPAC-IUB codes; this string matches the regexp `[ACGTMRWSYKVHDBN]*`. + string sequence = 2; + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results.
+ string next_page_token = 3; } diff --git a/src/main/proto/ga4gh/references.proto b/src/main/proto/ga4gh/references.proto index c1805dfe..fb8fc4df 100644 --- a/src/main/proto/ga4gh/references.proto +++ b/src/main/proto/ga4gh/references.proto @@ -1,130 +1,99 @@ -@namespace("org.ga4gh.models") - -/** -Defines types used by the GA4GH References API. -*/ -protocol References { - -import idl "common.avdl"; - -/** -A `Reference` is a canonical assembled contig, intended to act as a -reference coordinate space for other genomic annotations. A single -`Reference` might represent the human chromosome 1, for instance. - -`Reference`s are designed to be immutable. -*/ -record Reference { - - /** - The reference ID. Unique within the repository. - */ - string id; - - /** The length of this reference's sequence. */ - long length; - - /** - The MD5 checksum uniquely representing this `Reference` as a lower-case - hexadecimal string, calculated as the MD5 of the upper-case sequence - excluding all whitespace characters (this is equivalent to SQ:M5 in SAM). - */ - string md5checksum; - - /** - The name of this reference. (e.g. '22'). - */ - string name; - - /** - The URI from which the sequence was obtained. Specifies a FASTA format - file/string with one name, sequence pair. In most cases, clients should call - the `getReferenceBases()` method to obtain sequence bases for a `Reference` - instead of attempting to retrieve this URI. - */ - union { null, string } sourceURI = null; - - /** - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) which must include - a version number, e.g. `GCF_000001405.26`. - */ - array sourceAccessions; - - /** - A sequence X is said to be derived from source sequence Y, if X and Y - are of the same length and the per-base sequence divergence at A/C/G/T bases - is sufficiently small. 
Two sequences derived from the same official - sequence share the same coordinates and annotations, and - can be replaced with the official sequence for certain use cases. - */ - boolean isDerived = false; - - /** - The `sourceDivergence` is the fraction of non-indel bases that do not match the - reference this record was derived from. - */ - union { null, float } sourceDivergence = null; - - /** ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). */ - union { null, int } ncbiTaxonId = null; - +syntax = "proto3"; + +package ga4gh; + +// A `Reference` is a canonical assembled contig, intended to act as a +// reference coordinate space for other genomic annotations. A single +// `Reference` might represent the human chromosome 1, for instance. +// +// `Reference`s are designed to be immutable. +message Reference { + // The reference ID. Unique within the repository. + string id = 1; + + // The length of this reference's sequence. + int64 length = 2; + + // The MD5 checksum uniquely representing this `Reference` as a lower-case + // hexadecimal string, calculated as the MD5 of the upper-case sequence + // excluding all whitespace characters (this is equivalent to SQ:M5 in SAM). + string md5checksum = 3; + + // The name of this reference. (e.g. '22'). + string name = 4; + + // The URI from which the sequence was obtained. Specifies a FASTA format + // file/string with one name, sequence pair. In most cases, clients should + // call + // the `getReferenceBases()` method to obtain sequence bases for a `Reference` + // instead of attempting to retrieve this URI. + string source_uri = 5; + + // All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) which + // must include + // a version number, e.g. `GCF_000001405.26`. + repeated string source_accessions = 6; + + // A sequence X is said to be derived from source sequence Y, if X and Y + // are of the same length and the per-base sequence divergence at A/C/G/T + // bases + // is sufficiently small. 
Two sequences derived from the same official + // sequence share the same coordinates and annotations, and + // can be replaced with the official sequence for certain use cases. + bool is_derived = 7; + + // The `sourceDivergence` is the fraction of non-indel bases that do not match + // the + // reference this message was derived from. + float source_divergence = 8; + + // ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human). + int32 ncbi_taxon_id = 9; } -/** -A `ReferenceSet` is a set of `Reference`s which typically comprise a -reference assembly, such as `GRCh38`. A `ReferenceSet` defines a common -coordinate space for comparing reference-aligned experimental data. -*/ - -record ReferenceSet { - /** The reference set ID. Unique in the repository. */ - string id; - - /** The reference set name. */ - union { null, string } name = null; - - /** - Order-independent MD5 checksum which identifies this `ReferenceSet`. - - To compute this checksum, make a list of `Reference.md5checksum` for all - `Reference`s in this set. Then sort that list, and take the MD5 hash of - all the strings concatenated together. Express the hash as a lower-case - hexadecimal string. - */ - string md5checksum; - - /** - ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating - the species which this assembly is intended to model. Note that contained - `Reference`s may specify a different `ncbiTaxonId`, as assemblies may - contain reference sequences which do not belong to the modeled species, e.g. - EBV in a human reference genome. - */ - union { null, int } ncbiTaxonId = null; - - /** Optional free text description of this reference set. */ - union { null, string } description = null; +// A `ReferenceSet` is a set of `Reference`s which typically comprise a +// reference assembly, such as `GRCh38`. A `ReferenceSet` defines a common +// coordinate space for comparing reference-aligned experimental data. +message ReferenceSet { + // The reference set ID. 
Unique in the repository. + string id = 1; + + // The reference set name. + string name = 2; + + // Order-independent MD5 checksum which identifies this `ReferenceSet`. + // + // To compute this checksum, make a list of `Reference.md5checksum` for all + // `Reference`s in this set. Then sort that list, and take the MD5 hash of + // all the strings concatenated together. Express the hash as a lower-case + // hexadecimal string. + string md5checksum = 3; + + // ID from http://www.ncbi.nlm.nih.gov/taxonomy (e.g. 9606->human) indicating + // the species which this assembly is intended to model. Note that contained + // `Reference`s may specify a different `ncbiTaxonId`, as assemblies may + // contain reference sequences which do not belong to the modeled species, + // e.g. + // EBV in a human reference genome. + int32 ncbi_taxon_id = 4; + + // Optional free text description of this reference set. + string description = 5; // next information about the source of the sequences - /** Public id of this reference set, such as `GRCh37`. */ - union { null, string } assemblyId = null; - - /** Specifies a FASTA format file/string. */ - union { null, string } sourceURI = null; + // Public id of this reference set, such as `GRCh37`. + string assembly_id = 6; - /** - All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally - with a version number, e.g. `NC_000001.11`. - */ - array sourceAccessions; + // Specifies a FASTA format file/string. + string source_uri = 7; - /** - A reference set may be derived from a source if it contains - additional sequences, or some of the sequences within it are derived - (see the definition of `isDerived` in `Reference`). - */ - boolean isDerived = false; -} + // All known corresponding accession IDs in INSDC (GenBank/ENA/DDBJ) ideally + // with a version number, e.g. `NC_000001.11`. 
+ repeated string source_accessions = 8; + // A reference set may be derived from a source if it contains + // additional sequences, or some of the sequences within it are derived + // (see the definition of `isDerived` in `Reference`). + bool is_derived = 9; } diff --git a/src/main/proto/ga4gh/sequence_annotation_service.proto b/src/main/proto/ga4gh/sequence_annotation_service.proto index 82e74a50..ea21f3e9 100644 --- a/src/main/proto/ga4gh/sequence_annotation_service.proto +++ b/src/main/proto/ga4gh/sequence_annotation_service.proto @@ -1,166 +1,126 @@ -@namespace("org.ga4gh.methods") - -protocol SequenceAnnotationMethods { - - import idl "common.avdl"; - import idl "methods.avdl"; - import idl "sequenceAnnotations.avdl"; - - /****************** /featuresets/search *********************/ - /** This request maps to the body of `POST /featuresets/search` as JSON. */ - record SearchFeatureSetsRequest { - /** - The `Dataset` to search. - */ - string datasetId; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; - } - - /** This is the response from `POST /featuresets/search` expressed as JSON. */ - record SearchFeatureSetsResponse { - /** The list of matching feature sets. */ - array featureSets = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; - } - - /** - Gets a list of `FeatureSet` matching the search criteria. 
- - `POST /featuresets/search` must accept a JSON version of - `SearchFeatureSetsRequest` as the post body and will return a JSON version - of `SearchFeatureSetsResponse`. - */ - SearchFeatureSetsResponse searchFeatureSets( - /** This request maps to the body of `POST /featuresets/search` as JSON. */ - SearchFeatureSetsRequest request) throws GAException; - - /**************** /featuresets/{id} *******************/ - /** - Gets a `FeatureSet` by ID. - `GET /featuresets/{id}` will return a JSON version of `FeatureSet`. - */ - org.ga4gh.models.FeatureSet getFeatureSet( - /** - The ID of the `FeatureSet`. - */ - string id) throws GAException; - - - /****************** /features/search *****************/ - /** - This request maps to the body of `POST /features/search` as JSON. - */ - record SearchFeaturesRequest { - /** - The annotation set to search within. Either `featureSetId` or - `parentId` must be non-empty. - */ - union { null, string } featureSetId; - - /** - Restricts the search to direct children of the given parent `feature` - ID. Either `featureSetId` or `parentId` must be non-empty. - */ - union { null, string } parentId; - - /** - Only return features on the reference with this name - (matched to literal reference name as imported from the GFF3). - */ - string referenceName; - - /** - Required. The beginning of the window (0-based, inclusive) for which - overlapping features should be returned. Genomic positions are - non-negative integers less than reference length. Requests spanning the - join of circular genomes are represented as two requests one on each side - of the join (position 0). - */ - long start; - - /** - Required. The end of the window (0-based, exclusive) for which overlapping - features should be returned. - */ - long end; - - // TODO: To be replaced with a fully featured ontology search - // once the Metadata definitions are rounded out. 
- /** - If specified, this query matches only annotations whose `featureType` - matches one of the provided ontology terms. - */ - array featureTypes = []; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; - } - - /** This is the response from `POST /features/search` expressed as JSON. */ - record SearchFeaturesResponse { - /** - The list of matching annotations, sorted by start position. Annotations which - share a start position are returned in a deterministic order. - */ - array features = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; - } - - /** - Gets a list of `Feature` matching the search criteria. - - `POST /features/search` must accept a JSON version of - `SearchFeaturesRequest` as the post body and will return a JSON version of - `SearchFeaturesResponse`. - */ - SearchFeaturesResponse searchFeatures( - /** This request maps to the body of `POST /features/search` as JSON. */ - SearchFeaturesRequest request) throws GAException; - - - /**************** /features/{id} *******************/ - /** - Gets a `org.ga4gh.models.Feature` by ID. - `GET /features/{id}` will return a JSON version of `Feature`. - */ - org.ga4gh.models.Feature getFeature( - /** - The ID of the `Feature`. 
- */ - string id) throws GAException; +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/sequence_annotations.proto"; + +service SequenceAnnotationService { + // Gets a list of `FeatureSet` matching the search criteria. + // + // `POST /featuresets/search` must accept a JSON version of + // `SearchFeatureSetsRequest` as the post body and will return a JSON version + // of `SearchFeatureSetsResponse`. + rpc SearchFeatureSets(SearchFeatureSetsRequest) + returns (SearchFeatureSetsResponse); + + // Gets a `FeatureSet` by ID. + // + // `GET /featuresets/{id}` will return a JSON version of `FeatureSet`. + rpc GetFeatureSet(GetFeatureSetRequest) returns (FeatureSet); + + // Gets a list of `Feature` matching the search criteria. + // + // `POST /features/search` must accept a JSON version of + // `SearchFeaturesRequest` as the post body and will return a JSON version of + // `SearchFeaturesResponse`. + rpc SearchFeatures(SearchFeaturesRequest) returns (SearchFeaturesResponse); + + // Gets a `org.ga4gh.models.Feature` by ID. + // + // `GET /features/{id}` will return a JSON version of `Feature`. + rpc GetFeature(GetFeatureRequest) returns (Feature); +} + +/****************** /featuresets *********************/ +/** This request maps to the body of `POST /featuresets/search` as JSON. */ +message SearchFeatureSetsRequest { + // The `Dataset` to search. + string dataset_id = 1; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 2; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 3; +} + +// This is the response from `POST /featuresets/search` expressed as JSON. +message SearchFeatureSetsResponse { + // The list of matching feature sets. 
+ repeated FeatureSet feature_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} + +// This request maps to the URL `GET /featuresets/{id}`. +message GetFeatureSetRequest { + // The ID of the `FeatureSet` to be retrieved. + string feature_set_id = 1; +} + +/****************** /features *****************/ +// This request maps to the body of `POST /features/search` as JSON. +message SearchFeaturesRequest { + // The annotation set to search within. Either `feature_set_id` or + // `parent_id` must be non-empty. + string feature_set_id = 1; + + // Restricts the search to direct children of the given parent `feature` + // ID. Either `feature_set_id` or `parent_id` must be non-empty. + string parent_id = 2; + + // Only return features on the reference with this name + // (matched to literal reference name as imported from the GFF3). + string reference_name = 3; + + // Required. The beginning of the window (0-based, inclusive) for which + // overlapping features should be returned. Genomic positions are + // non-negative integers less than reference length. Requests spanning the + // join of circular genomes are represented as two requests one on each side + // of the join (position 0). + int64 start = 4; + + // Required. The end of the window (0-based, exclusive) for which overlapping + // features should be returned. + int64 end = 5; + + // TODO: To be replaced with a fully featured ontology search + // once the Metadata definitions are rounded out. + + // If specified, this query matches only annotations whose `feature_type` + // matches one of the provided ontology terms. + repeated string feature_types = 6; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. 
+ int32 page_size = 7; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 8; +} + +// This is the response from `POST /features/search` expressed as JSON. +message SearchFeaturesResponse { + // The list of matching annotations, sorted by start position. Annotations + // which share a start position are returned in a deterministic order. + repeated Feature features = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} +// This request maps to the URL `GET /features/{id}`. +message GetFeatureRequest { + // The ID of the `Feature` to be retrieved. + string feature_id = 1; } diff --git a/src/main/proto/ga4gh/sequence_annotations.proto b/src/main/proto/ga4gh/sequence_annotations.proto index 4e3b6f15..0cbe44a3 100644 --- a/src/main/proto/ga4gh/sequence_annotations.proto +++ b/src/main/proto/ga4gh/sequence_annotations.proto @@ -1,10 +1,17 @@ -@namespace("org.ga4gh.models") -/** +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/common.proto"; +import "ga4gh/metadata.proto"; +import "google/protobuf/struct.proto"; + +/* This protocol defines annotations on GA4GH genomic sequences It includes two types of annotations: continuous and discrete hierarchical. The discrete hierarchical annotations are derived from the Sequence Ontology -(SO) and GFF3 work +(SO) and GFF3 work http://www.sequenceontology.org/gff3.shtml @@ -16,120 +23,94 @@ The minimum requirement is to be able to accurately represent the current state of the art annotation data and the full SO model. Feature is the core generic record which corresponds to the a GFF3 record. 
*/ -protocol SequenceAnnotations { - - import idl "common.avdl"; - import idl "metadata.avdl"; - - /** - Type defining a collection of attributes associated with various protocol - records. Each attribute is a name that maps to an array of one or more - values. Values can be strings, external identifiers, or ontology terms. - Values should be split into the array elements instead of using a separator - syntax that needs to parsed. - */ - record Attributes { - map<array<union { string, ExternalIdentifier, OntologyTerm }>> vals = {}; + +// Type defining a collection of attributes associated with various protocol +// records. Each attribute is a name that maps to an array of one or more +// values. Values can be strings, external identifiers, or ontology terms. Values +// should be split into the array elements instead of using a separator syntax +// that needs to be parsed. +message Attributes { + message AttributeValue { + oneof value { + string string_value = 1; + ExternalIdentifier external_identifier = 2; + OntologyTerm ontology_term = 3; + } } - - /** - Node in the annotation graph that annotates a contiguous region of a - sequence. - */ - record Feature { - /** - Id of this annotation node. - */ - string id; - - /** - Parent Id of this node. Set to empty string if node has no parent. - */ - string parentId; - - /** - Ordered array of Child Ids of this node. - Since not all child nodes are ordered by genomic coordinates, - this can't always be reconstructed from parentId's of the children alone. - */ - - array<string> childIds = []; - - /** - Identifier for the containing feature set. - */ - string featureSetId; - - /** - The reference on which this feature occurs. - (e.g. `chr20` or `X`) - */ - string referenceName; - - /** - The start position at which this feature occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length.
- Features spanning the join of circular genomes are represented as - two features one on each side of the join (position 0). - */ - long start = 0; - - /** - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`. - */ - long end; - - /** - The strand on which the feature is present. - */ - Strand strand; - - /** - Feature that is annotated by this region. Normally, this will be a term in - the Sequence Ontology. - */ - OntologyTerm featureType; - - /** - Name/value attributes of the annotation. Attribute names follow the GFF3 - naming convention of reserved names starting with an upper cases - character, and user-define names start with lower-case. Most GFF3 - pre-defined attributes apply, the exceptions are ID and Parent, which are - defined as fields. Additional, the following attributes are added: - * Score - the GFF3 score column - * Phase - the GFF3 phase column for CDS features. - */ - Attributes attributes; + message AttributeValueList { + repeated AttributeValue values = 1; } + map<string, AttributeValueList> vals = 1; +} - /* - A set of sequence features annotations - */ - record FeatureSet { - /** The ID of this annotation set. */ - string id; - - /** The ID of the dataset this annotation set belongs to. */ - string datasetId; - - /** - The ID of the reference set which defines the coordinate-space for this - set of annotations. - */ - union { null, string } referenceSetId; - - /** The display name for this annotation set. */ - union { null, string } name = null; - - /** - The source URI describing the file from which this annotation set was - generated, if any. - */ - union { null, string } sourceURI = null; - - /** Remaining structured metadata key-value pairs. */ - map<array<string>> info = {}; - } +// Node in the annotation graph that annotates a contiguous region of a +// sequence. +message Feature { + // Id of this annotation node. + string id = 1; + + // Parent Id of this node.
Set to empty string if node has no parent. + string parent_id = 2; + + // Ordered array of Child Ids of this node. Since not all child nodes are + // ordered by genomic coordinates, this can't always be reconstructed from + // parent_id's of the children alone. + repeated string child_ids = 3; + + // Identifier for the containing feature set. + string feature_set_id = 4; + + // The reference on which this feature occurs (e.g. `chr20` or `X`). + string reference_name = 5; + + // The start position at which this feature occurs (0-based). This + // corresponds to the first base of the string of reference bases. Genomic + // positions are non-negative integers less than reference length. Features + // spanning the join of circular genomes are represented as two features one + // on each side of the join (position 0). + int64 start = 6; + + // The end position (exclusive), resulting in [start, end) closed-open + // interval. This is typically calculated by `start + + // reference_bases.length`. + int64 end = 7; + + // The strand on which the feature is present. + Strand strand = 8; + + // Feature that is annotated by this region. Normally, this will be a term + // in the Sequence Ontology. + OntologyTerm feature_type = 9; + + // Name/value attributes of the annotation. Attribute names follow the GFF3 + // naming convention of reserved names starting with an upper cases character, + // and user-define names start with lower-case. Most GFF3 pre-defined + // attributes apply, the exceptions are ID and Parent, which are defined as + // fields. Additional, the following attributes are added: + // * Score - the GFF3 score column + // * Phase - the GFF3 phase column for CDS features. + Attributes attributes = 10; +} + +// A set of sequence features annotations. +message FeatureSet { + // The ID of this annotation set. + string id = 1; + + // The ID of the dataset this annotation set belongs to. 
+ string dataset_id = 2; + + // The ID of the reference set which defines the coordinate-space for this + // set of annotations. + string reference_set_id = 3; + + // The display name for this annotation set. + string name = 4; + + // The source URI describing the file from which this annotation set was + // generated, if any. + string source_uri = 5; + + // Remaining structured metadata key-value pairs. + map<string, google.protobuf.ListValue> info = 6; } diff --git a/src/main/proto/ga4gh/variant_service.proto b/src/main/proto/ga4gh/variant_service.proto index 59e19b7b..11d277a4 100644 --- a/src/main/proto/ga4gh/variant_service.proto +++ b/src/main/proto/ga4gh/variant_service.proto @@ -1,215 +1,172 @@ -@namespace("org.ga4gh.methods") -protocol VariantMethods { - -import idl "methods.avdl"; -import idl "variants.avdl"; - -/****************** /variantsets/search *********************/ -/** This request maps to the body of `POST /variantsets/search` as JSON. */ -record SearchVariantSetsRequest { - /** - The `Dataset` to search. - */ - string datasetId; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +syntax = "proto3"; + +package ga4gh; + +import "ga4gh/variants.proto"; + +service VariantService { + // Gets a list of `VariantSet` matching the search criteria. + // + // `POST /variantsets/search` must accept a JSON version of + // `SearchVariantSetsRequest` as the post body and will return a JSON version + // of `SearchVariantSetsResponse`. + rpc SearchVariantSets(SearchVariantSetsRequest) + returns (SearchVariantSetsResponse); + + // Gets a `VariantSet` by ID.
+ // + // `GET /variantsets/{variant_set_id}` will return a JSON version of + // `VariantSet`. + rpc GetVariantSet(GetVariantSetRequest) returns (VariantSet); + + // Gets a list of `Variant` matching the search criteria. + // + // `POST /variants/search` must accept a JSON version of + // `SearchVariantsRequest` as the post body and will return a JSON version of + // `SearchVariantsResponse`. + rpc SearchVariants(SearchVariantsRequest) returns (SearchVariantsResponse); + + // Gets a `Variant` by ID. + // + // `GET /variants/{id}` will return a JSON version of `Variant`. + rpc GetVariant(GetVariantRequest) returns (Variant); + + // Gets a list of call sets matching the search criteria. + // + // `POST /callsets/search` must accept a JSON version of + // `SearchCallSetsRequest` as the post body and will return a JSON version of + // `SearchCallSetsResponse`. + rpc SearchCallSets(SearchCallSetsRequest) returns (SearchCallSetsResponse); + + // Gets a `CallSet` by ID. + // + // `GET /callsets/{id}` will return a JSON version of `CallSet`. + rpc GetCallSet(GetCallSetRequest) returns (CallSet); } -/** This is the response from `POST /variantsets/search` expressed as JSON. */ -record SearchVariantSetsResponse { - /** The list of matching variant sets. */ - array variantSets = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// ****************** /variantsets ********************* +// This request maps to the body of `POST /variantsets/search` as JSON. +message SearchVariantSetsRequest { + // The `Dataset` to search. + string dataset_id = 1; + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. 
+ int32 page_size = 2; + + // The continuation token, which is used to page through large result sets. To + // get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 3; } -/** -Gets a list of `VariantSet` matching the search criteria. - -`POST /variantsets/search` must accept a JSON version of -`SearchVariantSetsRequest` as the post body and will return a JSON version -of `SearchVariantSetsResponse`. -*/ -SearchVariantSetsResponse searchVariantSets( - /** This request maps to the body of `POST /variantsets/search` as JSON. */ - SearchVariantSetsRequest request) throws GAException; - -/**************** /variantsets/{id} *******************/ -/** -Gets a `VariantSet` by ID. -`GET /variantsets/{id}` will return a JSON version of `VariantSet`. -*/ -org.ga4gh.models.VariantSet getVariantSet( - /** - The ID of the `VariantSet`. - */ - string id) throws GAException; - -/****************** /variants/search *********************/ -/** This request maps to the body of `POST /variants/search` as JSON. */ -record SearchVariantsRequest { - /** - The `VariantSet` to search. - */ - string variantSetId; - - /** - Only return variant calls which belong to call sets with these IDs. - If an empty array, returns variants without any call objects. - If null, returns all variant calls. - */ - union { null, array } callSetIds = null; - - /** Required. Only return variants on this reference. */ - string referenceName; - - /** - Required. The beginning of the window (0-based, inclusive) for - which overlapping variants should be returned. - Genomic positions are non-negative integers less than reference length. - Requests spanning the join of circular genomes are represented as - two requests one on each side of the join (position 0). - */ - long start; - - /** - Required. The end of the window (0-based, exclusive) for which overlapping - variants should be returned. 
- */ - long end; - - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +// This is the response from `POST /variantsets/search` expressed as JSON. +message SearchVariantSetsResponse { + // The list of matching variant sets. + repeated VariantSet variant_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; } -/** This is the response from `POST /variants/search` expressed as JSON. */ -record SearchVariantsResponse { - /** - The list of matching variants. - If the `callSetId` field on the returned calls is not present, - the ordering of the call sets from a `SearchCallSetsRequest` - over the parent `VariantSet` is guaranteed to match the ordering - of the calls on each `Variant`. The number of results will also be - the same. - */ - array variants = []; - - /** - The continuation token, which is used to page through large result sets. - Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// This request maps to the URL `GET /variantsets/{id}`. +message GetVariantSetRequest { + // The ID of the `VariantSet` to be retrieved. + string variant_set_id = 1; } -/** -Gets a list of `Variant` matching the search criteria. 
- -`POST /variants/search` must accept a JSON version of `SearchVariantsRequest` -as the post body and will return a JSON version of `SearchVariantsResponse`. -*/ -SearchVariantsResponse searchVariants( - /** This request maps to the body of `POST /variants/search` as JSON. */ - SearchVariantsRequest request) throws GAException; - -/**************** /variants/{id} *******************/ -/** -Gets a `Variant` by ID. -`GET /variants/{id}` will return a JSON version of `Variant`. -*/ -org.ga4gh.models.Variant getVariant( - /** - The ID of the `Variant`. - */ - string id) throws GAException; - -/****************** /callsets/search *********************/ -/** This request maps to the body of `POST /callsets/search` as JSON. */ -record SearchCallSetsRequest { - /** - The VariantSet to search. - */ - string variantSetId; - - /** - Only return call sets with this name (case-sensitive, exact match). - */ - union { null, string } name = null; +// ****************** /variants ********************* +// This request maps to the body of `POST /variants/search` as JSON. +message SearchVariantsRequest { + // The `VariantSet` to search. + string variant_set_id = 1; + + // Only return variant calls which belong to call sets with these IDs. + // If unspecified, return all variants and no variant call objects. + repeated string call_set_ids = 2; + + // Required. Only return variants on this reference. + string reference_name = 3; + + // Required. The beginning of the window (0-based, inclusive) for + // which overlapping variants should be returned. + // Genomic positions are non-negative integers less than reference length. + // Requests spanning the join of circular genomes are represented as + // two requests one on each side of the join (position 0). + int64 start = 4; + + // Required. The end of the window (0-based, exclusive) for which overlapping + // variants should be returned. + int64 end = 5; + + // Specifies the maximum number of results to return in a single page. 
+ // If unspecified, a system default will be used. + int32 page_size = 6; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 7; +} - // TODO: Add more ways to search by other metadata +// This is the response from `POST /variants/search` expressed as JSON. +message SearchVariantsResponse { + // The list of matching variants. + // If the `callSetId` field on the returned calls is not present, + // the ordering of the call sets from a `SearchCallSetsRequest` + // over the parent `VariantSet` is guaranteed to match the ordering + // of the calls on each `Variant`. The number of results will also be + // the same. + repeated Variant variants = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. This field will be empty if there aren't any additional results. + string next_page_token = 2; +} - /** - Specifies the maximum number of results to return in a single page. - If unspecified, a system default will be used. - */ - union { null, int } pageSize = null; - - /** - The continuation token, which is used to page through large result sets. - To get the next page of results, set this parameter to the value of - `nextPageToken` from the previous response. - */ - union { null, string } pageToken = null; +// This request maps to the URL `GET /variants/{id}`. +message GetVariantRequest { + // The ID of the `Variant` to be retrieved. + string variant_id = 1; } -/** This is the response from `POST /callsets/search` expressed as JSON. */ -record SearchCallSetsResponse { - /** The list of matching call sets. */ - array callSets = []; - - /** - The continuation token, which is used to page through large result sets. 
- Provide this value in a subsequent request to return the next page of - results. This field will be empty if there aren't any additional results. - */ - union { null, string } nextPageToken = null; +// ****************** /callsets ********************* +// This request maps to the body of `POST /callsets/search` as JSON. +message SearchCallSetsRequest { + // The VariantSet to search. + string variant_set_id = 1; + + // Only return call sets with this name (case-sensitive, exact match). + string name = 2; + + // TODO: Add more ways to search by other metadata + + // Specifies the maximum number of results to return in a single page. + // If unspecified, a system default will be used. + int32 page_size = 3; + + // The continuation token, which is used to page through large result sets. + // To get the next page of results, set this parameter to the value of + // `next_page_token` from the previous response. + string page_token = 4; } -/** -Gets a list of `CallSet` matching the search criteria. - -`POST /callsets/search` must accept a JSON version of `SearchCallSetsRequest` -as the post body and will return a JSON version of `SearchCallSetsResponse`. -*/ -SearchCallSetsResponse searchCallSets( - /** This request maps to the body of `POST /callsets/search` as JSON. */ - SearchCallSetsRequest request) throws GAException; - -/**************** /callsets/{id} *******************/ -/** -Gets a `CallSet` by ID. -`GET /callsets/{id}` will return a JSON version of `CallSet`. -*/ -org.ga4gh.models.CallSet getCallSet( - /** - The ID of the `CallSet`. - */ - string id) throws GAException; +// This is the response from `POST /callsets/search` expressed as JSON. +message SearchCallSetsResponse { + // The list of matching call sets. + repeated CallSet call_sets = 1; + + // The continuation token, which is used to page through large result sets. + // Provide this value in a subsequent request to return the next page of + // results. 
This field will be empty if there aren't any additional results. + string next_page_token = 2; +} +// This request maps to the URL `GET /callsets/{call_set_id}`. +message GetCallSetRequest { + // The ID of the `CallSet` to be retrieved. + string call_set_id = 1; } diff --git a/src/main/proto/ga4gh/variants.proto b/src/main/proto/ga4gh/variants.proto index 3e72a6f3..7dc46826 100644 --- a/src/main/proto/ga4gh/variants.proto +++ b/src/main/proto/ga4gh/variants.proto @@ -1,247 +1,196 @@ -@namespace("org.ga4gh.models") - -/** -This file defines the objects used to represent variant calls, most importantly -VariantSet, Variant, and Call. -See {TODO: LINK TO VARIANTS OVERVIEW} for more information. -*/ -protocol Variants { - -import idl "common.avdl"; - -/** Optional metadata associated with a variant set. */ -record VariantSetMetadata { - /** The top-level key. */ - string key; - - /** The value field for simple metadata. */ - string value; - - /** - User-provided ID field, not enforced by this API. - Two or more pieces of structured metadata with identical - id and key fields are considered equivalent. - `FIXME: If it's not enforced, then why can't it be null?` - */ - string id; - - /** The type of data. */ - string type; - - /** - The number of values that can be included in a field described by this - metadata. - */ - string number; - - /** A textual description of this metadata. */ - string description; - - /** Remaining structured metadata key-value pairs. */ - map> info = {}; -} +// This file defines the objects used to represent variant calls, most importantly +// VariantSet, Variant, and Call. +// See {TODO: LINK TO VARIANTS OVERVIEW} for more information. + +syntax = "proto3"; + +package ga4gh; + +import "google/protobuf/struct.proto"; + +// This metadata represents VCF header information. +message VariantSetMetadata { + // The top-level key. + string key = 1; + + // The value field for simple metadata. 
+ string value = 2; + + // User-provided ID field, not enforced by this API. + // Two or more pieces of structured metadata with identical + // id and key fields are considered equivalent. + // FIXME: If it's not enforced, then why can't it be null? + string id = 3; + + // The type of data. + string type = 4; + + // The number of values that can be included in a field described by this + // metadata. + string number = 5; + + // A textual description of this metadata. + string description = 6; -/** -A VariantSet is a collection of variants and variant calls intended to be analyzed together. -*/ -record VariantSet { - /** The variant set ID. */ - string id; - - /** The variant set name. */ - union { null, string } name = null; - - /** The ID of the dataset this variant set belongs to. */ - string datasetId; - - /** - The ID of the reference set that describes the sequences used by the variants in this set. - */ - string referenceSetId; - - /** - Optional metadata associated with this variant set. - This array can be used to store information about the variant set, such as information found - in VCF header fields, that isn't already available in first class fields such as "name". - */ - array metadata = []; + // Remaining structured metadata key-value pairs. + map<string, google.protobuf.ListValue> info = 7; } -/** -A CallSet is a collection of calls that were generated by the same analysis of the same sample. -*/ -record CallSet { - - /** The call set ID. */ - string id; - - /** The call set name. */ - union { null, string } name = null; - - /** - The sample this call set's data was generated from. - Note: the current API does not have a rigorous definition of sample. Therefore, this - field actually contains an arbitrary string, typically corresponding to the sampleId - field in the read groups used to generate this call set. - */ - union { null, string } sampleId; - - /** The IDs of the variant sets this call set has calls in.
*/ - array variantSetIds = []; - - /** The date this call set was created in milliseconds from the epoch. */ - union { null, long } created = null; - - /** - The time at which this call set was last updated in - milliseconds from the epoch. - */ - union { null, long } updated = null; - - /** - A map of additional call set information. - */ - map> info = {}; +// A VariantSet is a collection of variants and variant calls intended to be +// analyzed together. +message VariantSet { + // The variant set ID. + string id = 1; + + // The variant set name. + string name = 2; + + // The ID of the dataset this variant set belongs to. + string dataset_id = 3; + + // The ID of the reference set that describes the sequences used by the + // variants in this set. + string reference_set_id = 4; + + // Optional metadata associated with this variant set. + // This array can be used to store information about the variant set, such as + // information found in VCF header fields, that isn't already available in + // first class fields such as "name". + repeated VariantSetMetadata metadata = 5; } -/** -A `Call` represents the determination of genotype with respect to a -particular `Variant`. - -It may include associated information such as quality -and phasing. For example, a call might assign a probability of 0.32 to -the occurrence of a SNP named rs1234 in a call set with the name NA12345. -*/ -record Call { - - /** - The name of the call set this variant call belongs to. - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - */ - union { null, string } callSetName = null; - - /** - The ID of the call set this variant call belongs to. 
- - If this field is not present, the ordering of the call sets from a - `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match - the ordering of the calls on this `Variant`. - The number of results will also be the same. - */ - union { null, string} callSetId = null; - - /** - The genotype of this variant call. - - A 0 value represents the reference allele of the associated `Variant`. Any - other value is a 1-based index into the alternate alleles of the associated - `Variant`. - - If a variant had a referenceBases field of "T", an alternateBases - value of ["A", "C"], and the genotype was [2, 1], that would mean the call - represented the heterozygous value "CA" for this variant. If the genotype - was instead [0, 1] the represented value would be "TA". Ordering of the - genotype values is important if the phaseset field is present. - */ - array genotype = []; - - /** - If this field is not null, this variant call's genotype ordering implies - the phase of the bases and is consistent with any other variant calls on - the same contig which have the same phaseset string. - */ - union { null, string } phaseset = null; - - /** - The genotype likelihoods for this variant call. Each array entry - represents how likely a specific genotype is for this call as - log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The - value ordering is defined by the GL tag in the VCF spec. - */ - array genotypeLikelihood = []; - - /** - A map of additional variant call information. - */ - map> info = {}; +// A CallSet is a collection of calls that were generated by the same analysis +// of the same sample. +message CallSet { + // The call set ID. + string id = 1; + + // The call set name. + string name = 2; + + // The sample this call set's data was generated from. + // Note: the current API does not have a rigorous definition of sample. 
+ // Therefore, this field actually contains an arbitrary string, typically + // corresponding to the sampleId field in the read groups used to generate + // this call set. + string sample_id = 3; + + // The IDs of the variant sets this call set has calls in. + repeated string variant_set_ids = 4; + + // The date this call set was created in milliseconds from the epoch. + int64 created = 5; + + // The time at which this call set was last updated in + // milliseconds from the epoch. + int64 updated = 6; + + // A map of additional call set information. + map<string, google.protobuf.ListValue> info = 7; } -/** -A `Variant` represents a change in DNA sequence relative to some reference. -For example, a variant could represent a SNP or an insertion. -Variants belong to a `VariantSet`. -This is equivalent to a row in VCF. -*/ -record Variant { - - /** The variant ID. */ - string id; - - /** - The ID of the `VariantSet` this variant belongs to. This transitively defines - the `ReferenceSet` against which the `Variant` is to be interpreted. - */ - string variantSetId; - - /** Names for the variant, for example a RefSNP ID. */ - array names = []; - - /** The date this variant was created in milliseconds from the epoch. */ - union { null, long } created = null; - - /** - The time at which this variant was last updated in - milliseconds from the epoch. - */ - union { null, long } updated = null; - - /** - The reference on which this variant occurs. - (e.g. `chr20` or `X`) - */ - string referenceName; - - /** - The start position at which this variant occurs (0-based). - This corresponds to the first base of the string of reference bases. - Genomic positions are non-negative integers less than reference length. - Variants spanning the join of circular genomes are represented as - two variants one on each side of the join (position 0). - */ - long start; - - /** - The end position (exclusive), resulting in [start, end) closed-open interval. - This is typically calculated by `start + referenceBases.length`.
- */ - long end; - - - /** - The reference bases for this variant. They start at the given start position. - */ - string referenceBases; - - /** - The bases that appear instead of the reference bases. Multiple alternate - alleles are possible. - */ - array alternateBases = []; - - /** - A map of additional variant information. - */ - map> info = {}; - - /** - The variant calls for this particular variant. Each one represents the - determination of genotype with respect to this variant. `Call`s in this array - are implicitly associated with this `Variant`. - */ - array calls = []; +// A `Call` represents the determination of genotype with respect to a +// particular `Variant`. +// +// It may include associated information such as quality +// and phasing. For example, a call might assign a probability of 0.32 to +// the occurrence of a SNP named rs1234 in a call set with the name NA12345. +message Call { + // The name of the call set this variant call belongs to. + // If this field is not present, the ordering of the call sets from a + // `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match + // the ordering of the calls on this `Variant`. + // The number of results will also be the same. + string call_set_name = 1; + + // The ID of the call set this variant call belongs to. + // + // If this field is not present, the ordering of the call sets from a + // `SearchCallSetsRequest` over this `VariantSet` is guaranteed to match + // the ordering of the calls on this `Variant`. + // The number of results will also be the same. + string call_set_id = 2; + + // The genotype of this variant call. + // + // A 0 value represents the reference allele of the associated `Variant`. Any + // other value is a 1-based index into the alternate alleles of the associated + // `Variant`. 
+ // + // If a variant had a referenceBases field of "T", an alternateBases + // value of ["A", "C"], and the genotype was [2, 1], that would mean the call + // represented the heterozygous value "CA" for this variant. If the genotype + // was instead [0, 1] the represented value would be "TA". Ordering of the + // genotype values is important if the phaseset field is present. + repeated int32 genotype = 3; + + // If this field is populated, this variant call's genotype ordering implies + // the phase of the bases and is consistent with any other variant calls on + // the same contig which have the same phaseset string. + string phaseset = 4; + + // The genotype likelihoods for this variant call. Each array entry + // represents how likely a specific genotype is for this call as + // log10(P(data | genotype)), analogous to the GL tag in the VCF spec. The + // value ordering is defined by the GL tag in the VCF spec. + repeated double genotype_likelihood = 5; + + // A map of additional variant call information. + map<string, google.protobuf.ListValue> info = 6; } +// A `Variant` represents a change in DNA sequence relative to some reference. +// For example, a variant could represent a SNP or an insertion. +// Variants belong to a `VariantSet`. +// This is equivalent to a row in VCF. +message Variant { + // The variant ID. + string id = 1; + + // The ID of the `VariantSet` this variant belongs to. This transitively + // defines + // the `ReferenceSet` against which the `Variant` is to be interpreted. + string variant_set_id = 2; + + // Names for the variant, for example a RefSNP ID. + repeated string names = 3; + + // The date this variant was created in milliseconds from the epoch. + int64 created = 4; + + // The time at which this variant was last updated in + // milliseconds from the epoch. + int64 updated = 5; + + // The reference on which this variant occurs. + // (e.g. `chr20` or `X`) + string reference_name = 6; + + // The start position at which this variant occurs (0-based).
+ // This corresponds to the first base of the string of reference bases. + // Genomic positions are non-negative integers less than reference length. + // Variants spanning the join of circular genomes are represented as + // two variants one on each side of the join (position 0). + int64 start = 7; + + // The end position (exclusive), resulting in [start, end) closed-open + // interval. + // This is typically calculated by `start + referenceBases.length`. + int64 end = 8; + + // The reference bases for this variant. They start at the given start + // position. + string reference_bases = 9; + + // The bases that appear instead of the reference bases. Multiple alternate + // alleles are possible. + repeated string alternate_bases = 10; + + // A map of additional variant information. + map<string, google.protobuf.ListValue> info = 11; + + // The variant calls for this particular variant. Each one represents the + // determination of genotype with respect to this variant. `Call`s in this + // array are implicitly associated with this `Variant`. + repeated Call calls = 12; } diff --git a/tests/compile_schemas.py b/tests/compile_schemas.py deleted file mode 100644 index f4caf083..00000000 --- a/tests/compile_schemas.py +++ /dev/null @@ -1,126 +0,0 @@ -""" -Compiles avro schemas into python representations of those schemas -""" -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import glob -import shutil -import os.path -import tempfile -import re - -import avro.schema - -import utils - - -class SchemaClass(object): - """ - Representation of an avro class - """ - def __init__(self, sourceFile): - self.sourceFile = sourceFile - with open(sourceFile) as sf: - self.schemaSource = sf.read() - self.schema = avro.schema.parse(self.schemaSource) - self.name = self.schema.name - - def getFields(self): - """ - Returns the list of avro fields sorted in order of name.
- """ - return sorted(self.schema.fields, key=lambda f: f.name) - - def isSearchRequest(self): - """ - Returns True if the class we are converting is a subclass of - SearchRequest, and False otherwise. - """ - return re.search('Search.+Request', self.name) is not None - - def isSearchResponse(self): - """ - Returns True if the class we are converting is a subclass of - SearchResponse, and False otherwise. - """ - return re.search('Search.+Response', self.name) is not None - - -class SchemaProcessor(object): - """ - Compiles avro schemas into python classes - """ - def __init__(self, args): - self.version = args.version - self.tmpDir = tempfile.mkdtemp(prefix="ga4gh_") - self.avroJarPath = args.avro_tools_jar - # Note! The tarball does not contain the leading v - string = "schemas-{0}".format(self.version[1:]) - self.schemaDir = os.path.join(self.tmpDir, string) - self.avroJar = os.path.join(self.schemaDir, "avro-tools.jar") - self.avroPath = "src/main/resources/avro" - self.avdlDirectory = os.path.join(self.schemaDir, self.avroPath) - - def run(self): - self._getSchemaFromLocal() - self._compileSchemas() - self._initClasses() - self._initPostSignatures() - - def cleanup(self): - shutil.rmtree(self.tmpDir) - - def getClasses(self): - return self.classes - - def getPostSignatures(self): - return self.postSignatures - - def _compileSchemas(self): - url = "http://www.carfab.com/apachesoftware/avro/stable/java/"\ - "avro-tools-1.7.7.jar" - fileDownloader = utils.FileDownloader(url, self.avroJar) - fileDownloader.download() - cwd = os.getcwd() - os.chdir(self.avdlDirectory) - for avdlFile in glob.glob("*.avdl"): - self._convertAvro(avdlFile) - os.chdir(cwd) - - def _convertAvro(self, avdlFile): - args = ["java", "-jar", self.avroJar, "idl2schemata", avdlFile] - stdoutLines, stderrLines = utils.runCommandSplitsOutput(args) - printableArgs = "'{}'".format(" ".join(args)) - utils.ensureNoWarnings( - stdoutLines, "stdout of {}".format(printableArgs)) - 
utils.ensureNoWarnings( - stderrLines, "stderr of {}".format(printableArgs)) - - def _getSchemaFromLocal(self): - if not os.path.exists(self.avdlDirectory): - os.makedirs(self.avdlDirectory) - avdlFiles = glob.iglob(os.path.join(self.avroPath, "*.avdl")) - for avdlFile in avdlFiles: - if os.path.isfile(avdlFile): - shutil.copy2(avdlFile, self.avdlDirectory) - - def _initClasses(self): - self.classes = [] - for avscFile in glob.glob(os.path.join(self.avdlDirectory, "*.avsc")): - self.classes.append(SchemaClass(avscFile)) - self.requestClassNames = [ - cls.name for cls in self.classes if cls.isSearchRequest()] - self.responseClassNames = [ - cls.name for cls in self.classes if cls.isSearchResponse()] - - def _initPostSignatures(self): - self.postSignatures = [] - for request, response in zip( - self.requestClassNames, self.responseClassNames): - objname = re.search('Search(.+)Request', request).groups()[0] - url = '/{0}/search'.format(objname.lower()) - tup = (url, request, response) - self.postSignatures.append(tup) - self.postSignatures.sort() diff --git a/tests/test_protocol.py b/tests/test_protocol.py deleted file mode 100644 index 36d48b19..00000000 --- a/tests/test_protocol.py +++ /dev/null @@ -1,62 +0,0 @@ -""" -Runs tests that ensure protocol invariants - -TODO add other tests including: -- some CI on postSignatures -""" -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import unittest - -import avro.schema - -import compile_schemas - - -class TestValidateSchemas(unittest.TestCase): - """ - Ensure the schemas conform to certain rules - """ - @classmethod - def setupClass(cls): - args = cls._makeArgs() - cls.schemaProcessor = compile_schemas.SchemaProcessor(args) - cls.schemaProcessor.run() - - @classmethod - def tearDownClass(cls): - cls.schemaProcessor.cleanup() - - @classmethod - def getClasses(cls): - return cls.schemaProcessor.getClasses() - - @classmethod - def _makeArgs(self): - class 
FakeArgs(object): - pass - args = FakeArgs() - args.version = "test" - args.avro_tools_jar = None - return args - - def testSchemaProperties(self): - for schemaClass in self.getClasses(): - self._checkProperties(schemaClass) - - def _checkProperties(self, schemaClass): - """ - Checks that the class schema satisfies certain properties: - - every union must have null as the first type - """ - if isinstance(schemaClass.schema, avro.schema.RecordSchema): - for field in schemaClass.getFields(): - if isinstance(field.type, avro.schema.UnionSchema): - t0 = field.type.schemas[0] - if not (isinstance(t0, avro.schema.PrimitiveSchema) and - t0.type == "null"): - msg = "Schema union assumptions violated: {}.{}" - raise Exception(msg.format( - schemaClass.name, field.name)) From d765ef9313200206eee38133cb6d7b6dc9d5db9e Mon Sep 17 00:00:00 2001 From: David Steinberg Date: Mon, 23 May 2016 15:27:19 -0700 Subject: [PATCH 3/3] Replace mentions of Avro with pb @calbach's edits @jeromekelleher's edits --- INSTALL.rst | 14 +-- doc/README.rst | 24 ----- doc/source/api/apidesign_intro.rst | 6 +- doc/source/appendix/avro_intro.rst | 72 -------------- doc/source/appendix/json_intro.rst | 140 ++++------------------------ doc/source/appendix/proto_intro.rst | 54 +++++++++++ doc/source/intro.rst | 34 +++---- 7 files changed, 99 insertions(+), 245 deletions(-) delete mode 100644 doc/source/appendix/avro_intro.rst create mode 100644 doc/source/appendix/proto_intro.rst diff --git a/INSTALL.rst b/INSTALL.rst index 21a4e5c6..60ed37f0 100644 --- a/INSTALL.rst +++ b/INSTALL.rst @@ -4,7 +4,7 @@ Installing the GA4GH Schemas The schemas are documents (text files) that formally describe the messages that pass between GA4GH reference servers and clients, which we also refer to collectively as "the API." The schemas are written in a -language called `Avro `__. +language called `Protocol Buffers `__. 
We use the schemas in a couple of different ways: @@ -14,19 +14,21 @@ We use the schemas in a couple of different ways: Generating Source Code @@@@@@@@@@@@@@@@@@@@@@ -(To be written.) +:: + +$ cd src/main/proto && protoc --python_out=. ga4gh/* Installing the Documentation Tools and Generating Documentation @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -We use a tool called Sphinx to generate the documentation from Avro -input files. +We use a tool called Sphinx to generate the documentation from Protocol +Buffers input files. Install prerequisites ##################### -To use the Sphinx/Avro documentation generator, you must install some -software packages it requires. +To use the Sphinx/Protocol Buffers documentation generator, you must +install some software packages it requires. Maven $$$$$ diff --git a/doc/README.rst b/doc/README.rst index b6f6d9fd..6eb7664e 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -23,28 +23,6 @@ prerequisites and, for the moment, you'll have to ferret those out yourself.) -Building Process -@@@@@@@@@@@@@@@@ - -The current doc flow is roughly as follows:: - - avdl ----1----> avpr ----2----> rst -| - | ----3----> html - rst -| - - |- doc/source/schema/Makefile -| |-- sphinx --| - |------ top-level Makefile ('make docs') --------| - -* 1 = avro-tools, downloaded on demand; requires java -* 2 = avpr2rest.py, a custom script in tools/sphinx/ -* 3 = sphinx-build, part of the sphinx package - -.. warning:: Because we cannot currently run step 1 at Read the Docs, - it is imperative that developers type `make docs-schema` - at the top level if avdl files are updated, and then - commit the changed rst files. - - Documentation tips @@@@@@@@@@@@@@@@@@ @@ -54,5 +32,3 @@ Documents are written in `ReStructured Text schemas. - Abbreviations are stored in ``epilog.rst``. -- Reference avro elements with ``:avro:key``. 
- diff --git a/doc/source/api/apidesign_intro.rst b/doc/source/api/apidesign_intro.rst index 6db792ed..67aa8267 100644 --- a/doc/source/api/apidesign_intro.rst +++ b/doc/source/api/apidesign_intro.rst @@ -103,9 +103,9 @@ Unresolved Issues * What is the definition of the wire protocol? HTTP 1.0? Is HTTP 1.1 chunked encoding allowed? What is the specification for the - generate JSON for a given an Avro schema? + generated JSON for a given Protocol Buffers schema? -* What is the role of Avro? Is it for documentation-only or for use - as an IDL? +* What is the role of Protocol Buffers? Is it for documentation-only + or for use as an IDL? * Need overall object relationship diagram. diff --git a/doc/source/appendix/avro_intro.rst b/doc/source/appendix/avro_intro.rst deleted file mode 100644 index e0cade4d..00000000 --- a/doc/source/appendix/avro_intro.rst +++ /dev/null @@ -1,72 +0,0 @@ -.. _avro: - -******************* -Apache Avro -******************* - -Apache Avro is a data serialization ecosystem, comparable to Google's Protocol Buffers. - -------------------- -What does the GA4GH web API take from Avro? -------------------- - -The GA4GH web API uses the Avro IDL (aka AVDL) language and JSON serialization labraries. - -The GA4GH web API presents a simple HTTP(S) and JSON interface to clients. It does **not** use Avro's binary serialization format, or Avro's built-in client/server networking and RPC features. - ------------------- -How does the GA4GH web API use Avro schemas? ------------------- - -GA4GH web API objects, including both the data objects actually exchanged and the control messages requesting and returning those objects, are defined in the Avro IDL language, AVDL. - -The `full documentation for the AVDL language is abvailable here `_. Bear in mind that the Avro IDL comes with an entire ecosystem; the GA4GH web APIs do not use most of it. - ------------------- -How does the GA4GH Web API use AVDL?
------------------- - -The GA4GH web API schemas are broken up into multiple AVDL files, which reference each other. Each file defines a number of types (mostly Avro Records, with a smattering of Avro Enums), grouped into a "protocol" (which is somewhat of a misnomer) of types defining a facet of the API. Mostly, the files come in pairs: a normal AVDL file defining the types representing actual data, and a "methods" AVDL file defining the control messages to be sent back and forth to query and exchange the representational types, and the URLs associated with various operations. - -Each type has a leading comment documenting its purpose, and each field in the type has a description. These are included in the automatically generated API documentation. - -Here is an example of an AVDL definition from, in this case defining a genomic `Position` type which is used across the API:: - - /** - A `Position` is an unoriented base in some `Reference`. A `Position` is - represented by a `Reference` name, and a base number on that `Reference` - (0-based). - */ - record Position { - /** - The name of the `Reference` on which the `Position` is located. - */ - string referenceName; - - /** - The 0-based offset from the start of the forward strand for that `Reference`. - Genomic positions are non-negative integers less than `Reference` length. - */ - long position; - - /** - Strand the position is associated with. - */ - Strand strand; - } - -This is a "record", which contains three fields. All of the fields are required to be filled in, and all of the fields can only hold objects of a particular single type. (In cases where this is not desired, see the AVDL documentation on unions). The last field holds a `Strand` object, which is defined elsewhere in the file. - -~~~~~~~~~~~~~~~~~~ -A note on unions and optional fields -~~~~~~~~~~~~~~~~~~ - -Any field which is optional should be defined as a ``union``, and given a default value of ``null``. 
Note that ``null`` should always be first in the union, since it is the type of the default value. - -The Avro JSON libraries serialize union types strangely, so the GA4GH API schemas have been specifically designed never to include union types that would trigger this behavior. The upshot of this is that the **only** legal union type is ``union``. Unions with multiple non-``null`` types are not allowed. - -.. todo:: - * How much of the AVDL tutorial do we want in here? - * Document/show an example for methods (request and response pairing pattern) - * Talk about how we manually specify that some things land in URLs - diff --git a/doc/source/appendix/json_intro.rst b/doc/source/appendix/json_intro.rst index bf8025f7..12110a6b 100644 --- a/doc/source/appendix/json_intro.rst +++ b/doc/source/appendix/json_intro.rst @@ -6,13 +6,13 @@ The JSON Format JSON, or JavaScript Object Notation, is officially defined `here `_. It is the standard data interchange format for web APIs. -The GA4GH Web API uses a JSON wire protocol, exchanging JSON representations of the objects defined in its AVDL schemas. More information on the AVDL schemas is available in :ref:`avro`; basically, the AVDL type definitions say what attributes any given JSON object ought to have, and what ought to be stored in each of them. +The GA4GH Web API uses a JSON wire protocol, exchanging JSON representations of the objects defined in its Protocol Buffers schemas. More information on the schemas is available in :ref:`proto`; basically, the Protocol Buffers type definitions say what attributes any given JSON object ought to have, and what ought to be stored in each of them. ----------------------- GA4GH JSON Serialization ----------------------- -The GA4GH web APIs use Avro IDL to define their schemas, and use the associated Avro JSON serialization libraries. Since the schemas use a restricted subset of AVDL types (see `A note on unions`_ below), the serialized JSON format is fairly standard. 
This means that standard non-Avro JSON serialization and deserialization libraries (like, for example, the Python ``json`` module) can be used to serialize and deserialize GA4GH JSON messages in an idiomatic way. +The GA4GH web APIs use Protocol Buffers IDL to define their schemas, and use the associated Google Protocol Buffers JSON serialization libraries. Notice that the Protocol Buffers IDL uses snake case, while the on-the-wire protocol is in camel case. --------------------- Serialization example @@ -20,19 +20,19 @@ Serialization example For example, here is the schema definition for Variants (with comments removed):: - record Variant { - string id; - string variantSetId; - array names = []; - union { null, long } created = null; - union { null, long } updated = null; - string referenceName; - long start; - long end; - string referenceBases; - array alternateBases = []; - map> info = {}; - array calls = []; + message Variant { + string id = 1; + string variant_set_id = 2; + repeated string names = 3; + int64 created = 4; + int64 updated = 5; + string reference_name = 6; + int64 start = 7; + int64 end = 8; + string reference_bases = 9; + repeated string alternate_bases = 10; + map info = 11; + repeated Call calls = 12; } Here is a serialized variant in JSON. It's a bit of an edge case in some respects:: @@ -61,114 +61,8 @@ Here is a serialized variant in JSON. It's a bit of an edge case in some respect Things to notice: * A serialized record contains no explicit information about its type. - * Arrays are serialized as JSON arrays. + * "repeated" types are serialized as JSON arrays. * Maps are serialized as JSON objects. - * Records are also serialized as JSON objects. + * Messages are also serialized as JSON objects. * Enums (not shown here) are serialized as JSON strings. - * Nulls are serialized as JSON nulls. - * Fields with default values may be omitted (see the lack of an ``updated`` or ``calls``) as a way of serializing their default values. 
- * Unions of ``null`` and a non-``null`` type are serialized as either ``null`` or the serialized non-null value. No other kinds of unions are present or permitted. - ------------------------ -A note on unions ------------------------ - -As noted above, a field with union type serialized in GA4GH JSON looks no different from a field of any other type: you just put the field name and its recursively serialized value. In order for the Avro JSON libraries to support this, it is necessary that AVDL ``union`` types union together only ``null`` and a single non-``null`` type. If there were two or more non-``null`` types, the Avro libraries would need to include additional type information to say which to use when deserializing. Since we prohibit those unions, however, API clients and alternative server implementations never need to worry about this additional type information or its syntax. They can just handle "normal" JSON. - -.. todo:: - * add example of Python decoder output - * create a python class, if necessary - ------------------------ -Wire protocol example ------------------------ - -This is from the `ga4gh server example`_. - -.. _ga4gh server example: http://ga4gh-reference-implementation.readthedocs.org/en/stable/demo.html#demo - -To get information from the readgroupsets on a server, create a JSON format request:: - - { - "datasetIds":[], - "name":null - } - -.. note:: - What is this actually asking? - -To send this to the server, we need to create a HTTP request which tells the server what type of -data to expect (JSON format, in this case) -In our test case, we have a server running at \http://localhost:8000 - -Since we want to query the readgroupsets, we'll have to make that part of the URL - -.. note:: - * How do we know it's v0.5.1? - * where is the readgroupsets/search part documented or defined? 
- -To create a command line request, we can use `cURL `_:: - - curl --data '{"datasetIds":[], "name":null}' --header 'Content-Type: application/json' http://localhost:8000/v0.5.1/readgroupsets/search - -The server returns:: - - { - "nextPageToken": null, - "readGroupSets": [{ - "readGroups": [{ - "info": {}, - "updated": 1432287597662, - "predictedInsertSize": null, - "description": null, - "created": 1432287597662, - "programs": [], - "sampleId": null, - "experiment": null, - "referenceSetId": null, - "id": - "low-coverage:HG00533.mapped.ILLUMINA.bwa.CHS.low_coverage.20120522", - "datasetId": null, - "name": - "low-coverage:HG00533.mapped.ILLUMINA.bwa.CHS.low_coverage.20120522" - }, - { "info": {}, - "updated": 1432287793946, - "predictedInsertSize": null, - "description": null, - "created": 1432287793946, - "programs": [], - "sampleId": null, - "experiment": null, - "referenceSetId": null, - "id": - "low-coverage:HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522", - "datasetId": null, - "name": - "low-coverage:HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522" - }, - { "info": {}, - "updated": 1432287793946, - "predictedInsertSize": null, - "description": null, - "created": 1432287793946, - "programs": [], - "sampleId": null, - "experiment": null, - "referenceSetId": null, - "id": - "low-coverage:HG00534.mapped.ILLUMINA.bwa.CHS.low_coverage.20120522", - "datasetId": null, - "name": - "low-coverage:HG00534.mapped.ILLUMINA.bwa.CHS.low_coverage.20120522" - }], - "id": - "low-coverage", - "datasetId": null, - "name": null - } - ] - } - - diff --git a/doc/source/appendix/proto_intro.rst b/doc/source/appendix/proto_intro.rst new file mode 100644 index 00000000..7865b2a2 --- /dev/null +++ b/doc/source/appendix/proto_intro.rst @@ -0,0 +1,54 @@ +.. _proto: + +*********************** +Google Protocol Buffers +*********************** + +Google Protocol Buffers is a data serialization ecosystem, comparable to Apache Avro.
 + +------------------------------------------------------- +What does the GA4GH web API take from Protocol Buffers? +------------------------------------------------------- + +The GA4GH web API uses the Google Protocol Buffers language and JSON serialization libraries. + +The GA4GH web API presents a simple HTTP(S) and JSON interface to clients. It does **not** use Protocol Buffers' binary serialization format. + +------------------------------------------------------- +How does the GA4GH web API use Protocol Buffer schemas? +------------------------------------------------------- + +GA4GH web API objects, including both the data objects actually exchanged and the control messages requesting and returning those objects, are defined in Protocol Buffers. + +The full documentation for the Protocol Buffers language can be found `here `_. + +------------------------------------------------ +How does the GA4GH Web API use Protocol Buffers? +------------------------------------------------ + +The GA4GH web API schemas are broken up into multiple proto files, which reference each other. Each file defines a number of message types, grouped into a "protocol" that defines a facet of the API. Mostly, the files come in pairs: a normal proto file defining the types representing actual data, and a "methods" proto file defining the control messages to be sent back and forth to query and exchange the representational types, and the URLs associated with various operations. + +Each type has a leading comment documenting its purpose, and each field in the type has a description. These are included in the automatically generated API documentation. + +Here is an example of a proto definition, in this case defining a genomic `Position` type which is used across the API:: + + message Position { + // The name of the `Reference` on which the `Position` is located. + string reference_name = 1; + + // The 0-based offset from the start of the forward strand for that + // `Reference`. 
Genomic positions are non-negative integers less than + // `Reference` length. + int64 position = 2; + + // Strand the position is associated with. + Strand strand = 3; + } + +This is a "message", which contains three fields. All of the fields are required to be filled in, and all of the fields can only hold objects of a particular single type. The last field holds a `Strand` object, which is defined elsewhere in the file. + +.. todo:: + * How much of the Protocol Buffers tutorial do we want in here? + * Document/show an example for methods (request and response pairing pattern) + * Talk about how we manually specify that some things land in URLs + diff --git a/doc/source/intro.rst b/doc/source/intro.rst index d090caa8..3e2f61b6 100644 --- a/doc/source/intro.rst +++ b/doc/source/intro.rst @@ -53,29 +53,29 @@ define the types of things that API clients and servers exchange: requests for data, server responses, error messages, and objects actually representing pieces of genomics data. -The schemas are written in Avro Interface Description Language -(extension .avdl). For more details on Avro and how it is used in the -GA4GH APIs, see :ref:`avro`. +The schemas are written in Protocol Buffers Interface Description +Language (extension .proto). For more details on Protocol Buffers +and how it is used in the GA4GH APIs, see :ref:`proto`. 
Here is an example schema definition for a Variant (with comments removed):: - record Variant { - string id; - string variantSetId; - array names = []; - union { null, long } created = null; - union { null, long } updated = null; - string referenceName; - long start; - long end; - string referenceBases; - array alternateBases = []; - map> info = {}; - array calls = []; + message Variant { + string id = 1; + string variant_set_id = 2; + repeated string names = 3; + int64 created = 4; + int64 updated = 5; + string reference_name = 6; + int64 start = 7; + int64 end = 8; + string reference_bases = 9; + repeated string alternate_bases = 10; + map info = 11; + repeated Call calls = 12; } On the wire, the GA4GH web API takes the form of a client and a server exchanging JSON-serialized objects over HTTP or HTTPS. For more details on JSON, including how the GA4GH web API serializes and -deserializes Avro-specified objects in JSON, see :ref:`json`. +deserializes Protocol Buffers objects in JSON, see :ref:`json`.