
Commit

Merge remote-tracking branch 'origin/ah_var_store' into gg_VS-1335_BgzipOnExtractAhVarStoreEdition
gbggrant committed May 8, 2024
2 parents 4e31021 + 614be8b commit 729d4ae
Showing 17 changed files with 226 additions and 33 deletions.
22 changes: 13 additions & 9 deletions .dockstore.yml
@@ -164,7 +164,7 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1335_BgzipOnExtractAhVarStoreEdition
- vs_1351_retry_with_more_memory_vcf_extract
tags:
- /.*/
- name: GvsImportGenomes
@@ -183,7 +183,6 @@ workflows:
branches:
- master
- ah_var_store
- rsa_vs_1328
tags:
- /.*/
- name: GvsPrepareRangesCallset
@@ -261,7 +260,6 @@ workflows:
# `master` is here so there will be *something* under the branches filter, but GVS WDLs are not on `master`
# so this shouldn't show up in the list of available versions in Dockstore or Terra.
- master
- vs_1099_composite_object_inputs
tags:
- /.*/
- name: GvsJointVariantCallsetCost
@@ -280,6 +278,7 @@ workflows:
branches:
- master
- ah_var_store
- rsa_vs_1168_bge
tags:
- /.*/
- name: GvsQuickstartVcfIntegration
@@ -289,7 +288,6 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1335_BgzipOnExtractAhVarStoreEdition
tags:
- /.*/
- name: GvsQuickstartHailIntegration
@@ -299,7 +297,6 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-1335_BgzipOnExtractAhVarStoreEdition
tags:
- /.*/
- name: GvsQuickstartIntegration
@@ -309,8 +306,7 @@ workflows:
branches:
- master
- ah_var_store
- vs_1342_maybe_stick_with_gar
- gg_VS-1335_BgzipOnExtractAhVarStoreEdition
- vs_1334_vcf_max_alt_alleles
tags:
- /.*/
- name: GvsIngestTieout
@@ -358,6 +354,16 @@ workflows:
- ah_var_store
tags:
- /.*/
- name: GvsTieoutVcfMaxAltAlleles
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsTieoutVcfMaxAltAlleles.wdl
filters:
branches:
- ah_var_store
- master
- vs_1334_vcf_max_alt_alleles
tags:
- /.*/
- name: HailFromWdl
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/HailFromWdl.wdl
@@ -430,7 +436,6 @@ workflows:
- master
- ah_var_store
- EchoCallset
- vs_1315_plink_docker
- name: MergePgenHierarchicalWdl
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/MergePgenHierarchical.wdl
@@ -446,4 +451,3 @@ workflows:
branches:
- ah_var_store
- EchoCallset
- vs_1315_plink_docker
4 changes: 2 additions & 2 deletions build_docker.sh
@@ -124,9 +124,9 @@ fi

echo "Building image to tag ${REPO_PRJ}:${GITHUB_TAG}..."
if [ -n "${IS_PUSH}" ]; then
docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} --squash .
docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} --squash . --iidfile /tmp/idfile.txt
else
docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} .
docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} . --iidfile /tmp/idfile.txt
fi
# Since we build the docker image with stages, the first build stage for GATK will be leftover in
# the local docker context after executing the above commands. This step reclaims that space automatically
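
The `--iidfile` flag added to both build invocations above makes `docker build` write the full ID of the built image to the given path, so later steps can pick up the image without parsing `docker images` output. A minimal sketch of what ends up in that file (the digest below is made up):

```bash
# After a build invoked with --iidfile /tmp/idfile.txt (example digest, not a real image):
cat /tmp/idfile.txt
# sha256:0a1b2c3d4e5f60718293a4b5c6d7e8f90a1b2c3d4e5f60718293a4b5c6d7e8f9
```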
@@ -47,16 +47,29 @@ cd gatk
# Log in to Google Cloud
gcloud init
# Configure the credential helper for GCR
gcloud auth configure-docker
FULL_IMAGE_ID=$(cat /tmp/idfile.txt)
# Take the slice of this full Docker image ID that corresponds with the output of `docker images`:
IMAGE_ID=${FULL_IMAGE_ID:7:12}
# The GATK Docker image is based on gatkbase.
IMAGE_TYPE="gatkbase"
TAG=$(python3 ./scripts/variantstore/wdl/extract/build_docker_tag.py --image-id "${IMAGE_ID}" --image-type "${IMAGE_TYPE}")
BASE_REPO="broad-dsde-methods/gvs"
REPO_WITH_TAG="${BASE_REPO}/gatk:${TAG}"
docker tag "${IMAGE_ID}" "${REPO_WITH_TAG}"
# Configure the credential helper for GAR
gcloud auth configure-docker us-central1-docker.pkg.dev
# Tag and push
today="$(date -Idate | sed 's/-/_/g')"
git_hash="$(git rev-parse --short HEAD)"
DOCKER_TAG="us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_${today}_${git_hash}"
DOCKER_IMAGE_ID=$(docker images | head -n 2 | tail -n 1 | awk '{print $3}')
docker tag $DOCKER_IMAGE_ID $DOCKER_TAG
docker push $DOCKER_TAG
GAR_TAG="us-central1-docker.pkg.dev/${REPO_WITH_TAG}"
docker tag "${REPO_WITH_TAG}" "${GAR_TAG}"
docker push "${GAR_TAG}"
echo "Docker image pushed to \"${GAR_TAG}\""
```

Don't forget to shut down (and possibly delete) your VM once you're done!
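
For reference, a sketch of how the `${FULL_IMAGE_ID:7:12}` slice in the script above relates to the short IDs printed by `docker images` (the digest is a made-up example):

```bash
FULL_IMAGE_ID="sha256:0a1b2c3d4e5f60718293a4b5c6d7e8f90a1b2c3d4e5f60718293a4b5c6d7e8f9"  # contents of /tmp/idfile.txt
IMAGE_ID=${FULL_IMAGE_ID:7:12}   # drop the 7-character "sha256:" prefix, keep the next 12 hex characters
echo "${IMAGE_ID}"               # -> 0a1b2c3d4e5f, the value shown in the IMAGE ID column of `docker images`
```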
8 changes: 5 additions & 3 deletions scripts/variantstore/docs/aou/AOU_DELIVERABLES.md
@@ -83,17 +83,19 @@
- This workflow needs to be run with the `extract_table_prefix` input from `GvsPrepareRangesCallset` step.
- This workflow needs to be run with the `filter_set_name` input from `GvsCreateFilterSet` step.
- This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
1. `GvsExtractCallset` / `GvsExtractCallsetPgenMerged` workflow
1. `GvsExtractCallset` / `GvsExtractCallsetPgenMerged` workflows ("small callset" Exome, Clinvar, and ACAF threshold extracts in VCF and PGEN formats respectively)
- You will need to run the `GvsPrepareRangesCallset` workflow for each "[Region](https://support.researchallofus.org/hc/en-us/articles/14929793660948-Smaller-Callsets-for-Analyzing-Short-Read-WGS-SNP-Indel-Data-with-Hail-MT-VCF-and-PLINK)" (interval list) for which a PGEN or VCF deliverable is required for the callset.
- This workflow transforms the data in the vet, ref_ranges, and samples tables into a schema optimized for extract.
- The `enable_extract_table_ttl` input should be set to `true` (the default value is `false`), which will add a TTL of two weeks to the tables it creates.
- `extract_table_prefix` should be set to a name that is unique to the given Region / interval list. See the [naming conventions doc](https://docs.google.com/document/d/1pNtuv7uDoiOFPbwe4zx5sAGH7MyxwKqXkyrpNmBxeow) for guidance on what to use.
- Specify the `interval_list` appropriate for the PGEN / VCF extraction run you are performing.
- `GvsExtractCallsetPgen` currently defaults to 100 alt alleles maximum, which means that any sites having more than that number of alt alleles will be dropped.
- This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
- Specify the same `call_set_identifier`, `dataset_name`, `project_id`, `extract_table_prefix`, and `interval_list` that were used in the `GvsPrepareRangesCallset` run documented above.
- Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart.
- For `GvsExtractCallset` only, if you want to output VCFs that are compressed using bgzip, set the `bgzip_output_vcfs` input to `true` to generate VCFs that are compressed using bgzip.
- For `GvsExtractCallsetPgenMerged` only, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5
- For both `GvsExtractCallset` and `GvsExtractCallsetPgenMerged`, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5
- For `GvsExtractCallset`, make sure to specify the appropriate `maximum_alternate_alleles` value (currently 100).
- For `GvsExtractCallset`, if you want to output VCFs that are compressed using bgzip, set the `bgzip_output_vcfs` input to `true` to generate VCFs that are compressed using bgzip.
- These workflows do not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option.
1. `GvsCalculatePrecisionAndSensitivity` workflow
- Please see the detailed instructions for running the Precision and Sensitivity workflow [here](../../tieout/AoU_PRECISION_SENSITIVITY.md).
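
As a rough illustration of the extract inputs called out in the `GvsExtractCallset` item above, the sketch below writes a file-paths inputs JSON. Every value is a placeholder; only the input names come from the checklist and WDL changes in this commit, and the exact set of required inputs should be taken from the workflow itself:

```bash
# Hypothetical values throughout -- only the input names come from the checklist above.
cat > GvsExtractCallset.inputs.json <<'EOF'
{
  "GvsExtractCallset.call_set_identifier": "my_callset_2024",
  "GvsExtractCallset.dataset_name": "my_bq_dataset",
  "GvsExtractCallset.project_id": "my-gcp-project",
  "GvsExtractCallset.extract_table_prefix": "exome_region",
  "GvsExtractCallset.filter_set_name": "my_filter_set",
  "GvsExtractCallset.interval_list": "gs://my-bucket/exome.interval_list",
  "GvsExtractCallset.interval_weights_bed": "gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed",
  "GvsExtractCallset.maximum_alternate_alleles": 100,
  "GvsExtractCallset.bgzip_output_vcfs": true
}
EOF
```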
23 changes: 19 additions & 4 deletions scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
@@ -10,6 +10,8 @@ workflow GvsCalculatePrecisionAndSensitivity {
String dataset_name
String filter_set_name
File interval_list
File? vcf_eval_bed_file
Array[String] chromosomes = ["chr20"]
String project_id
Array[String] sample_names

@@ -32,6 +34,7 @@

parameter_meta {
call_set_identifier: "The name of the callset for which we are calculating precision and sensitivity."
chromosomes: "The chromosome(s) on which to analyze precision and sensitivity. The default value for this is `['chr20']`."
dataset_name: "The GVS BigQuery dataset name."
filter_set_name: "The filter_set_name used to generate the callset."
interval_list: "The intervals over which to calculate precision and sensitivity."
@@ -41,6 +44,7 @@
truth_vcf_indices: "A list of the VCF indices for the truth data VCFs supplied above."
truth_beds: "A list of the bed files for the truth data used for analyzing the samples in `sample_names`."
ref_fasta: "The cloud path for the reference fasta sequence."
vcf_eval_bed_file: "Optional bed file for EvaluateVcf; if passed, will be used instead of chromosomes."
}

String output_basename = call_set_identifier + "_PS"
@@ -132,7 +136,8 @@ workflow GvsCalculatePrecisionAndSensitivity {
truth_vcf = truth_vcfs[i],
truth_vcf_index = truth_vcf_indices[i],
truth_bed = truth_beds[i],
interval_list = interval_list,
vcf_eval_bed_file = vcf_eval_bed_file,
chromosomes = chromosomes,
output_basename = sample_name + "-bq_roc_filtered",
is_vqsr_lite = IsVQSRLite.is_vqsr_lite,
ref_fasta = ref_fasta,
@@ -146,7 +151,8 @@
truth_vcf = truth_vcfs[i],
truth_vcf_index = truth_vcf_indices[i],
truth_bed = truth_beds[i],
interval_list = interval_list,
vcf_eval_bed_file = vcf_eval_bed_file,
chromosomes = chromosomes,
all_records = true,
output_basename = sample_name + "-bq_all",
is_vqsr_lite = IsVQSRLite.is_vqsr_lite,
@@ -377,9 +383,10 @@ task EvaluateVcf {
File truth_vcf
File truth_vcf_index
File truth_bed
File? vcf_eval_bed_file
Array[String] chromosomes

Boolean all_records = false
File interval_list

File ref_fasta

@@ -396,14 +403,22 @@
String max_score_field_tag = if (is_vqsr_lite == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD'

command <<<
chromosomes=( ~{sep=' ' chromosomes} )

echo "Creating .bed file to control which chromosomes should be evaluated."
for i in "${chromosomes[@]}"
do
echo "$i 0 300000000" >> chromosomes.to.eval.txt
done

# Prepend date, time and pwd to xtrace log entries.
PS4='\D{+%F %T} \w $ '
set -o errexit -o nounset -o pipefail -o xtrace

rtg format --output human_REF_SDF ~{ref_fasta}

rtg vcfeval \
--bed-regions ~{interval_list} \
--bed-regions ~{if defined(vcf_eval_bed_file) then vcf_eval_bed_file else "chromosomes.to.eval.txt"} \
~{if all_records then "--all-records" else ""} \
--roc-subset snp,indel \
--vcf-score-field=INFO.~{max_score_field_tag} \
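
To make the new `chromosomes` plumbing concrete: with the default `chromosomes = ["chr20"]` and no `vcf_eval_bed_file`, the loop at the top of this command block writes a one-line region file that `rtg vcfeval --bed-regions` uses to restrict evaluation (300,000,000 simply exceeds the length of any human chromosome, so the whole contig is covered). A sketch of the generated file:

```bash
cat chromosomes.to.eval.txt
# chr20 0 300000000
```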
19 changes: 17 additions & 2 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -47,6 +47,8 @@ workflow GvsExtractCallset {
Float y_bed_weight_scaling = 4
Boolean is_wgs = true
Boolean convert_filtered_genotypes_to_nocalls = false
Boolean write_cost_to_db = true
Int? maximum_alternate_alleles
}

File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
@@ -72,7 +74,6 @@

Boolean emit_pls = false
Boolean emit_ads = true
Boolean write_cost_to_db = true

String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.interval_list' else '-scattered.interval_list'
String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz'
@@ -240,6 +241,7 @@ workflow GvsExtractCallset {
emit_ads = emit_ads,
convert_filtered_genotypes_to_nocalls = convert_filtered_genotypes_to_nocalls,
write_cost_to_db = write_cost_to_db,
maximum_alternate_alleles = maximum_alternate_alleles,
}
}

@@ -333,6 +335,7 @@ task ExtractTask {
Int memory_gib

Int? local_sort_max_records_in_ram = 10000000
Int? maximum_alternate_alleles

# for call-caching -- check if DB tables haven't been updated since the last run
String max_last_modified_timestamp
@@ -370,7 +373,18 @@
--filter-set-name ~{filter_set_name}'
fi

gatk --java-options "-Xmx~{memory_gib - 3}g" \
# This tool may get invoked with "Retry with more memory" with a different amount of memory than specified in
# the input `memory_gib`, so use the memory-related environment variables rather than the `memory_gib` input.
# https://support.terra.bio/hc/en-us/articles/4403215299355-Out-of-Memory-Retry
if [[ ${MEM_UNIT} == "GB" ]]
then
memory_mb=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} - 3) * 1000))")
else
echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2
exit 1
fi

gatk --java-options "-Xmx${memory_mb}m" \
ExtractCohortToVcf \
--vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} \
~{"--vet-ranges-extract-table-version " + vet_extract_table_version} \
@@ -387,6 +401,7 @@
~{true='--emit-ads' false='' emit_ads} \
~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \
~{true='--convert-filtered-genotypes-to-no-calls' false='' convert_filtered_genotypes_to_nocalls} \
~{'--maximum-alternate-alleles ' + maximum_alternate_alleles} \
${FILTERING_ARGS} \
--dataset-id ~{dataset_name} \
--call-set-identifier ~{call_set_identifier} \
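
The `MEM_SIZE`/`MEM_UNIT` logic added to this task sizes the JVM from the memory actually granted at runtime (which can grow under Terra's "Retry with more memory"), rather than from the `memory_gib` input. A worked example with an assumed 15 GB allocation:

```bash
# Assumed runtime values for illustration only.
MEM_UNIT="GB"
MEM_SIZE=15
memory_mb=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} - 3) * 1000))")
echo "${memory_mb}"   # -> 12000, so gatk is launched with -Xmx12000m
```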
8 changes: 5 additions & 3 deletions scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl
@@ -16,8 +16,10 @@ workflow GvsExtractCallsetPgen {
# reference (see plink chromosome codes here: https://www.cog-genomics.org/plink/2.0/data#irreg_output)
# ExtractCohortToPgen currently only supports codes "chrM" and "MT"
String pgen_chromosome_code = "chrM"
# Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254)
Int max_alt_alleles = 254
# Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254).
# PGEN extract is currently only used for AoU "small callsets" which always want 100 max alt alleles, so default
# to that value.
Int max_alt_alleles = 100
# If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead)
Boolean lenient_ploidy_validation = false
# If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes
@@ -318,7 +320,7 @@ task PgenExtractTask {
# ExtractCohortToPgen currently only supports codes "chrM" and "MT"
String pgen_chromosome_code
# Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254)
Int? max_alt_alleles
Int max_alt_alleles
# If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead)
Boolean? lenient_ploidy_validation
# If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes
3 changes: 3 additions & 0 deletions scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -63,6 +63,8 @@ workflow GvsJointVariantCalling {

File? training_python_script
File? scoring_python_script

Int? maximum_alternate_alleles
}

# If is_wgs is true, we'll use the WGS interval list else, we'll use the Exome interval list. We'll currently use
Expand Down Expand Up @@ -225,6 +227,7 @@ workflow GvsJointVariantCalling {
drop_state = drop_state,
bgzip_output_vcfs = bgzip_output_vcfs,
is_wgs = is_wgs,
maximum_alternate_alleles = maximum_alternate_alleles,
}

output {
3 changes: 3 additions & 0 deletions scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl
@@ -35,6 +35,8 @@ workflow GvsQuickstartHailIntegration {
String? workspace_bucket
String? workspace_id
String? submission_id

Int? maximum_alternate_alleles
}

String project_id = "gvs-internal"
@@ -89,6 +91,7 @@ workflow GvsQuickstartHailIntegration {
workspace_bucket = effective_workspace_bucket,
workspace_id = effective_workspace_id,
submission_id = effective_submission_id,
maximum_alternate_alleles = maximum_alternate_alleles,
}

call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail {