diff --git a/.dockstore.yml b/.dockstore.yml index 2a2d4c498b1..2c40ec408fb 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -164,7 +164,7 @@ workflows: branches: - master - ah_var_store - - gg_VS-1335_BgzipOnExtractAhVarStoreEdition + - vs_1351_retry_with_more_memory_vcf_extract tags: - /.*/ - name: GvsImportGenomes @@ -183,7 +183,6 @@ workflows: branches: - master - ah_var_store - - rsa_vs_1328 tags: - /.*/ - name: GvsPrepareRangesCallset @@ -261,7 +260,6 @@ workflows: # `master` is here so there will be *something* under the branches filter, but GVS WDLs are not on `master` # so this shouldn't show up in the list of available versions in Dockstore or Terra. - master - - vs_1099_composite_object_inputs tags: - /.*/ - name: GvsJointVariantCallsetCost @@ -280,6 +278,7 @@ workflows: branches: - master - ah_var_store + - rsa_vs_1168_bge tags: - /.*/ - name: GvsQuickstartVcfIntegration @@ -289,7 +288,6 @@ workflows: branches: - master - ah_var_store - - gg_VS-1335_BgzipOnExtractAhVarStoreEdition tags: - /.*/ - name: GvsQuickstartHailIntegration @@ -299,7 +297,6 @@ workflows: branches: - master - ah_var_store - - gg_VS-1335_BgzipOnExtractAhVarStoreEdition tags: - /.*/ - name: GvsQuickstartIntegration @@ -309,8 +306,7 @@ workflows: branches: - master - ah_var_store - - vs_1342_maybe_stick_with_gar - - gg_VS-1335_BgzipOnExtractAhVarStoreEdition + - vs_1334_vcf_max_alt_alleles tags: - /.*/ - name: GvsIngestTieout @@ -358,6 +354,16 @@ workflows: - ah_var_store tags: - /.*/ + - name: GvsTieoutVcfMaxAltAlleles + subclass: WDL + primaryDescriptorPath: /scripts/variantstore/wdl/GvsTieoutVcfMaxAltAlleles.wdl + filters: + branches: + - ah_var_store + - master + - vs_1334_vcf_max_alt_alleles + tags: + - /.*/ - name: HailFromWdl subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/HailFromWdl.wdl @@ -430,7 +436,6 @@ workflows: - master - ah_var_store - EchoCallset - - vs_1315_plink_docker - name: MergePgenHierarchicalWdl subclass: WDL primaryDescriptorPath: /scripts/variantstore/wdl/MergePgenHierarchical.wdl @@ -446,4 +451,3 @@ workflows: branches: - ah_var_store - EchoCallset - - vs_1315_plink_docker diff --git a/build_docker.sh b/build_docker.sh index 63b9be5a187..c53c4e51230 100755 --- a/build_docker.sh +++ b/build_docker.sh @@ -124,9 +124,9 @@ fi echo "Building image to tag ${REPO_PRJ}:${GITHUB_TAG}..." if [ -n "${IS_PUSH}" ]; then - docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} --squash . + docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} --squash . --iidfile /tmp/idfile.txt else - docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} . + docker build -t ${REPO_PRJ}:${GITHUB_TAG} --build-arg RELEASE=${RELEASE} . --iidfile /tmp/idfile.txt fi # Since we build the docker image with stages, the first build stage for GATK will be leftover in # the local docker context after executing the above commands. 
This step reclaims that space automatically diff --git a/scripts/variantstore/docs/Build Docker from VM/building_gatk_docker_on_a_vm.md b/scripts/variantstore/docs/Build Docker from VM/building_gatk_docker_on_a_vm.md index 6100f17bb82..7f172134e9f 100644 --- a/scripts/variantstore/docs/Build Docker from VM/building_gatk_docker_on_a_vm.md +++ b/scripts/variantstore/docs/Build Docker from VM/building_gatk_docker_on_a_vm.md @@ -47,16 +47,29 @@ cd gatk # Log in to Google Cloud gcloud init -# Configure the credential helper for GCR -gcloud auth configure-docker +FULL_IMAGE_ID=$(cat /tmp/idfile.txt) + +# Take the slice of this full Docker image ID that corresponds with the output of `docker images`: +IMAGE_ID=${FULL_IMAGE_ID:7:12} + +# The GATK Docker image is based on gatkbase. +IMAGE_TYPE="gatkbase" +TAG=$(python3 ./scripts/variantstore/wdl/extract/build_docker_tag.py --image-id "${IMAGE_ID}" --image-type "${IMAGE_TYPE}") + +BASE_REPO="broad-dsde-methods/gvs" +REPO_WITH_TAG="${BASE_REPO}/gatk:${TAG}" +docker tag "${IMAGE_ID}" "${REPO_WITH_TAG}" + +# Configure the credential helper for GAR +gcloud auth configure-docker us-central1-docker.pkg.dev # Tag and push -today="$(date -Idate | sed 's/-/_/g')" -git_hash="$(git rev-parse --short HEAD)" -DOCKER_TAG="us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_${today}_${git_hash}" -DOCKER_IMAGE_ID=$(docker images | head -n 2 | tail -n 1 | awk '{print $3}') -docker tag $DOCKER_IMAGE_ID $DOCKER_TAG -docker push $DOCKER_TAG +GAR_TAG="us-central1-docker.pkg.dev/${REPO_WITH_TAG}" +docker tag "${REPO_WITH_TAG}" "${GAR_TAG}" + +docker push "${GAR_TAG}" + +echo "Docker image pushed to \"${GAR_TAG}\"" ``` Don't forget to shut down (and possibly delete) your VM once you're done! diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md index 8ae4b8cf1cd..4636408944a 100644 --- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md +++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md @@ -83,17 +83,19 @@ - This workflow needs to be run with the `extract_table_prefix` input from `GvsPrepareRangesCallset` step. - This workflow needs to be run with the `filter_set_name` input from `GvsCreateFilterSet` step. - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option. -1. `GvsExtractCallset` / `GvsExtractCallsetPgenMerged` workflow +1. `GvsExtractCallset` / `GvsExtractCallsetPgenMerged` workflows ("small callset" Exome, Clinvar, and ACAF threshold extracts in VCF and PGEN formats respectively) - You will need to run the `GvsPrepareRangesCallset` workflow for each "[Region](https://support.researchallofus.org/hc/en-us/articles/14929793660948-Smaller-Callsets-for-Analyzing-Short-Read-WGS-SNP-Indel-Data-with-Hail-MT-VCF-and-PLINK)" (interval list) for which a PGEN or VCF deliverable is required for the callset. - This workflow transforms the data in the vet, ref_ranges, and samples tables into a schema optimized for extract. - The `enable_extract_table_ttl` input should be set to `true` (the default value is `false`), which will add a TTL of two weeks to the tables it creates. - `extract_table_prefix` should be set to a name that is unique to the given Region / interval list. See the [naming conventions doc](https://docs.google.com/document/d/1pNtuv7uDoiOFPbwe4zx5sAGH7MyxwKqXkyrpNmBxeow) for guidance on what to use. 
- Specify the `interval_list` appropriate for the PGEN / VCF extraction run you are performing. + - `GvsExtractCallsetPgen` currently defaults to a maximum of 100 alt alleles; any site with more than that number of alt alleles will be dropped. - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option. - Specify the same `call_set_identifier`, `dataset_name`, `project_id`, `extract_table_prefix`, and `interval_list` that were used in the `GvsPrepareRangesCallset` run documented above. - Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart. - - For `GvsExtractCallset` only, if you want to output VCFs that are compressed using bgzip, set the `bgzip_output_vcfs` input to `true` to generate VCFs that are compressed using bgzip. - - For `GvsExtractCallsetPgenMerged` only, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5 + - For both `GvsExtractCallset` and `GvsExtractCallsetPgenMerged`, select the workflow option "Retry with more memory" and choose a "Memory retry factor" of 1.5. + - For `GvsExtractCallset`, make sure to specify the appropriate `maximum_alternate_alleles` value (currently 100). + - For `GvsExtractCallset`, set the `bgzip_output_vcfs` input to `true` if you want the output VCFs compressed with bgzip. - These workflows do not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option. 1. `GvsCalculatePrecisionAndSensitivity` workflow - Please see the detailed instructions for running the Precision and Sensitivity workflow [here](../../tieout/AoU_PRECISION_SENSITIVITY.md). diff --git a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl index b44e49f0312..38ca31a05a9 100644 --- a/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl +++ b/scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl @@ -10,6 +10,8 @@ workflow GvsCalculatePrecisionAndSensitivity { String dataset_name String filter_set_name File interval_list + File? vcf_eval_bed_file + Array[String] chromosomes = ["chr20"] String project_id Array[String] sample_names @@ -32,6 +34,7 @@ workflow GvsCalculatePrecisionAndSensitivity { parameter_meta { call_set_identifier: "The name of the callset for which we are calculating precision and sensitivity." + chromosomes: "The chromosome(s) on which to analyze precision and sensitivity. The default value for this is `['chr20']`." dataset_name: "The GVS BigQuery dataset name." filter_set_name: "The filter_set_name used to generate the callset." interval_list: "The intervals over which to calculate precision and sensitivity." @@ -41,6 +44,7 @@ workflow GvsCalculatePrecisionAndSensitivity { truth_vcf_indices: "A list of the VCF indices for the truth data VCFs supplied above." truth_beds: "A list of the bed files for the truth data used for analyzing the samples in `sample_names`." ref_fasta: "The cloud path for the reference fasta sequence." + vcf_eval_bed_file: "Optional bed file for EvaluateVcf; if passed, will be used instead of chromosomes."
} String output_basename = call_set_identifier + "_PS" @@ -132,7 +136,8 @@ workflow GvsCalculatePrecisionAndSensitivity { truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], truth_bed = truth_beds[i], - interval_list = interval_list, + vcf_eval_bed_file = vcf_eval_bed_file, + chromosomes = chromosomes, output_basename = sample_name + "-bq_roc_filtered", is_vqsr_lite = IsVQSRLite.is_vqsr_lite, ref_fasta = ref_fasta, @@ -146,7 +151,8 @@ workflow GvsCalculatePrecisionAndSensitivity { truth_vcf = truth_vcfs[i], truth_vcf_index = truth_vcf_indices[i], truth_bed = truth_beds[i], - interval_list = interval_list, + vcf_eval_bed_file = vcf_eval_bed_file, + chromosomes = chromosomes, all_records = true, output_basename = sample_name + "-bq_all", is_vqsr_lite = IsVQSRLite.is_vqsr_lite, @@ -377,9 +383,10 @@ task EvaluateVcf { File truth_vcf File truth_vcf_index File truth_bed + File? vcf_eval_bed_file + Array[String] chromosomes Boolean all_records = false - File interval_list File ref_fasta @@ -396,6 +403,14 @@ task EvaluateVcf { String max_score_field_tag = if (is_vqsr_lite == true) then 'MAX_CALIBRATION_SENSITIVITY' else 'MAX_AS_VQSLOD' command <<< + chromosomes=( ~{sep=' ' chromosomes} ) + + echo "Creating .bed file to control which chromosomes should be evaluated." + for i in "${chromosomes[@]}" + do + echo "$i 0 300000000" >> chromosomes.to.eval.txt + done + # Prepend date, time and pwd to xtrace log entries. PS4='\D{+%F %T} \w $ ' set -o errexit -o nounset -o pipefail -o xtrace @@ -403,7 +418,7 @@ task EvaluateVcf { rtg format --output human_REF_SDF ~{ref_fasta} rtg vcfeval \ - --bed-regions ~{interval_list} \ + --bed-regions ~{if defined(vcf_eval_bed_file) then vcf_eval_bed_file else "chromosomes.to.eval.txt"} \ ~{if all_records then "--all-records" else ""} \ --roc-subset snp,indel \ --vcf-score-field=INFO.~{max_score_field_tag} \ diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl index 75a2b0835d8..501f3807f23 100644 --- a/scripts/variantstore/wdl/GvsExtractCallset.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl @@ -47,6 +47,8 @@ workflow GvsExtractCallset { Float y_bed_weight_scaling = 4 Boolean is_wgs = true Boolean convert_filtered_genotypes_to_nocalls = false + Boolean write_cost_to_db = true + Int? maximum_alternate_alleles } File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta" @@ -72,7 +74,6 @@ workflow GvsExtractCallset { Boolean emit_pls = false Boolean emit_ads = true - Boolean write_cost_to_db = true String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.interval_list' else '-scattered.interval_list' String vcf_extension = if (bgzip_output_vcfs) then '.vcf.bgz' else '.vcf.gz' @@ -240,6 +241,7 @@ workflow GvsExtractCallset { emit_ads = emit_ads, convert_filtered_genotypes_to_nocalls = convert_filtered_genotypes_to_nocalls, write_cost_to_db = write_cost_to_db, + maximum_alternate_alleles = maximum_alternate_alleles, } } @@ -333,6 +335,7 @@ task ExtractTask { Int memory_gib Int? local_sort_max_records_in_ram = 10000000 + Int? 
maximum_alternate_alleles # for call-caching -- check if DB tables haven't been updated since the last run String max_last_modified_timestamp @@ -370,7 +373,18 @@ task ExtractTask { --filter-set-name ~{filter_set_name}' fi - gatk --java-options "-Xmx~{memory_gib - 3}g" \ + # This tool may get invoked with "Retry with more memory" with a different amount of memory than specified in + # the input `memory_gib`, so use the memory-related environment variables rather than the `memory_gib` input. + # https://support.terra.bio/hc/en-us/articles/4403215299355-Out-of-Memory-Retry + if [[ ${MEM_UNIT} == "GB" ]] + then + memory_mb=$(python3 -c "from math import floor; print(floor((${MEM_SIZE} - 3) * 1000))") + else + echo "Unexpected memory unit: ${MEM_UNIT}" 1>&2 + exit 1 + fi + + gatk --java-options "-Xmx${memory_mb}m" \ ExtractCohortToVcf \ --vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} \ ~{"--vet-ranges-extract-table-version " + vet_extract_table_version} \ @@ -387,6 +401,7 @@ task ExtractTask { ~{true='--emit-ads' false='' emit_ads} \ ~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \ ~{true='--convert-filtered-genotypes-to-no-calls' false='' convert_filtered_genotypes_to_nocalls} \ + ~{'--maximum-alternate-alleles ' + maximum_alternate_alleles} \ ${FILTERING_ARGS} \ --dataset-id ~{dataset_name} \ --call-set-identifier ~{call_set_identifier} \ diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index 6b6063a1795..7a7fe39ec5e 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -16,8 +16,10 @@ workflow GvsExtractCallsetPgen { # reference (see plink chromosome codes here: https://www.cog-genomics.org/plink/2.0/data#irreg_output) # ExtractCohortToPgen currently only supports codes "chrM" and "MT" String pgen_chromosome_code = "chrM" - # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254) - Int max_alt_alleles = 254 + # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254). + # PGEN extract is currently only used for AoU "small callsets" which always want 100 max alt alleles, so default + # to that value. + Int max_alt_alleles = 100 # If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead) Boolean lenient_ploidy_validation = false # If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes @@ -318,7 +320,7 @@ task PgenExtractTask { # ExtractCohortToPgen currently only supports codes "chrM" and "MT" String pgen_chromosome_code # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254) - Int? max_alt_alleles + Int max_alt_alleles # If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead) Boolean? lenient_ploidy_validation # If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index f8dc056e30c..dfab2a8f76c 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -63,6 +63,8 @@ workflow GvsJointVariantCalling { File? training_python_script File? scoring_python_script + + Int? 
maximum_alternate_alleles } # If is_wgs is true, we'll use the WGS interval list else, we'll use the Exome interval list. We'll currently use @@ -225,6 +227,7 @@ workflow GvsJointVariantCalling { drop_state = drop_state, bgzip_output_vcfs = bgzip_output_vcfs, is_wgs = is_wgs, + maximum_alternate_alleles = maximum_alternate_alleles, } output { diff --git a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl index d544a6a10f5..91710eae55d 100644 --- a/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartHailIntegration.wdl @@ -35,6 +35,8 @@ workflow GvsQuickstartHailIntegration { String? workspace_bucket String? workspace_id String? submission_id + + Int? maximum_alternate_alleles } String project_id = "gvs-internal" @@ -89,6 +91,7 @@ workflow GvsQuickstartHailIntegration { workspace_bucket = effective_workspace_bucket, workspace_id = effective_workspace_id, submission_id = effective_submission_id, + maximum_alternate_alleles = maximum_alternate_alleles, } call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail { diff --git a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl index 50e7a836ea2..f038ae69b5b 100644 --- a/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartIntegration.wdl @@ -28,6 +28,7 @@ workflow GvsQuickstartIntegration { String? gatk_docker String? hail_version Boolean chr20_X_Y_only = true + Int? maximum_alternate_alleles } File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" @@ -102,6 +103,7 @@ workflow GvsQuickstartIntegration { workspace_id = GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, hail_version = effective_hail_version, + maximum_alternate_alleles = maximum_alternate_alleles, } call QuickstartHailIntegration.GvsQuickstartHailIntegration as GvsQuickstartHailVQSRClassicIntegration { input: @@ -128,6 +130,7 @@ workflow GvsQuickstartIntegration { workspace_id = GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, hail_version = effective_hail_version, + maximum_alternate_alleles = maximum_alternate_alleles, } if (GvsQuickstartHailVQSRLiteIntegration.used_tighter_gcp_quotas) { @@ -173,6 +176,7 @@ workflow GvsQuickstartIntegration { workspace_bucket = GetToolVersions.workspace_bucket, workspace_id = GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, + maximum_alternate_alleles = maximum_alternate_alleles, } call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRClassicIntegration { input: @@ -199,6 +203,7 @@ workflow GvsQuickstartIntegration { workspace_bucket = GetToolVersions.workspace_bucket, workspace_id = GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, + maximum_alternate_alleles = maximum_alternate_alleles, } if (QuickstartVcfVQSRClassicIntegration.used_tighter_gcp_quotas) { @@ -244,6 +249,7 @@ workflow GvsQuickstartIntegration { workspace_bucket = GetToolVersions.workspace_bucket, workspace_id = GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, + maximum_alternate_alleles = maximum_alternate_alleles, } if (QuickstartVcfExomeIntegration.used_tighter_gcp_quotas) { @@ -282,6 +288,7 @@ workflow GvsQuickstartIntegration { workspace_bucket = GetToolVersions.workspace_bucket, workspace_id = 
GetToolVersions.workspace_id, submission_id = GetToolVersions.submission_id, + maximum_alternate_alleles = maximum_alternate_alleles, git_branch_or_tag = git_branch_or_tag, } diff --git a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl index e56bdfe3027..570242cfc49 100644 --- a/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl +++ b/scripts/variantstore/wdl/GvsQuickstartVcfIntegration.wdl @@ -33,6 +33,8 @@ workflow GvsQuickstartVcfIntegration { String? workspace_bucket String? workspace_id String? submission_id + + Int? maximum_alternate_alleles } String project_id = "gvs-internal" @@ -110,6 +112,7 @@ workflow GvsQuickstartVcfIntegration { git_branch_or_tag = git_branch_or_tag, git_hash = effective_git_hash, tighter_gcp_quotas = false, + maximum_alternate_alleles = maximum_alternate_alleles, } # Only assert identical outputs if we did not filter (filtering is not deterministic) OR if we are using VQSR Lite (which is deterministic) diff --git a/scripts/variantstore/wdl/GvsTieoutPgenToVcf.wdl b/scripts/variantstore/wdl/GvsTieoutPgenToVcf.wdl index 9150313d25d..7a84adcf27a 100644 --- a/scripts/variantstore/wdl/GvsTieoutPgenToVcf.wdl +++ b/scripts/variantstore/wdl/GvsTieoutPgenToVcf.wdl @@ -55,7 +55,7 @@ task Tieout { # Now just the contigs we care about zgrep -E '^##' 0000000000-quickit.vcf.gz | grep -E '^##contig=> vcf_header.txt - # Tab-separated comparision-friendly columns + # Tab-separated comparison-friendly columns acc="" for item in \\#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT $(cat ../samples.txt) do diff --git a/scripts/variantstore/wdl/GvsTieoutVcfMaxAltAlleles.wdl b/scripts/variantstore/wdl/GvsTieoutVcfMaxAltAlleles.wdl new file mode 100644 index 00000000000..532ec3fd6ab --- /dev/null +++ b/scripts/variantstore/wdl/GvsTieoutVcfMaxAltAlleles.wdl @@ -0,0 +1,98 @@ +version 1.0 + +import "GvsUtils.wdl" as Utils + +workflow GvsTieoutVcfMaxAltAlleles { + input { + String unfiltered_output_gcs_path + String filtered_output_gcs_path + Int max_alt_alleles + String? variants_docker + } + + if (!defined(variants_docker)) { + call Utils.GetToolVersions + } + + String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) + + call Tieout { + input: + unfiltered_output_gcs_path = unfiltered_output_gcs_path, + filtered_output_gcs_path = filtered_output_gcs_path, + max_alt_alleles = max_alt_alleles, + variants_docker = effective_variants_docker, + } +} + +task Tieout { + input { + String unfiltered_output_gcs_path + String filtered_output_gcs_path + Int max_alt_alleles + String variants_docker + } + command <<< + # Prepend date, time and pwd to xtrace log entries. + PS4='\D{+%F %T} \w $ ' + set -o errexit -o nounset -o pipefail -o xtrace + + mkdir unfiltered filtered compare + + cd unfiltered + gcloud storage cp '~{unfiltered_output_gcs_path}/*' . + + cd .. + cd filtered + gcloud storage cp '~{filtered_output_gcs_path}/*' . + + cd .. + + for unfiltered in unfiltered/*.vcf.gz + do + base="$(basename $unfiltered)" + filtered="filtered/${base}" + out=${base%-*} + bcftools isec $filtered $unfiltered -p compare/${out} + done + + # Identify the 'isec' output files that represent entries which are "private" to either VCF file being compared. + find . 
-name README.txt | xargs grep -h private | awk '{print $1}' > private_files.txt + + # Turn off errexit, the grep below is expected to return non-zero + set +o errexit + + # Find all the private files, print out all the ALTs, look for any lines that do *not* have at least two commas. + # All ALT entries are expected to have two or more commas because the maximum number of ALT alleles was specified + # as two. So the unfiltered file would have entries like ALT1,ALT2,...,ALTN while the filtered file would not, + # and these should be the only differences between the two files. + cat private_files.txt | xargs -n 1 bcftools query -f '%ALT\n' | grep -E -v ',.*,' > unexpected.txt + + rc=$? + if [[ $rc -eq 0 ]] + then + echo "Unexpected ALTs found, see 'unexpected' output for details." 1>&2 + exit 1 + fi + + set -o errexit + + # Similar to above, except making sure that we did observe some filtered sites. At the time of this + # writing there are 4001 sites filtered for > 2 entries; conservatively setting the threshold at 1000. + filtered_count=$(cat private_files.txt | xargs -n 1 bcftools query -f '%ALT\n' | grep -E ',.*,' | wc -l) + if [[ $filtered_count -lt 1000 ]] + then + echo "Unexpectedly found fewer than 1000 filtered ALTs, see 'tarball' output for details." 1>&2 + exit 1 + fi + + tar cfvz tieout.tgz compare + >>> + output { + File tarball = "tieout.tgz" + File unexpected = "unexpected.txt" + } + runtime { + docker: variants_docker + } +} diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index 40f0fee9651..bfb1b47aff7 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -73,8 +73,8 @@ task GetToolVersions { # there are a handlful of tasks that require the larger GNU libc-based `slim`. 
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-04-22-alpine-32804b134a75" - String gatk_docker = "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2024_04_24_61922a303" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" + String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-05-01-gatkbase-617d4d1c7f64" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java index 2e230ba44d5..3eae6e1d9aa 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java @@ -145,6 +145,13 @@ public enum VQScoreFilteringType {GENOTYPE, SITES, NONE} ) private boolean convertFilteredGenotypesToNoCalls = false; + @Argument( + fullName = "maximum-alternate-alleles", + doc = "The maximum number of alternate alleles a site can have before it is hard-filtered from the output", + optional = true + ) + private Long maximumAlternateAlleles = 0L; + @Argument( fullName = "snps-truth-sensitivity-filter-level", mutex = {"snps-lod-score-cutoff"}, @@ -414,6 +421,10 @@ protected void onStartup() { "if no sample file (--sample-file) is provided."); } + OptionalLong optionalMaximumAlternateAlleles = + maximumAlternateAlleles != null && maximumAlternateAlleles > 0L ? + OptionalLong.of(maximumAlternateAlleles) : OptionalLong.empty(); + engine = !isVQSR ? 
new ExtractCohortVETSEngine( projectID, @@ -441,6 +452,7 @@ protected void onStartup() { emitADs, vqScoreFilteringType, convertFilteredGenotypesToNoCalls, + optionalMaximumAlternateAlleles, inferredReferenceState, presortedAvroFiles, this::apply) @@ -471,6 +483,7 @@ protected void onStartup() { emitADs, vqScoreFilteringType, convertFilteredGenotypesToNoCalls, + optionalMaximumAlternateAlleles, inferredReferenceState, presortedAvroFiles, this::apply); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index a1f62236e8d..0c201ff1984 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -46,6 +46,7 @@ public class ExtractCohortEngine { private final Double vqScoreINDELThreshold; private final ExtractCohort.VQScoreFilteringType vqScoreFilteringType; private final boolean convertFilteredGenotypesToNoCalls; + private final OptionalLong maxAlternateAlleleCount; private final String projectID; private final boolean emitPLs; @@ -136,6 +137,7 @@ public ExtractCohortEngine(final String projectID, final boolean emitADs, final ExtractCohort.VQScoreFilteringType vqScoreFilteringType, final boolean convertFilteredGenotypesToNoCalls, + final OptionalLong maxAlternateAlleleCount, final GQStateEnum inferredReferenceState, final boolean presortedAvroFiles, final Consumer variantContextConsumer @@ -190,6 +192,7 @@ else if (vetRangesExtractTableVersion == VetRangesExtractVersionEnum.V1) { this.vqScoreINDELThreshold = vqScoreINDELThreshold; this.vqScoreFilteringType = vqScoreFilteringType; this.convertFilteredGenotypesToNoCalls = convertFilteredGenotypesToNoCalls; + this.maxAlternateAlleleCount = maxAlternateAlleleCount; this.filterSetSiteTableRef = vqScoreFilteringType.equals(ExtractCohort.VQScoreFilteringType.NONE) ? null : new TableReference(filterSetSiteTableName, SchemaUtils.FILTER_SET_SITE_FIELDS); this.filterSetInfoTableRef = vqScoreFilteringType.equals(ExtractCohort.VQScoreFilteringType.NONE) ? 
null : new TableReference(filterSetInfoTableName, getFilterSetInfoTableFields()); @@ -583,6 +586,15 @@ VariantContext finalizeCurrentVariant(final List unmergedVariant vcWithRef.genotypes(genotypes); mergedVC = vcWithRef.make(); + if (maxAlternateAlleleCount.isPresent()) { + if (mergedVC.getAlternateAlleles().size() > maxAlternateAlleleCount.getAsLong()) { + logger.warn("Dropping site on contig {} pos [{}, {}] with {} alt alleles versus maximum {}", + mergedVC.getContig(), mergedVC.getStart(), mergedVC.getEnd(), + mergedVC.getAlternateAlleles().size(), maxAlternateAlleleCount.getAsLong()); + return null; + } + } + ReferenceContext referenceContext = new ReferenceContext(refSource, new SimpleInterval(mergedVC)); VariantContext genotypedVC = annotationEngine.annotateContext(mergedVC, new FeatureContext(), referenceContext, null, a -> true); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortVETSEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortVETSEngine.java index 817d9989ffb..87f1ba0c8cd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortVETSEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortVETSEngine.java @@ -18,6 +18,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.OptionalLong; import java.util.function.Consumer; import java.util.stream.Stream; @@ -75,6 +76,7 @@ public ExtractCohortVETSEngine(final String projectID, final boolean emitADs, final ExtractCohort.VQScoreFilteringType vqScoreFilteringType, final boolean convertFilteredGenotypesToNoCalls, + final OptionalLong maximumAlternateAlleles, final GQStateEnum inferredReferenceState, final boolean presortedAvroFiles, final Consumer variantContextConsumer @@ -105,6 +107,7 @@ public ExtractCohortVETSEngine(final String projectID, emitADs, vqScoreFilteringType, convertFilteredGenotypesToNoCalls, + maximumAlternateAlleles, inferredReferenceState, presortedAvroFiles, variantContextConsumer);
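
The `${FULL_IMAGE_ID:7:12}` slice in the updated VM doc above is easiest to see with a concrete value. A minimal sketch of what it does, using an entirely made-up image ID (illustration only, not output from a real build):

```bash
# --iidfile writes the full image ID: "sha256:" followed by 64 hex characters (value below is made up).
FULL_IMAGE_ID="sha256:3f1c9a7d2b4e8c0d9a6b5e4f3c2d1a0b9c8d7e6f5a4b3c2d1e0f9a8b7c6d5e4f"

# Drop the 7-character "sha256:" prefix and keep the next 12 characters -- the same
# short ID that `docker images` prints in its IMAGE ID column.
IMAGE_ID=${FULL_IMAGE_ID:7:12}
echo "${IMAGE_ID}"   # 3f1c9a7d2b4e
```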
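
For anyone who wants to spot-check a VCF extract produced with the new `maximum_alternate_alleles` cap without running the full `GvsTieoutVcfMaxAltAlleles` workflow, here is a minimal sketch along the lines of the tieout's bcftools logic. The shard name and the cap of 100 are assumptions for illustration, not outputs of any particular run:

```bash
#!/usr/bin/env bash
set -o errexit -o nounset -o pipefail

# Illustrative values -- substitute a real extract shard and the cap used for the run.
MAX_ALT_ALLELES=100
VCF="0000000000-quickit.vcf.gz"

# %ALT prints the comma-separated alt alleles for each site, so with awk's field
# separator set to "," the field count NF is the number of alt alleles at that site.
over_cap=$(bcftools query -f '%ALT\n' "${VCF}" | awk -F',' -v max="${MAX_ALT_ALLELES}" 'NF > max' | wc -l)

if [[ "${over_cap}" -gt 0 ]]
then
  echo "Found ${over_cap} site(s) with more than ${MAX_ALT_ALLELES} alt alleles" 1>&2
  exit 1
fi

echo "No sites exceed ${MAX_ALT_ALLELES} alt alleles"
```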