From e1471f80419cbf6a61108d77cda6ffd1cce90073 Mon Sep 17 00:00:00 2001 From: Miguel Covarrubias Date: Mon, 6 May 2024 14:01:21 -0400 Subject: [PATCH] Choose sensible max alts default for PGEN extract [VS-1279] (#8811) --- scripts/variantstore/docs/aou/AOU_DELIVERABLES.md | 1 + scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md index 68ea9c39044..115c651e9a0 100644 --- a/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md +++ b/scripts/variantstore/docs/aou/AOU_DELIVERABLES.md @@ -89,6 +89,7 @@ - The `enable_extract_table_ttl` input should be set to `true` (the default value is `false`), which will add a TTL of two weeks to the tables it creates. - `extract_table_prefix` should be set to a name that is unique to the given Region / interval list. See the [naming conventions doc](https://docs.google.com/document/d/1pNtuv7uDoiOFPbwe4zx5sAGH7MyxwKqXkyrpNmBxeow) for guidance on what to use. - Specify the `interval_list` appropriate for the PGEN / VCF extraction run you are performing. + - `GvsExtractCallsetPgen` currently defaults to a maximum of 100 alt alleles (the `max_alt_alleles` input), which means that any site with more than 100 alt alleles will be dropped. - This workflow does not use the Terra Data Entity Model to run, so be sure to select the `Run workflow with inputs defined by file paths` workflow submission option. - Specify the same `call_set_identifier`, `dataset_name`, `project_id`, `extract_table_prefix`, and `interval_list` that were used in the `GvsPrepareRangesCallset` run documented above. - Specify the `interval_weights_bed` appropriate for the PGEN / VCF extraction run you are performing. `gs://gvs_quickstart_storage/weights/gvs_full_vet_weights_1kb_padded_orig.bed` is the interval weights BED used for Quickstart. 
diff --git a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl index 6b6063a1795..7a7fe39ec5e 100644 --- a/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl @@ -16,8 +16,10 @@ workflow GvsExtractCallsetPgen { # reference (see plink chromosome codes here: https://www.cog-genomics.org/plink/2.0/data#irreg_output) # ExtractCohortToPgen currently only supports codes "chrM" and "MT" String pgen_chromosome_code = "chrM" - # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254) - Int max_alt_alleles = 254 + # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254). + # PGEN extract is currently only used for AoU "small callsets" which always want 100 max alt alleles, so default + # to that value. + Int max_alt_alleles = 100 # If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead) Boolean lenient_ploidy_validation = false # If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes @@ -318,7 +320,7 @@ task PgenExtractTask { # ExtractCohortToPgen currently only supports codes "chrM" and "MT" String pgen_chromosome_code # Max number of alt alleles a site can have. If a site exceeds this number, it will not be written (max 254) - Int? max_alt_alleles + Int max_alt_alleles # If true, does not throw an exception for samples@sites with unsupported ploidy (codes it as missing instead) Boolean? lenient_ploidy_validation # If true, preserves phasing in the output PGEN files if phasing is present in the source genotypes