Skip to content

Commit

Permalink
VS-1310 Optionally convert filtered genotypes to no-calls (#8793)
Browse files Browse the repository at this point in the history
* Add new parameter to set filtered genotypes to no-calls to ExtractCohort
* Modified ExtractCohortEngine to optionally set genotypes that are filtered (FT flag set - at the genotype leve) to no-calls.
* Renamed VQSR Classic to 'VQSR'
* Renamed VQSR Lite to 'VETS'
* Updated VCF and pgen tests for code changes
  • Loading branch information
gbggrant committed Apr 29, 2024
1 parent cf99923 commit 794cd90
Show file tree
Hide file tree
Showing 17 changed files with 213 additions and 128 deletions.
74 changes: 39 additions & 35 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ workflow GvsExtractCallset {
Int? split_intervals_mem_override
Float x_bed_weight_scaling = 4
Float y_bed_weight_scaling = 4
Boolean write_cost_to_db = true
Boolean is_wgs = true
Boolean convert_filtered_genotypes_to_nocalls = false
}

File reference = "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta"
Expand All @@ -71,6 +71,7 @@ workflow GvsExtractCallset {

Boolean emit_pls = false
Boolean emit_ads = true
Boolean write_cost_to_db = true

String intervals_file_extension = if (zero_pad_output_vcf_filenames) then '-~{output_file_base_name}.vcf.gz.interval_list' else '-scattered.interval_list'

Expand Down Expand Up @@ -203,39 +204,40 @@ workflow GvsExtractCallset {

call ExtractTask {
input:
go = select_first([ValidateFilterSetName.done, true]),
dataset_name = dataset_name,
call_set_identifier = call_set_identifier,
use_VQSR_lite = use_VQSR_lite,
gatk_docker = effective_gatk_docker,
gatk_override = gatk_override,
reference = reference,
reference_index = reference_index,
reference_dict = reference_dict,
fq_samples_to_extract_table = fq_samples_to_extract_table,
interval_index = i,
intervals = SplitIntervals.interval_files[i],
fq_cohort_extract_table = fq_cohort_extract_table,
fq_ranges_cohort_ref_extract_table = fq_ranges_cohort_ref_extract_table,
fq_ranges_cohort_vet_extract_table = fq_ranges_cohort_vet_extract_table,
vet_extract_table_version = GetExtractVetTableVersion.version,
read_project_id = query_project,
do_not_filter_override = do_not_filter_override,
fq_filter_set_info_table = fq_filter_set_info_table,
fq_filter_set_site_table = fq_filter_set_site_table,
fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
drop_state = drop_state,
output_file = vcf_filename,
output_gcs_dir = output_gcs_dir,
max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
extract_preemptible_override = extract_preemptible_override,
extract_maxretries_override = extract_maxretries_override,
disk_override = disk_override,
memory_gib = effective_extract_memory_gib,
emit_pls = emit_pls,
emit_ads = emit_ads,
write_cost_to_db = write_cost_to_db,
go = select_first([ValidateFilterSetName.done, true]),
dataset_name = dataset_name,
call_set_identifier = call_set_identifier,
use_VQSR_lite = use_VQSR_lite,
gatk_docker = effective_gatk_docker,
gatk_override = gatk_override,
reference = reference,
reference_index = reference_index,
reference_dict = reference_dict,
fq_samples_to_extract_table = fq_samples_to_extract_table,
interval_index = i,
intervals = SplitIntervals.interval_files[i],
fq_cohort_extract_table = fq_cohort_extract_table,
fq_ranges_cohort_ref_extract_table = fq_ranges_cohort_ref_extract_table,
fq_ranges_cohort_vet_extract_table = fq_ranges_cohort_vet_extract_table,
vet_extract_table_version = GetExtractVetTableVersion.version,
read_project_id = query_project,
do_not_filter_override = do_not_filter_override,
fq_filter_set_info_table = fq_filter_set_info_table,
fq_filter_set_site_table = fq_filter_set_site_table,
fq_filter_set_tranches_table = if (use_VQSR_lite) then none else fq_filter_set_tranches_table,
filter_set_name = filter_set_name,
drop_state = drop_state,
output_file = vcf_filename,
output_gcs_dir = output_gcs_dir,
max_last_modified_timestamp = GetBQTablesMaxLastModifiedTimestamp.max_last_modified_timestamp,
extract_preemptible_override = extract_preemptible_override,
extract_maxretries_override = extract_maxretries_override,
disk_override = disk_override,
memory_gib = effective_extract_memory_gib,
emit_pls = emit_pls,
emit_ads = emit_ads,
convert_filtered_genotypes_to_nocalls = convert_filtered_genotypes_to_nocalls,
write_cost_to_db = write_cost_to_db,
}
}

Expand Down Expand Up @@ -314,6 +316,7 @@ task ExtractTask {

Boolean emit_pls
Boolean emit_ads
Boolean convert_filtered_genotypes_to_nocalls = false

Boolean do_not_filter_override
String fq_filter_set_info_table
Expand Down Expand Up @@ -383,7 +386,8 @@ task ExtractTask {
--project-id ~{read_project_id} \
~{true='--emit-pls' false='' emit_pls} \
~{true='--emit-ads' false='' emit_ads} \
~{true='' false='--use-vqsr-classic-scoring' use_VQSR_lite} \
~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \
~{true='--convert-filtered-genotypes-to-no-calls' false='' convert_filtered_genotypes_to_nocalls} \
${FILTERING_ARGS} \
--dataset-id ~{dataset_name} \
--call-set-identifier ~{call_set_identifier} \
Expand Down
57 changes: 29 additions & 28 deletions scripts/variantstore/wdl/GvsExtractCallsetPgen.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -419,34 +419,35 @@ task PgenExtractTask {
fi

gatk --java-options "-Xmx${memory_mb}m" \
ExtractCohortToPgen \
--vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} \
~{"--vet-ranges-extract-table-version " + vet_extract_table_version} \
--ref-ranges-extract-fq-table ~{fq_ranges_cohort_ref_extract_table} \
--ref-version 38 \
-R ~{reference} \
-O ~{output_pgen_basename}.pgen \
--local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
--sample-table ~{fq_samples_to_extract_table} \
~{"--inferred-reference-state " + inferred_reference_state} \
-L ~{interval_filename} \
--project-id ~{read_project_id} \
~{true='--emit-pls' false='' emit_pls} \
~{true='--emit-ads' false='' emit_ads} \
~{true='' false='--use-vqsr-classic-scoring' use_VQSR_lite} \
${FILTERING_ARGS} \
--dataset-id ~{dataset_name} \
--call-set-identifier ~{call_set_identifier} \
--wdl-step GvsExtractCallsetPgenPgen \
--wdl-call PgenExtractTask \
--shard-identifier ~{interval_filename} \
~{cost_observability_line} \
--writer-log-file writer.log \
--pgen-chromosome-code ~{pgen_chromosome_code} \
--max-alt-alleles ~{max_alt_alleles} \
~{true='--lenient-ploidy-validation' false='' lenient_ploidy_validation} \
~{true='--preserve-phasing' false='' preserve_phasing} \
--allow-empty-pgen
ExtractCohortToPgen \
--vet-ranges-extract-fq-table ~{fq_ranges_cohort_vet_extract_table} \
~{"--vet-ranges-extract-table-version " + vet_extract_table_version} \
--ref-ranges-extract-fq-table ~{fq_ranges_cohort_ref_extract_table} \
--ref-version 38 \
-R ~{reference} \
-O ~{output_pgen_basename}.pgen \
--local-sort-max-records-in-ram ~{local_sort_max_records_in_ram} \
--sample-table ~{fq_samples_to_extract_table} \
~{"--inferred-reference-state " + inferred_reference_state} \
-L ~{interval_filename} \
--project-id ~{read_project_id} \
~{true='--emit-pls' false='' emit_pls} \
~{true='--emit-ads' false='' emit_ads} \
~{true='' false='--use-vqsr-scoring' use_VQSR_lite} \
--convert-filtered-genotypes-to-no-calls \
${FILTERING_ARGS} \
--dataset-id ~{dataset_name} \
--call-set-identifier ~{call_set_identifier} \
--wdl-step GvsExtractCallsetPgenPgen \
--wdl-call PgenExtractTask \
--shard-identifier ~{interval_filename} \
~{cost_observability_line} \
--writer-log-file writer.log \
--pgen-chromosome-code ~{pgen_chromosome_code} \
--max-alt-alleles ~{max_alt_alleles} \
~{true='--lenient-ploidy-validation' false='' lenient_ploidy_validation} \
~{true='--preserve-phasing' false='' preserve_phasing} \
--allow-empty-pgen


# Drop trailing slash if one exists
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ task GetToolVersions {
# there are a handlful of tasks that require the larger GNU libc-based `slim`.
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us.gcr.io/broad-dsde-methods/variantstore:2024-04-23-alpine-92a8b296e"
String gatk_docker = "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2024_03_22_91a6c15"
String gatk_docker = "us.gcr.io/broad-dsde-methods/broad-gatk-snapshots:varstore_2024_04_24_61922a303"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ public class SchemaUtils {

public static final List<String> SAMPLE_FIELDS = Arrays.asList(SchemaUtils.SAMPLE_NAME_FIELD_NAME, SchemaUtils.SAMPLE_ID_FIELD_NAME);
public static final List<String> YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, VQSLOD, YNG_STATUS);
public static final List<String> VQSLITE_YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALIBRATION_SENSITIVITY, SCORE, YNG_STATUS);
public static final List<String> VETS_YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, CALIBRATION_SENSITIVITY, SCORE, YNG_STATUS);
public static final List<String> TRANCHE_FIELDS = Arrays.asList(TARGET_TRUTH_SENSITIVITY, MIN_VQSLOD, TRANCHE_FILTER_NAME, TRANCHE_MODEL);

public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,20 +122,29 @@ public enum VQScoreFilteringType {GENOTYPE, SITES, NONE}
private boolean emitADs = false;

@Argument(
fullName = "use-vqsr-classic-scoring",
doc = "If true, use VQSR 'Classic' scoring (vqs Lod score). Otherwise use VQSR 'Lite' scoring (calibration_sensitivity)",
fullName = "use-vqsr-scoring",
doc = "If true, use VQSR scoring (vqs Lod score). Otherwise use VETS scoring (calibration_sensitivity)",
optional = true
)
private boolean isVQSRClassic = false;
private boolean isVQSR = false;

@Argument(
fullName = "vqsr-score-filter-by-site",
doc = "If VQSR Score filtering is applied, it should be at a site level. Default is false",
fullName = "vqs-score-filter-by-site",
doc = "If Variant Quality Score filtering is applied, it should be at a site level. Default is false",
optional = true
)
// historical note that this parameter was previously named 'vqsr-score-filter-by-site', changed as it's not VQSR-specific
private boolean performSiteSpecificVQScoreFiltering = false;
private VQScoreFilteringType vqScoreFilteringType = VQScoreFilteringType.NONE;

@Argument(
fullName = "convert-filtered-genotypes-to-no-calls",
doc = "Set filtered genotypes to no-calls. This option can only be used if Variant Quality Score filtering " +
"is applied at the genotype level",
optional = true
)
private boolean convertFilteredGenotypesToNoCalls = false;

@Argument(
fullName = "snps-truth-sensitivity-filter-level",
mutex = {"snps-lod-score-cutoff"},
Expand Down Expand Up @@ -269,12 +278,12 @@ protected String[] customCommandLineValidation() {
errors.add("Parameter 'indels-truth-sensitivity-filter-level' must be between > 0.0 and < 100.0");
}
}
if (!isVQSRClassic) {
if (!isVQSR) {
if (tranchesTableName != null) {
errors.add("Parameter 'tranches-table' is not allowed for VQSR Lite");
errors.add("Parameter 'tranches-table' is not allowed for VETS");

Check warning on line 283 in src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java#L283

Added line #L283 was not covered by tests
}
if ((vqsLodSNPThreshold != null) || (vqsLodINDELThreshold != null)) {
errors.add("Parameters 'snps-lod-score-cutoff' and 'indels-lod-score-cutoff' cannot be used in VQSR Lite mode");
errors.add("Parameters 'snps-lod-score-cutoff' and 'indels-lod-score-cutoff' cannot be used in VETS mode");

Check warning on line 286 in src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohort.java#L286

Added line #L286 was not covered by tests
}
}
if (!errors.isEmpty()) {
Expand All @@ -293,20 +302,26 @@ protected void onStartup() {
vqScoreFilteringType = performSiteSpecificVQScoreFiltering ? VQScoreFilteringType.SITES : VQScoreFilteringType.GENOTYPE;
}

if (convertFilteredGenotypesToNoCalls && vqScoreFilteringType != VQScoreFilteringType.GENOTYPE) {
throw new UserException("The option '--convert-filtered-genotypes-to-no-calls' can ONLY be used if you are filtering at the " +
"Genotype level (you have set '--filter-set-info-table' and NOT set '--vqs-score-filter-by-site')");
}

// filter at a site level (but not necessarily use vqslod)
if ((filterSetSiteTableName != null && filterSetName == null) || (filterSetSiteTableName == null && filterSetName != null)) {
throw new UserException("--filter-set-name and --filter-set-site-table are both necessary for any filtering related operations");
}
if (!vqScoreFilteringType.equals(VQScoreFilteringType.NONE)) {
//noinspection ConstantValue
if (filterSetInfoTableName == null || filterSetSiteTableName == null || filterSetName == null) {
throw new UserException(" --filter-set-info-table, --filter-set-name and --filter-set-site-table are all necessary for any VQSR filtering operations");
throw new UserException(" --filter-set-info-table, --filter-set-name and --filter-set-site-table are all necessary for any Variant Quality" +
" filtering operations");
}
}

Set<VCFHeaderLine> extraHeaderLines = new HashSet<>();
if (!vqScoreFilteringType.equals(VQScoreFilteringType.NONE)) {
if (isVQSRClassic) {
if (isVQSR) {
extraHeaderLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_VQS_LOD_KEY));
FilterSensitivityTools.validateFilteringCutoffs(truthSensitivitySNPThreshold, truthSensitivityINDELThreshold, vqsLodSNPThreshold, vqsLodINDELThreshold, tranchesTableName);
Map<String, Map<Double, Double>> trancheMaps = FilterSensitivityTools.getTrancheMaps(filterSetName, tranchesTableName, projectID);
Expand Down Expand Up @@ -399,8 +414,8 @@ protected void onStartup() {
"if no sample file (--sample-file) is provided.");
}

engine = !isVQSRClassic ?
new ExtractCohortLiteEngine(
engine = !isVQSR ?
new ExtractCohortVETSEngine(
projectID,
header,
annotationEngine,
Expand All @@ -425,6 +440,7 @@ protected void onStartup() {
emitPLs,
emitADs,
vqScoreFilteringType,
convertFilteredGenotypesToNoCalls,
inferredReferenceState,
presortedAvroFiles,
this::apply)
Expand Down Expand Up @@ -454,6 +470,7 @@ protected void onStartup() {
emitPLs,
emitADs,
vqScoreFilteringType,
convertFilteredGenotypesToNoCalls,
inferredReferenceState,
presortedAvroFiles,
this::apply);
Expand Down
Loading

0 comments on commit 794cd90

Please sign in to comment.