Skip to content

Commit

Permalink
Add malaria spanning deletion exception regression test with fix (#8802)
Browse files Browse the repository at this point in the history
* Add malaria spanning deletion exception regression test with fix

* Disabling codecov.

---------

Co-authored-by: Jonn Smith <[email protected]>
  • Loading branch information
ldgauthier and jonn-smith authored May 1, 2024
1 parent aed8b1b commit ec39c37
Show file tree
Hide file tree
Showing 5 changed files with 113 additions and 11 deletions.
9 changes: 5 additions & 4 deletions .github/actions/upload-gatk-test-results/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,10 @@ runs:
name: test-results-${{ inputs.is-docker == 'true' && 'docker-' || '' }}${{ matrix.Java }}-${{ matrix.testType }}
path: build/reports/tests

- name: Upload to codecov
run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
shell: bash
# Disabling codecov because it is timing out and failing builds that otherwise succeed.
## - name: Upload to codecov
## run: bash <(curl -s https://raw.githubusercontent.com/broadinstitute/codecov-bash-uploader/main/codecov-verified.bash)
## shell: bash

- name: Upload Reports
if: ${{ inputs.only-artifact != 'true' }}
Expand Down Expand Up @@ -91,4 +92,4 @@ runs:
run: |
pip install --user PyGithub;
python scripts/github_actions/Reporter.py ${{ steps.uploadreports.outputs.view_url }};
shell: bash
shell: bash
Original file line number Diff line number Diff line change
Expand Up @@ -327,13 +327,11 @@ public static void makeGenotypeCall(final int ploidy,
gb.alleles(noCallAlleles(ploidy));
} else if (assignmentMethod == GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN ||
assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) {
if (assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
}
if ((genotypeLikelihoods == null || !isInformative(genotypeLikelihoods)) && assignmentMethod == GenotypeAssignmentMethod.PREFER_PLS) {
if (originalGT == null) {
throw new IllegalArgumentException("original GT cannot be null if assignmentMethod is PREFER_PLS");
} else {
gb.alleles(bestMatchToOriginalGT(allelesToUse, originalGT.getAlleles()));
}
} else {
final int maxLikelihoodIndex = MathUtils.maxElementIndex(genotypeLikelihoods);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -668,4 +668,37 @@ public void testAddedHcAsAnnotations() throws IOException {
true
);
}
@Test
public void testSpanDelRegression() throws IOException {

final File input = getTestFile("reblock_cleanup_bug_variant.g.vcf");
final File output = createTempFile("reblockedgvcf", ".vcf");

final ArgumentsBuilder args = new ArgumentsBuilder();

args.addReference(new File(pf_reference))
.add("V", input)
.addFlag("do-qual-approx")
.add(StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false")
.add("A", "AssemblyComplexity")
.addFlag("floor-blocks")
.add("GQB", 20)
.add("GQB", 30)
.add("GQB", 40)
.addOutput(output);

runCommandLine(args);

Pair<VCFHeader, List<VariantContext>> actual = VariantContextTestUtils.readEntireVCFIntoMemory(output.getAbsolutePath());
List<VariantContext> variants = actual.getRight();
Assert.assertEquals(variants.size(), 4);
VariantContext v0 = variants.get(0);
//crappy deletions are all collapsed into GQ0 ref block
Assert.assertEquals(v0.getStart(), 646914);
Assert.assertEquals(v0.getAttributeAsInt(VCFConstants.END_KEY, 0), 646953);
Assert.assertTrue(v0.getGenotype(0).isHomRef());
Assert.assertEquals(v0.getGenotype(0).getGQ(), 0);
//no more star alleles because deletions are all gone
Assert.assertFalse(variants.stream().anyMatch(v -> v.getAlternateAlleles().contains(Allele.SPAN_DEL)));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele not already represented at this location by REF and ALT">
##FILTER=<ID=LowQual,Description="Low quality">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phasing set (typically the position of the first variant in the set)">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GATKCommandLine=<ID=HaplotypeCaller,CommandLine="HaplotypeCaller --annotate-with-num-discovered-alleles true --heterozygosity 0.001 --indel-heterozygosity 1.25E-4 --heterozygosity-stdev 0.01 --sample-ploidy 2 --contamination-fraction-to-filter 0.0 --gvcf-gq-bands 10 --gvcf-gq-bands 20 --gvcf-gq-bands 30 --gvcf-gq-bands 40 --gvcf-gq-bands 50 --gvcf-gq-bands 60 --gvcf-gq-bands 70 --gvcf-gq-bands 80 --gvcf-gq-bands 90 --linked-de-bruijn-graph true --pileup-detection true --pileup-detection-enable-indel-pileup-calling true --bam-output SEN_2012_SLP.CXX_0108.haplotype_caller.Pf3D7_11_v3.bamout.bam --smith-waterman FASTEST_AVAILABLE --emit-ref-confidence GVCF --output SEN_2012_SLP.CXX_0108.haplotype_caller.Pf3D7_11_v3.g.vcf.gz --intervals Pf3D7_11_v3 --input gs://fc-b06e896e-cc1d-4deb-b638-f7b87c3e5dbd/results/SRFlowcell/SEN_2012_SLP.CXX_0108/reads/aligned/SEN_2012_SLP.CXX_0108.aligned.merged.markDuplicates.sorted.BQSR.bam --reference /cromwell_root/broad-dsde-methods-long-reads/resources/references/plasmodb_release-61/Pfalciparum3D7/fasta/data/PlasmoDB-61_Pfalciparum3D7_Genome.fasta --annotation AssemblyComplexity --annotation-group StandardAnnotation --annotation-group StandardHCAnnotation --use-posteriors-to-calculate-qual false --dont-use-dragstr-priors false --use-new-qual-calculator true --standard-min-confidence-threshold-for-calling 30.0 --max-alternate-alleles 6 --max-genotype-count 1024 --num-reference-samples-if-no-call 0 --genotype-assignment-method USE_PLS_TO_ASSIGN --output-mode EMIT_VARIANTS_ONLY --all-site-pls false --flow-likelihood-parallel-threads 0 --flow-likelihood-optimized-comp false --trim-to-haplotype true --exact-matching false --flow-use-t0-tag false --flow-probability-threshold 0.003 --flow-remove-non-single-base-pair-indels false --flow-remove-one-zero-probs false --flow-quantization-bins 121 --flow-fill-empty-bins-value 0.001 --flow-symmetric-indel-probs false --flow-report-insertion-or-deletion false --flow-disallow-probs-larger-than-call false --flow-lump-probs false --flow-retain-max-n-probs-base-format false --flow-probability-scaling-factor 10 --flow-order-cycle-length 4 --keep-boundary-flows false --floor-blocks false --indel-size-to-eliminate-in-ref-model 10 --disable-optimizations false --dragen-mode false --dragen-378-concordance-mode false --flow-mode NONE --apply-bqd false --apply-frd false --disable-spanning-event-genotyping false --transform-dragen-mapping-quality false --mapping-quality-threshold-for-genotyping 20 --max-effective-depth-adjustment-for-frd 0 --just-determine-active-regions false --dont-genotype false --do-not-run-physical-phasing false --do-not-correct-overlapping-quality false --use-filtered-reads-for-annotations false --use-flow-aligner-for-stepwise-hc-filtering false --adaptive-pruning false --do-not-recover-dangling-branches false --recover-dangling-heads false --kmer-size 10 --kmer-size 25 --dont-increase-kmer-sizes-for-cycles false --allow-non-unique-kmers-in-ref false --num-pruning-samples 1 --min-dangling-branch-length 4 --recover-all-dangling-branches false --max-num-haplotypes-in-population 128 --min-pruning 2 --adaptive-pruning-initial-error-rate 0.001 --pruning-lod-threshold 2.302585092994046 --pruning-seeding-lod-threshold 9.210340371976184 --max-unpruned-variants 100 --disable-artificial-haplotype-recovery false --enable-legacy-graph-cycle-detection false --debug-assembly false --debug-graph-transformations false --capture-assembly-failure-bam false --num-matching-bases-in-dangling-end-to-recover -1 --error-correction-log-odds -Infinity --error-correct-reads false --kmer-length-for-read-error-correction 25 --min-observations-for-kmer-to-be-solid 20 --likelihood-calculation-engine PairHMM --base-quality-score-threshold 18 --dragstr-het-hom-ratio 2 --dont-use-dragstr-pair-hmm-scores false --pair-hmm-gap-continuation-penalty 10 --expected-mismatch-rate-for-read-disqualification 0.02 --pair-hmm-implementation FASTEST_AVAILABLE --pcr-indel-model CONSERVATIVE --phred-scaled-global-read-mismapping-rate 45 --disable-symmetric-hmm-normalizing false --disable-cap-base-qualities-to-map-quality false --enable-dynamic-read-disqualification-for-genotyping false --dynamic-read-disqualification-threshold 1.0 --native-pair-hmm-threads 4 --native-pair-hmm-use-double-precision false --flow-hmm-engine-min-indel-adjust 6 --flow-hmm-engine-flat-insertion-penatly 45 --flow-hmm-engine-flat-deletion-penatly 45 --use-pdhmm false --use-pdhmm-overlap-optimization false --make-determined-haps-from-pd-code false --print-pileupcalling-status false --fallback-gga-if-pdhmm-fails true --pileup-detection-active-region-phred-threshold 0.0 --num-artificial-haplotypes-to-add-per-allele 5 --artifical-haplotype-filtering-kmer-size 10 --pileup-detection-snp-alt-threshold 0.1 --pileup-detection-indel-alt-threshold 0.1 --pileup-detection-absolute-alt-depth 0.0 --pileup-detection-snp-adjacent-to-assembled-indel-range 5 --pileup-detection-snp-basequality-filter 12 --pileup-detection-bad-read-tolerance 0.0 --pileup-detection-proper-pair-read-badness true --pileup-detection-edit-distance-read-badness-threshold 0.08 --pileup-detection-chimeric-read-badness true --pileup-detection-template-mean-badness-threshold 0.0 --pileup-detection-template-std-badness-threshold 0.0 --pileup-detection-filter-assembly-alt-bad-read-tolerance 0.0 --pileup-detection-edit-distance-read-badness-for-assembly-filtering-threshold 0.12 --bam-writer-type CALLED_HAPLOTYPES --dont-use-soft-clipped-bases false --override-fragment-softclip-check false --min-base-quality-score 10 --max-mnp-distance 0 --force-call-filtered-alleles false --reference-model-deletion-quality 30 --soft-clip-low-quality-ends false --allele-informative-reads-overlap-margin 2 --smith-waterman-dangling-end-match-value 25 --smith-waterman-dangling-end-mismatch-penalty -50 --smith-waterman-dangling-end-gap-open-penalty -110 --smith-waterman-dangling-end-gap-extend-penalty -6 --smith-waterman-haplotype-to-reference-match-value 200 --smith-waterman-haplotype-to-reference-mismatch-penalty -150 --smith-waterman-haplotype-to-reference-gap-open-penalty -260 --smith-waterman-haplotype-to-reference-gap-extend-penalty -11 --smith-waterman-read-to-haplotype-match-value 10 --smith-waterman-read-to-haplotype-mismatch-penalty -15 --smith-waterman-read-to-haplotype-gap-open-penalty -30 --smith-waterman-read-to-haplotype-gap-extend-penalty -5 --flow-assembly-collapse-hmer-size 0 --flow-assembly-collapse-partial-mode false --flow-filter-alleles false --flow-filter-alleles-qual-threshold 30.0 --flow-filter-alleles-sor-threshold 3.0 --flow-filter-lone-alleles false --flow-filter-alleles-debug-graphs false --min-assembly-region-size 50 --max-assembly-region-size 300 --active-probability-threshold 0.002 --max-prob-propagation-distance 50 --force-active false --assembly-region-padding 100 --padding-around-indels 75 --padding-around-snps 20 --padding-around-strs 75 --max-extension-into-assembly-region-padding-legacy 25 --max-reads-per-alignment-start 50 --enable-legacy-assembly-region-trimming false --interval-set-rule UNION --interval-padding 0 --interval-exclusion-padding 0 --interval-merging-rule ALL --read-validation-stringency SILENT --seconds-between-progress-updates 10.0 --disable-sequence-dictionary-validation false --create-output-bam-index true --create-output-bam-md5 false --create-output-variant-index true --create-output-variant-md5 false --max-variants-per-shard 0 --lenient false --add-output-sam-program-record true --add-output-vcf-command-line true --cloud-prefetch-buffer 40 --cloud-index-prefetch-buffer -1 --disable-bam-index-caching false --sites-only-vcf-output false --help false --version false --showHidden false --verbosity INFO --QUIET false --use-jdk-deflater false --use-jdk-inflater false --gcs-max-retries 20 --gcs-project-for-requester-pays --disable-tool-default-read-filters false --minimum-mapping-quality 20 --disable-tool-default-annotations false --enable-all-annotations false --allow-old-rms-mapping-quality-annotation-data false --assembly-complexity-reference-mode false",Version="4.5.0.0",Date="April 23, 2024 at 6:39:27 PM GMT">
##GVCFBlock0-10=minGQ=0(inclusive),maxGQ=10(exclusive)
##GVCFBlock10-20=minGQ=10(inclusive),maxGQ=20(exclusive)
##GVCFBlock20-30=minGQ=20(inclusive),maxGQ=30(exclusive)
##GVCFBlock30-40=minGQ=30(inclusive),maxGQ=40(exclusive)
##GVCFBlock40-50=minGQ=40(inclusive),maxGQ=50(exclusive)
##GVCFBlock50-60=minGQ=50(inclusive),maxGQ=60(exclusive)
##GVCFBlock60-70=minGQ=60(inclusive),maxGQ=70(exclusive)
##GVCFBlock70-80=minGQ=70(inclusive),maxGQ=80(exclusive)
##GVCFBlock80-90=minGQ=80(inclusive),maxGQ=90(exclusive)
##GVCFBlock90-100=minGQ=90(inclusive),maxGQ=100(exclusive)
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=HAPCOMP,Number=A,Type=Integer,Description="Edit distances of each alt allele's most common supporting haplotype from closest germline haplotype, excluding differences at the site in question.">
##INFO=<ID=HAPDOM,Number=A,Type=Float,Description="For each alt allele, fraction of read support that best fits the most-supported haplotype containing the allele">
##INFO=<ID=HEC,Number=.,Type=Integer,Description="Counts of support for haplotype groups excluding difference at the site in question.">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
##INFO=<ID=MLEAF,Number=A,Type=Float,Description="Maximum likelihood expectation (MLE) for the allele frequency (not necessarily the same as the AF), for each ALT allele, in the same order as listed">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=NDA,Number=1,Type=Integer,Description="Number of alternate alleles discovered (but not necessarily genotyped) at this site">
##INFO=<ID=RAW_MQandDP,Number=2,Type=Integer,Description="Raw data (sum of squared MQ and total depth) for improved RMS Mapping Quality calculation. Incompatible with deprecated RAW_MQ formulation.">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=Pf3D7_01_v3,length=640851>
##contig=<ID=Pf3D7_02_v3,length=947102>
##contig=<ID=Pf3D7_03_v3,length=1067971>
##contig=<ID=Pf3D7_04_v3,length=1200490>
##contig=<ID=Pf3D7_05_v3,length=1343557>
##contig=<ID=Pf3D7_06_v3,length=1418242>
##contig=<ID=Pf3D7_07_v3,length=1445207>
##contig=<ID=Pf3D7_08_v3,length=1472805>
##contig=<ID=Pf3D7_09_v3,length=1541735>
##contig=<ID=Pf3D7_10_v3,length=1687656>
##contig=<ID=Pf3D7_11_v3,length=2038340>
##contig=<ID=Pf3D7_12_v3,length=2271494>
##contig=<ID=Pf3D7_13_v3,length=2925236>
##contig=<ID=Pf3D7_14_v3,length=3291936>
##contig=<ID=Pf3D7_API_v3,length=34250>
##contig=<ID=Pf3D7_MIT_v3,length=5967>
##source=HaplotypeCaller
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SENTh086.12-073.18_SEN.Di
Pf3D7_12_v3 646914 . TATA *,T,<NON_REF> 0.01 . HAPCOMP=0,0,0;HAPDOM=0.00,0.800,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=NaN,NaN,NaN;NDA=3 GT:PL ./.:0,0,0,0,0,0,0,0,0,0
Pf3D7_12_v3 646918 . T <NON_REF> . . END=646919 GT:DP:GQ:MIN_DP:PL 0/0:0:0:0:0,0,0
Pf3D7_12_v3 646920 . TTTTTTTTTTTTTC *,T,<NON_REF> 0 . DP=5;ExcessHet=0.0000;HAPCOMP=0,1,0;HAPDOM=0.00,1.00,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 0/0:4,0,0,0:4:12:0,15,123,12,114,180,15,124,152,151:2,2,0,0
Pf3D7_12_v3 646923 . T A,*,<NON_REF> 0.01 . DP=4;ExcessHet=0.0000;HAPCOMP=0,0,0;HAPDOM=0.111,0.00,0.00;HEC=7,4,1,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=14400,4 GT:AD:DP:GQ:PL:SB 0/0:0,0,0,0:0:0:0,0,1,12,13,99,11,12,61,50:0,0,0,0
Pf3D7_12_v3 646924 . T <NON_REF> . . END=646925 GT:DP:GQ:MIN_DP:PL 0/0:4:12:4:0,12,99
Pf3D7_12_v3 646926 . TTTTTTTC *,T,<NON_REF> 0 . DP=5;ExcessHet=0.0000;HAPCOMP=0,0,0;HAPDOM=0.00,1.00,0.00;HEC=7,4,1,0,0,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 0/0:0,0,0,0:0:0:0,12,180,0,12,0,12,102,12,90:0,0,0,0
Pf3D7_12_v3 646927 . TTTTTTC T,*,<NON_REF> 0.01 . DP=5;ExcessHet=0.0000;HAPCOMP=1,0,0;HAPDOM=0.111,0.00,0.00;HEC=7,4,1,0,0,0,0;MLEAC=0,0,0;MLEAF=0.00,0.00,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 1/2:0,0,0,0:0:0:1,1,1,1,0,0,1,1,1,1:0,0,0,0
Pf3D7_12_v3 646933 . C *,T,<NON_REF> 3.15 . DP=5;ExcessHet=0.0000;HAPCOMP=0,1,0;HAPDOM=0.00,0.111,0.00;HEC=7,4,1,0,0,0,0;MLEAC=1,1,0;MLEAF=0.500,0.500,0.00;NDA=3;RAW_MQandDP=18000,5 GT:AD:DP:GQ:PL:SB 1/1:0,0,0,0:0:1:123,15,0,16,1,1,76,14,15,62:0,0,0,0
Pf3D7_12_v3 646934 . T <NON_REF> . . END=646947 GT:DP:GQ:MIN_DP:PL 0/0:7:15:5:0,15,124
Pf3D7_12_v3 646948 . C <NON_REF> . . END=646948 GT:DP:GQ:MIN_DP:PL 0/0:3:9:3:0,9,74
Pf3D7_12_v3 646949 . A <NON_REF> . . END=646953 GT:DP:GQ:MIN_DP:PL 0/0:5:15:5:0,15,124
Pf3D7_12_v3 646954 . G <NON_REF> . . END=646955 GT:DP:GQ:MIN_DP:PL 0/0:7:21:7:0,21,163
Pf3D7_12_v3 646956 . C <NON_REF> . . END=646956 GT:DP:GQ:MIN_DP:PL 0/0:8:10:8:0,10,159
Pf3D7_12_v3 646957 . A <NON_REF> . . END=646991 GT:DP:GQ:MIN_DP:PL 0/0:8:21:7:0,21,163
Binary file not shown.

0 comments on commit ec39c37

Please sign in to comment.