Skip to content

Commit

Permalink
Several GQ0 cleanup changes: (#8741)
Browse files Browse the repository at this point in the history
Set GGVCFs --all-sites GQ0 hom-refs to no-calls
Set regular GGVCFs GQ0 hom-refs to no-calls (any DP, PL) for better AF/AN annotations
Remove PLs in "no data" case where DP=0 for more accurate QUAL score
  • Loading branch information
ldgauthier authored Apr 10, 2024
1 parent 6739e6d commit 7cdc985
Show file tree
Hide file tree
Showing 39 changed files with 28,587 additions and 7,405 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -132,11 +132,17 @@ public void traverse() {
.forEachOrdered(variant -> {

final SimpleInterval variantInterval = new SimpleInterval(variant);
apply(variant,
Collections.singletonList(variant),
new ReadsContext(reads, variantInterval, readFilter),
new ReferenceContext(reference, variantInterval),
new FeatureContext(features, variantInterval));

try{
apply(variant,
Collections.singletonList(variant),
new ReadsContext(reads, variantInterval, readFilter),
new ReferenceContext(reference, variantInterval),
new FeatureContext(features, variantInterval));
} catch (final IllegalStateException e) {
throw new GATKException("Exception thrown at " + variant.getContig() + ":" + variant.getStart()
+ " " + variant.toString(), e);
}

progressMeter.update(variantInterval);
});
Expand All @@ -158,11 +164,16 @@ public void traverse() {
postTransformer)
.collect(Collectors.toList());
if (!filteredVariants.isEmpty()) {
apply(locus,
filteredVariants,
new ReadsContext(reads, locus, readFilter),
new ReferenceContext(reference, locus),
new FeatureContext(features, locus));
try {
apply(locus,
filteredVariants,
new ReadsContext(reads, locus, readFilter),
new ReferenceContext(reference, locus),
new FeatureContext(features, locus));
} catch (final IllegalStateException e) {
throw new GATKException("Exception thrown at first variant start " + filteredVariants.get(0).getContig() + ":" + filteredVariants.get(0).getStart()
+ " " + filteredVariants.get(0).toString(), e);
}

progressMeter.update(locus);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,9 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer

final VariantContext result;

// only re-genotype polymorphic sites
if ( originalVC.isVariant() && originalVC.getAttributeAsInt(VCFConstants.DEPTH_KEY,0) > 0 ) {
// only re-genotype polymorphic sites
// note that the calculateGenotypes method also calculates the QUAL score
final VariantContext regenotypedVC = calculateGenotypes(originalVC, includeNonVariants);
if (regenotypedVC == null) {
return null;
Expand Down Expand Up @@ -186,7 +187,7 @@ private VariantContext regenotypeVC(final VariantContext originalVC, final Refer
//don't count sites with no depth and no confidence towards things like AN and InbreedingCoeff
vcBuilder.genotypes(assignNoCallsAnnotationExcludedGenotypes(result.getGenotypes()));
VariantContext annotated = annotationEngine.annotateContext(vcBuilder.make(), features, ref, null, a -> true);
return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(result, false, keepSB)).make();
return new VariantContextBuilder(annotated).genotypes(cleanupGenotypeAnnotations(annotated, false, keepSB)).make();
} else if (includeNonVariants) {
// For monomorphic sites we need to make sure e.g. the hom ref genotypes are created and only then are passed to the annotation engine.
VariantContext preannotated = new VariantContextBuilder(result).genotypes(cleanupGenotypeAnnotations(result, true, false)).make();
Expand Down Expand Up @@ -461,24 +462,29 @@ static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final
attrs.put(GATKVCFConstants.HAPLOTYPE_CALLER_PHASING_GT_KEY, GenotypeGVCFs.PHASED_HOM_VAR_STRING);
}

// create AD if it's not there
if ( !oldGT.hasAD() && vc.isVariant() ) {
// create AD if it's not there, but only if there's data
if ( !oldGT.hasAD() && vc.isVariant() && depth > 0) {
final int[] AD = new int[vc.getNAlleles()];
AD[0] = depth;
builder.AD(AD);
}

if ( createRefGTs ) {
// move the GQ to RGQ
if (oldGT.hasGQ()) {
//keep 0 depth samples and 0 GQ samples as no-call
if (depth > 0 && oldGT.hasGQ()) {
if (oldGT.getGQ() > 0) {
final List<Allele> refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
builder.alleles(refAlleles);
} else {
builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
}

// move the GQ to RGQ
builder.noGQ();
attrs.put(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY, oldGT.getGQ());
}

//keep 0 depth samples and 0 GQ samples as no-call
if (depth > 0 && oldGT.hasGQ() && oldGT.getGQ() > 0) {
final List<Allele> refAlleles = Collections.nCopies(oldGT.getPloidy(), vc.getReference());
builder.alleles(refAlleles);
} else {
builder.alleles(Collections.nCopies(oldGT.getPloidy(),Allele.NO_CALL));
builder.noGQ().noDP();
}

// also, the PLs are technically no longer usable
Expand All @@ -494,8 +500,8 @@ static List<Genotype> cleanupGenotypeAnnotations(final VariantContext vc, final
* Does this genotype look like it has no reads and should be excluded from annotations?
*/
static boolean excludeFromAnnotations(Genotype oldGT) {
return oldGT.isHomRef() && !oldGT.hasPL()
&& ((oldGT.hasDP() && oldGT.getDP() == 0) || !oldGT.hasDP())
return (oldGT.isHomRef() || oldGT.isNoCall())
&& (!oldGT.hasDP() || oldGT.getDP() == 0)
&& oldGT.hasGQ() && oldGT.getGQ() == 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
final int ploidy = g.getPloidy();
final GenotypeBuilder genotypeBuilder = new GenotypeBuilder(g);
if (!doSomaticMerge) {
//do attribute subsetting
if (g.hasPL() || g.hasAD()) {
int[] perSampleIndexesOfRelevantAlleles = AlleleSubsettingUtils.getIndexesOfRelevantAllelesForGVCF(remappedAlleles, targetAlleles, vc.getStart(), g, false);
if (g.hasPL()) {
Expand All @@ -591,8 +592,10 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
if (g.hasAD()) {
genotypeBuilder.AD(AlleleSubsettingUtils.generateAD(g.getAD(), perSampleIndexesOfRelevantAlleles));
}

}
//clean up low confidence hom refs for better annotations later
} else if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
if (GenotypeGVCFsEngine.excludeFromAnnotations(g)) {
genotypeBuilder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL));
}
}
Expand Down Expand Up @@ -659,7 +662,7 @@ private GenotypesContext mergeRefConfidenceGenotypes(final VariantContext vc,
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(),
genotypeBuilder, assignmentMethod,
g.hasLikelihoods() ? g.getLikelihoods().getAsVector() : null,
targetAlleles, new GenotypeBuilder(g.getSampleName(), originalGTAlleles).make(), null);
targetAlleles, new GenotypeBuilder(g).alleles(originalGTAlleles).make(), null);
mergedGenotypes.add(genotypeBuilder.make());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ public static GenotypesContext subsetAlleles(final GenotypesContext originalGs,
}

// restrict AD to the new allele subset
if(g.hasAD()) {
if(g.hasAD() && gb.makeWithShallowCopy().hasAD()) {
final int[] newAD = getNewAlleleBasedReadCountAnnotation(allelesToKeep, allelePermutation, g.getAD());
gb.AD(newAD);
// if we have recalculated AD and the original genotype had AF but was then removed, then recalculate AF based on AD counts
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,8 @@ protected GenotypesContext iterateOnGenotypes(final VariantContext vc, final Lis
protected void makeGenotypeCall(final Genotype g, final GenotypeBuilder gb,
final double[] genotypeLikelihoods,
final List<Allele> allelesToUse) {
if ( genotypeLikelihoods == null || !GATKVariantContextUtils.isInformative(genotypeLikelihoods) ) {
//gb.alleles(GATKVariantContextUtils.noCallAlleles(g.getAlleles().size())).noGQ();
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.SET_TO_NO_CALL,
genotypeLikelihoods, allelesToUse, null);
} else {
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
genotypeLikelihoods, allelesToUse, null);
}
GATKVariantContextUtils.makeGenotypeCall(g.getPloidy(), gb, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
genotypeLikelihoods, allelesToUse, g, null);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ private Genotype getUpdatedGenotype(final VariantContext vc, final Genotype geno
final GenotypeBuilder builder = new GenotypeBuilder(genotype);

//update genotype types based on posteriors
GATKVariantContextUtils.makeGenotypeCall(vc.getMaxPloidy(2), builder, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, log10Posteriors, vc.getAlleles(), null);
GATKVariantContextUtils.makeGenotypeCall(vc.getMaxPloidy(2), builder, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, log10Posteriors, vc.getAlleles(), genotype, null);

builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY,
Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(log10Posteriors).getAsPLs()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.walkers.ReferenceConfidenceVariantContextMerger;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypeAssignmentMethod;
import org.broadinstitute.hellbender.tools.walkers.genotyper.GenotypesCache;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -134,9 +135,16 @@ else if (a.isSymbolic()) { //use the greater of the priors for non-ref
builder.phased(vc1.getGenotype(genoIdx).isPhased());
if ( posteriors.get(genoIdx) != null ) {
GATKVariantContextUtils.makeGenotypeCall(vc1.getMaxPloidy(HomoSapiensConstants.DEFAULT_PLOIDY), builder,
GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles(), null);
GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN, posteriors.get(genoIdx), vc1.getAlleles(), vc1.getGenotype(genoIdx), null);

builder.attribute(GATKVCFConstants.PHRED_SCALED_POSTERIORS_KEY,
Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs()));
Utils.listFromPrimitives(GenotypeLikelihoods.fromLog10Likelihoods(posteriors.get(genoIdx)).getAsPLs()));
//TODO: refactor me -- this is to make up for the GGVCFs changes fixing GQ0 hom-refs
if (!builder.makeWithShallowCopy().hasPL()) {
final int maxPosteriorIndex = MathUtils.maxElementIndex(posteriors.get(genoIdx));
builder.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(maxPosteriorIndex, posteriors.get(genoIdx)));
builder.alleles(GenotypesCache.get(vc1.getGenotype(genoIdx).getPloidy(), maxPosteriorIndex).asAlleleList(vc1.getAlleles()));
}
}
newContext.add(builder.make());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -819,7 +819,7 @@ private Genotype getCalledGenotype(final VariantContext variant) {
final GenotypeBuilder builderToCallAlleles = new GenotypeBuilder(noCallGT);
//TODO: update to support DRAGEN posteriors
GATKVariantContextUtils.makeGenotypeCall(noCallGT.getPloidy(), builderToCallAlleles, GenotypeAssignmentMethod.USE_PLS_TO_ASSIGN,
noCallGT.getLikelihoods().getAsVector(), variant.getAlleles(), null);
noCallGT.getLikelihoods().getAsVector(), variant.getAlleles(), noCallGT, null);
return builderToCallAlleles.make();
} else {
return variant.getGenotype(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public static boolean isDiploidWithLikelihoods(final Genotype g) {
}

/**
* Returns true if the genotype is a diploid genotype with likelihoods.
* Returns true if the genotype is a called diploid genotype with likelihoods.
*/
public static boolean isCalledAndDiploidWithLikelihoodsOrWithGQ(final Genotype g) {
return Utils.nonNull(g).isCalled() && g.getPloidy() == 2 && (Utils.nonNull(g).hasLikelihoods() || g.hasGQ()) ;
Expand Down Expand Up @@ -142,6 +142,7 @@ public static GenotypeCounts computeDiploidGenotypeCounts(final VariantContext v
/**
* Do we have (or can we infer) likelihoods necessary for allele frequency calculation?
* Some reblocked and/or DRAGEN GVCFs omit likelihoods for ref blocks, but we can estimate them
* Ref blocks without likelihoods need to be retrieved from GenomicDB or CombineGVCFs with the --call-genotypes arg
* If GenomicsDB max alt threshold is too low, non-reference genotypes may also be missing PLs -- we can't estimate, so reject those
* @param g a genotype of unknown call and ploidy
* @return true if we have enough info for AF calculation
Expand Down Expand Up @@ -192,6 +193,6 @@ public static double[] makeApproximateDiploidLog10LikelihoodsFromGQ(Genotype g,
}

public static boolean shouldBeCalled(final Genotype g) {
return !g.isNonInformative() || g.hasGQ();
return !g.isNonInformative() || (g.hasGQ() && g.getGQ() > 0);
}
}
Loading

0 comments on commit 7cdc985

Please sign in to comment.