Skip to content

Commit

Permalink
SNVQ recalibration tool added for flow based reads (#8697)
Browse files Browse the repository at this point in the history
Co-authored-by: Dror Kessler <[email protected]>
  • Loading branch information
ilyasoifer and Dror Kessler authored Apr 4, 2024
1 parent 724b5bc commit 6739e6d
Show file tree
Hide file tree
Showing 73 changed files with 273,250 additions and 272,313 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ public class FlowBasedArgumentCollection implements Serializable {
private static final long serialVersionUID = 0;

public static final String FLOW_USE_T0_TAG = "flow-use-t0-tag";
public static final String PROBABILITY_RATIO_THRESHOLD_LONG_NAME = "flow-probability-threshold";
public static final String REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME = "flow-remove-non-single-base-pair-indels";
public static final String REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME = "flow-remove-one-zero-probs";
public static final String NUMBER_OF_POSSIBLE_PROBS_LONG_NAME = "flow-quantization-bins";
Expand All @@ -27,8 +26,7 @@ public class FlowBasedArgumentCollection implements Serializable {



private static final double DEFAULT_RATIO_THRESHOLD = 0.003;
private static final double DEFAULT_FILLING_VALUE = 0.001;
public static final double DEFAULT_FILLING_VALUE = 0.001;
private static final boolean DEFAULT_REMOVE_LONGER_INDELS = false;
private static final boolean DEFAULT_REMOVE_ONE_TO_ZERO = false;
private static final boolean DEFAULT_SYMMETRIC_INDELS = false;
Expand All @@ -45,10 +43,6 @@ public class FlowBasedArgumentCollection implements Serializable {
@Argument(fullName = FLOW_USE_T0_TAG, doc = "Use t0 tag if exists in the read to create flow matrix", optional = true)
public boolean useT0Tag = DEFAULT_FLOW_USE_T0_TAG;

@Advanced
@Argument(fullName = PROBABILITY_RATIO_THRESHOLD_LONG_NAME, doc = "Lowest probability ratio to be used as an option", optional = true)
public double probabilityRatioThreshold = DEFAULT_RATIO_THRESHOLD;

@Advanced
@Argument(fullName = REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME, doc = "Should the probabilities of more then 1 indel be used", optional = true)
public boolean removeLongerThanOneIndels = DEFAULT_REMOVE_LONGER_INDELS;
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package org.broadinstitute.hellbender.tools.walkers.featuremapping;

import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Hidden;

import java.io.Serializable;
import java.util.List;

/**
* Set of arguments for the {@link AddFlowSNVQuality}
*/
public class AddFlowSNVQualityArgumentCollection implements Serializable{
private static final long serialVersionUID = 1L;
public static final String MAX_PHRED_SCORE_FULL_NAME = "max-phred-score";
public static final String KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME = "keep-supplementary-alignments";
public static final String INCLUDE_QC_FAILED_READ_FULL_NAME = "include-qc-failed-read";
public static final String SNVQ_MODE_FULL_NAME = "snvq-mode";
public static final String OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME = "output-quality-attribute";
public static final String DEBUG_READ_NAME_FULL_NAME = "debug-read-name";
public static final String DEBUG_COLLECT_STATS_INTO_FULL_NAME = "debug-collect-stats-into";

public enum SnvqModeEnum {
Legacy,
Optimistic,
Pessimistic,
Geometric
};

/**
* maximum value for
* delta in score
**/
@Argument(fullName = MAX_PHRED_SCORE_FULL_NAME, doc = "Limit value for phred scores", optional = true)
public double maxPhredScore = Double.NaN;

/**
* keep supplementary alignments?
**/
@Argument(fullName = KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME, doc = "keep supplementary alignments ?", optional = true)
public boolean keepSupplementaryAlignments = true;

@Advanced
@Argument(fullName= INCLUDE_QC_FAILED_READ_FULL_NAME, doc = "include reads with QC failed flag", optional = true)
public boolean includeQcFailedReads = true;

/**
* snvq computation mode
*/
@Argument(fullName = SNVQ_MODE_FULL_NAME, doc = "snvq calculation mode.", optional = true)
public SnvqModeEnum snvMode = SnvqModeEnum.Geometric;

/**
* By default this tool overwrites the QUAL field with the new qualities. Setting this argument saves the original qualities in the specified SAM tag.
*/
@Argument(fullName = OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME, doc = "alternate SAM tag to put original quality scores instead of overwriting the QUAL field. If not used, QUAL will be overwritten.", optional = true)
public String outputQualityAttribute = null;

/**
* debug read names?
**/
@Hidden
@Argument(fullName = DEBUG_READ_NAME_FULL_NAME, doc = "Read names of reads to output details of as part of debugging. ", optional = true)
public List<String> debugReadName = null;

@Advanced
@Hidden
@Argument(fullName= DEBUG_COLLECT_STATS_INTO_FULL_NAME, doc = "Statistics about the reads will be output to given filename.", optional = true)
public String debugCollectStatsInto = null;
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,25 @@ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, fi
return result;
}

/**
* The following functions estimate the error probability for an hmer. specifically two error
* probability values are generated: one for the first base of the hmer and another for the
* rest of its bases.
*
* The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability
* It iterates over the possible valid combinations of errors and sums them up.
*
* @param key - key (hmer length) in flow space
* @param errorProbBands - for each flow (position in the key) three error probabilities are provided:
* [0] - for the hmer being one base shorter
* [1] - for the hmer to be at its length
* [2] - for the hmer to be one base longer
* @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length)
* @param flowOrderLength - the cycle length of of the flow order (usually 4)
* @return an array of two probabilities:
* [0] - probability for the first base of the hmer
* [1] - probability for the rest of the bases of the hmer
*/
@VisibleForTesting
protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -858,9 +858,9 @@ else if ( hasQ || hasZ ) {
cols.put("ReadName", read.getName());

// haplotypes and reference scores
cols.put("PaternalHaplotypeScore", paternal.score);
cols.put("MaternalHaplotypeScore", maternal.score);
cols.put("RefHaplotypeScore", refScore);
cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score));
cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score));
cols.put("RefHaplotypeScore", String.format("%.6f", refScore));

// build haplotype keys
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ public class GroundTruthScorer extends ReadWalker {
private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1;

private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1;
private static final double DEFAULT_RATIO_THRESHOLD = 0.003;

/*
Private accumulator class for counting false/true observations (hence Boolean).
Expand Down Expand Up @@ -502,7 +503,7 @@ public void closeTool() {
// write reports
if ( reportFilePath != null ) {
final GATKReport report = new GATKReport(
BooleanAccumulator.newReportTable(qualReport, "qual", fbargs.probabilityRatioThreshold, omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", DEFAULT_RATIO_THRESHOLD, omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport),
PercentileReport.newReportTable(percentileReports, qualityPercentiles)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,51 @@
package org.broadinstitute.hellbender.tools.walkers.groundtruth;

import org.apache.commons.collections.map.LazySortedMap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

public class SeriesStats {

private static final Logger logger = LogManager.getLogger(SeriesStats.class);

// local state
private double last = Double.NaN;
private int count = 0;
private double sum = 0;
private double min = Double.NaN;
private double max = Double.NaN;
private SortedMap<Double, AtomicInteger> bins = new TreeMap<>();
private int intCount = 0;
private Map<Double, SeriesStats> auxBins = new LinkedHashMap<>();

public void csvWrite(final String path) throws IOException {
logger.info("Writing SeriesStats " + toDigest() + " into " + path);
PrintWriter pw = new PrintWriter(path);
pw.println("value,count");
boolean intKeys = isIntKeys();
for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet() ) {
if ( intKeys ) {
pw.println(String.format("%d,%d", entry.getKey().intValue(), entry.getValue().get()));
} else {
pw.println(String.format("%f,%d", entry.getKey(), entry.getValue().get()));
}
}
pw.close();
}

void add(double v) {
public void add(int v) {
add((double)v);
intCount++;
}

public void add(double v) {

// save in simple values
last = v;
Expand All @@ -31,10 +59,11 @@ void add(double v) {
count++;

// save in bins
if ( bins.containsKey(v) ) {
bins.get(v).incrementAndGet();
final Double key = v;
if ( bins.containsKey(key) ) {
bins.get(key).incrementAndGet();
} else {
bins.put(v, new AtomicInteger(1));
bins.put(key, new AtomicInteger(1));
}
}

Expand Down Expand Up @@ -109,4 +138,23 @@ public double getStd() {
return Math.sqrt(variance);
}

public Map<Double, AtomicInteger> getBins() {
return this.bins;
}

public Map<Double, SeriesStats> getAuxBins() {
return this.auxBins;
}

public String toDigest() {
if ( isIntKeys() ) {
return String.format("count=%d, min=%d, max=%d, median=%d, bin.count=%d", getCount(), (int)getMin(), (int)getMax(), (int)getMedian(), getBins().size());
} else {
return String.format("count=%d, min=%f, max=%f, median=%f, bin.count=%d", getCount(), getMin(), getMax(), getMedian(), getBins().size());
}
}

private boolean isIntKeys() {
return (count == intCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<
@Override
public ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
final double catastrophicErrorRate = fbargs.fillingValue;
final double log10catastrophicErrorRate = Math.log10(fbargs.fillingValue);
final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors.
final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);
return read -> {
final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate);
final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * fbargs.fillingValue)) :
Math.ceil(read.getLength() * fbargs.fillingValue);
final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) :
Math.ceil(read.getLength() * largeEventErrorRate);
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate;
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,13 @@ public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<
@Override
public ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
final double catastrophicErrorRate = Math.log10(fbargs.fillingValue);
final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors.
final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);

return read -> {
final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate));
final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * fbargs.fillingValue));
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*catastrophicErrorRate;
final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate));
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate;
};
}

Expand Down
Loading

0 comments on commit 6739e6d

Please sign in to comment.