From e796d20a9e0c0d311db56beeb58cae9ff03bb6d7 Mon Sep 17 00:00:00 2001
From: epiercehoffman
Date: Tue, 23 Jan 2024 14:26:41 -0500
Subject: [PATCH] Rewrite complex SV functional annotation in SVAnnotate
(#8516)
---
.../spark/sv/utils/GATKSVVCFConstants.java | 5 +-
.../tools/walkers/sv/SVAnnotate.java | 6 +
.../tools/walkers/sv/SVAnnotateEngine.java | 293 +++++++++++++--
.../hellbender/utils/SimpleInterval.java | 20 +
.../walkers/sv/SVAnnotateEngineUnitTest.java | 345 +++++++++++++++---
.../walkers/sv/SVAnnotateIntegrationTest.java | 3 +-
.../utils/SimpleIntervalUnitTest.java | 110 ++++++
7 files changed, 698 insertions(+), 84 deletions(-)
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
index 95bb083f03b..c7586265990 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java
@@ -89,7 +89,9 @@ public enum ComplexVariantSubtype {
piDUP_RF,
dDUP,
dDUP_iDEL,
- INS_iDEL
+ INS_iDEL,
+ CTX_PP_QQ,
+ CTX_PQ_QP
}
// not defined in output vcf header but used in internal id that is currently output in the ID column
@@ -163,6 +165,7 @@ public enum ComplexVariantSubtype {
public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT";
public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS";
public static final String TSS_DUP = "PREDICTED_TSS_DUP";
+ public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP";
// SVTYPE classes
public enum StructuralVariantAnnotationType {
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java
index bd808781c74..9c9e2f11efc 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java
@@ -123,6 +123,11 @@
* duplicated. The partial duplication occurs when a duplication has one breakpoint within the transcript and one
* breakpoint after the end of the transcript. When the duplication is in tandem, the result is that there is one
* intact copy of the full endogenous gene.
+ * PREDICTED_PARTIAL_DISPERSED_DUP
+ * Gene(s) which are partially overlapped by the duplicated segment involved in an SV's dispersed duplication.
+ * This annotation is applied to a dispersed (non-tandem) duplication segment that is part of a complex SV if the
+ * duplicated segment overlaps part of a transcript but not the entire transcript (which would be a
+ * PREDICTED_COPY_GAIN event).
* PREDICTED_INV_SPAN
* Gene(s) which are entirely spanned by an SV's inversion. A whole-gene inversion occurs when an inversion spans
* the entire transcript, from the first base of the 5' UTR to the last base of the 3' UTR.
@@ -354,6 +359,7 @@ private void addAnnotationInfoKeysToHeader(final VCFHeader header) {
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NONCODING_SPAN, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Class(es) of noncoding elements spanned by SV."));
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NONCODING_BREAKPOINT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Class(es) of noncoding elements disrupted by SV breakpoint."));
header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NEAREST_TSS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Nearest transcription start site to an intergenic variant."));
+ header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Gene(s) overlapped partially by the duplicated interval involved in a dispersed duplication event in a complex SV."));
}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
index b82da470e65..7c977602303 100644
--- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
+++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java
@@ -12,11 +12,13 @@
import org.broadinstitute.hellbender.utils.SVInterval;
import org.broadinstitute.hellbender.utils.SVIntervalTree;
import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;
import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfTranscriptFeature;
import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils;
import java.util.*;
+import java.util.stream.Collectors;
public class SVAnnotateEngine {
private final int maxBreakendLen;
@@ -24,13 +26,37 @@ public class SVAnnotateEngine {
private final SVIntervalTree nonCodingIntervalTree;
private final SAMSequenceDictionary sequenceDictionary;
- private final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF,
+ @VisibleForTesting
+ protected static final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF,
GATKSVVCFConstants.INT_EXON_DUP,
GATKSVVCFConstants.DUP_PARTIAL,
GATKSVVCFConstants.PARTIAL_EXON_DUP,
GATKSVVCFConstants.COPY_GAIN,
GATKSVVCFConstants.TSS_DUP);
+    // Consequence keys treated as protein-coding annotations when deciding whether a variant is intergenic.
+    // PARTIAL_DISPERSED_DUP is not a member, so it does not by itself prevent the INTERGENIC flag.
+    @VisibleForTesting
+    protected static final Set<String> PROTEIN_CODING_CONSEQUENCES = Sets.newHashSet(
+            GATKSVVCFConstants.LOF,
+            GATKSVVCFConstants.INT_EXON_DUP,
+            GATKSVVCFConstants.DUP_PARTIAL,
+            GATKSVVCFConstants.PARTIAL_EXON_DUP,
+            GATKSVVCFConstants.COPY_GAIN,
+            GATKSVVCFConstants.TSS_DUP,
+            GATKSVVCFConstants.INV_SPAN,
+            GATKSVVCFConstants.MSV_EXON_OVERLAP,
+            GATKSVVCFConstants.UTR,
+            GATKSVVCFConstants.INTRONIC,
+            GATKSVVCFConstants.BREAKEND_EXON);
+
+    // Complex SV subtypes that contain at least one dispersed (non-tandem) duplication segment.
+    @VisibleForTesting
+    protected static final Set<GATKSVVCFConstants.ComplexVariantSubtype> COMPLEX_SUBTYPES_WITH_DISPERSED_DUP =
+            EnumSet.of(
+                    GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
+                    GATKSVVCFConstants.ComplexVariantSubtype.dupINV,
+                    GATKSVVCFConstants.ComplexVariantSubtype.INVdup,
+                    GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup,
+                    GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel,
+                    GATKSVVCFConstants.ComplexVariantSubtype.delINVdup,
+                    GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL);
+
+
// Mini class to package SV type and interval into one object
@VisibleForTesting
protected static final class SVSegment {
@@ -211,14 +237,21 @@ protected static String annotateDeletion(final SimpleInterval variantInterval,
* Get consequence of duplication variant on transcript
* @param variantInterval - SimpleInterval representing structural variant
* @param gtfTranscript - protein-coding GTF transcript
+ * @param isDispersedDuplication - boolean: true if duplication segment is dispersed, false otherwise
* @return - consequence of duplication variant on transcript
*/
@VisibleForTesting
protected static String annotateDuplication(final SimpleInterval variantInterval,
- final GencodeGtfTranscriptFeature gtfTranscript) {
+ final GencodeGtfTranscriptFeature gtfTranscript,
+ boolean isDispersedDuplication) {
final SimpleInterval transcriptInterval = new SimpleInterval(gtfTranscript);
if (variantSpansFeature(variantInterval, transcriptInterval)) {
+ // a fully spanned transcript is COPY_GAIN whether the duplication is tandem or dispersed, so return before checking isDispersedDuplication
return GATKSVVCFConstants.COPY_GAIN;
+ } else if (isDispersedDuplication) {
+ // all DUP segments in CPX events are currently dispersed duplications, not tandem. So
+ // if not COPY_GAIN, then partial gene overlap --> if complex, immediate PARTIAL_DISPERSED_DUP
+ return GATKSVVCFConstants.PARTIAL_DISPERSED_DUP;
} else if (variantOverlapsTranscriptionStartSite(variantInterval, gtfTranscript)) {
return GATKSVVCFConstants.TSS_DUP;
} else if (!transcriptInterval.contains(variantInterval)) {
@@ -276,7 +309,7 @@ protected static String annotateDuplication(final SimpleInterval variantInterval
protected static String annotateCopyNumberVariant(final SimpleInterval variantInterval,
final GencodeGtfTranscriptFeature gtfTranscript,
final Set MSVExonOverlapClassifications) {
- final String consequence = annotateDuplication(variantInterval, gtfTranscript);
+ final String consequence = annotateDuplication(variantInterval, gtfTranscript, false);
if (MSVExonOverlapClassifications.contains(consequence)) {
return GATKSVVCFConstants.MSV_EXON_OVERLAP;
} else {
@@ -338,12 +371,14 @@ protected static String annotateBreakend(final SimpleInterval variantInterval,
* Add consequence of structural variant on an overlapping transcript to consequence dictionary for variant
* @param variantInterval - SimpleInterval representing structural variant
* @param svType - SV type
+ * @param includesDispersedDuplication - boolean: true if SV type contains dispersed duplication(s), false if not
* @param transcript - protein-coding GTF transcript
* @param variantConsequenceDict - running map of consequence -> feature name for variant to update
*/
@VisibleForTesting
protected void annotateTranscript(final SimpleInterval variantInterval,
final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final boolean includesDispersedDuplication,
final GencodeGtfTranscriptFeature transcript,
final Map> variantConsequenceDict) {
final String consequence;
@@ -355,7 +390,9 @@ protected void annotateTranscript(final SimpleInterval variantInterval,
consequence = annotateInsertion(variantInterval, transcript);
break;
case DUP:
- consequence = annotateDuplication(variantInterval, transcript);
+ // if SV includesDispersedDuplication, every DUP segment in the SV is treated as dispersed
+ // this assumption holds for the complex subtypes currently resolved by GATK-SV
+ consequence = annotateDuplication(variantInterval, transcript, includesDispersedDuplication);
break;
case CNV:
consequence = annotateCopyNumberVariant(variantInterval,transcript, MSV_EXON_OVERLAP_CLASSIFICATIONS);
@@ -395,6 +432,7 @@ private void annotatePromoterOverlaps(final SimpleInterval variantInterval,
for (final Iterator> it = promotersForVariant; it.hasNext(); ) {
final SVIntervalTree.Entry promoterEntry = it.next();
final String promoterName = promoterEntry.getValue();
+ // only annotate promoter overlap if there is no coding annotation for the gene
if (!codingAnnotationGenes.contains(promoterName)) {
updateVariantConsequenceDict(variantConsequenceDict, GATKSVVCFConstants.PROMOTER, promoterName);
}
@@ -477,11 +515,13 @@ protected static GATKSVVCFConstants.StructuralVariantAnnotationType getSVType(fi
* Add protein-coding annotations for any transcripts overlapping the variant to the variant consequence dictionary
* @param variantInterval - SimpleInterval representing structural variant
* @param svType - SV type
+ * @param includesDispersedDuplication - boolean: true if SV type contains dispersed duplication(s), false otherwise
* @param variantConsequenceDict - running map of consequence -> feature name for variant to update
*/
@VisibleForTesting
protected void annotateGeneOverlaps(final SimpleInterval variantInterval,
final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+ final boolean includesDispersedDuplication,
final Map> variantConsequenceDict) {
final Iterator> gtfTranscriptsForVariant =
gtfIntervalTrees.getTranscriptIntervalTree().overlappers(
@@ -489,24 +529,101 @@ protected void annotateGeneOverlaps(final SimpleInterval variantInterval,
);
for (Iterator> it = gtfTranscriptsForVariant; it.hasNext(); ) {
SVIntervalTree.Entry transcriptEntry = it.next();
- annotateTranscript(variantInterval, svType, transcriptEntry.getValue(), variantConsequenceDict);
+ annotateTranscript(variantInterval, svType, includesDispersedDuplication, transcriptEntry.getValue(),
+ variantConsequenceDict);
}
}
+
+    /**
+     * Parse CPX_INTERVALS field into a list of SV segments.
+     * Format of each item in CPX_INTERVALS is "SVTYPE_CHROM:POS-END". Only the first underscore is treated as the
+     * separator between SV type and interval, so contig names that themselves contain underscores stay intact.
+     * @param cpxIntervals - list of String elements from CPX_INTERVALS field, each describing one segment of a complex SV
+     * @return - List of SVSegments representing components of the complex SV represented in CPX_INTERVALS
+     * @throws UserException if an element contains no underscore separating the SV type from the interval
+     */
+    @VisibleForTesting
+    protected static List<SVSegment> parseComplexIntervals(final List<String> cpxIntervals) {
+        final List<SVSegment> segments = new ArrayList<>(cpxIntervals.size());
+        for (final String cpxInterval : cpxIntervals) {
+            // limit of 2: split on the first underscore only, keeping any underscores in the contig name
+            final String[] parsed = cpxInterval.split("_", 2);
+            if (parsed.length < 2) {
+                throw new UserException("Malformed CPX_INTERVALS element (expected SVTYPE_CHROM:POS-END): " + cpxInterval);
+            }
+            final GATKSVVCFConstants.StructuralVariantAnnotationType svTypeForInterval =
+                    GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(parsed[0]);
+            final SimpleInterval interval = new SimpleInterval(parsed[1]);
+            segments.add(new SVSegment(svTypeForInterval, interval));
+        }
+        return segments;
+    }
+
+
/**
- * Parse one interval string from CPX_INTERVALS INFO field into an SVSegment representing the SV type and
- * interval of one of the components of the complex event
- * @param cpxInterval - one element from CPX_INTERVALS list, a string representing one component of complex SV
- * @return - SVSegment representing one component of the complex SV (type and interval)
+ * Return modified complex SV intervals list ready for protein-coding annotation.
+ * Start from SV segments from CPX_INTERVALS and ignore or adjust intervals as needed:
+ * (1) Ignore INV segments in dDUP, dDUP_iDEL, and INS_iDEL events because they describe an inversion in the inserted
+ * sequence that has no impact on the source sequence.
+ * (2) Adjust INV segments in dupINV, INVdup, dupINVdup, dupINVdel, and delINVdup events by subtracting the portion
+ * of the interval overlapped by a DUP segment, because that describes an inversion in the duplicated copy that has no
+ * impact on the source sequence.
+ * (3) Ignore INS segment in INS_iDEL if present because it represents the source sequence and the impact on the
+ * source (DUP or DEL) is unknown and cannot be annotated.
+ * @param cpxIntervals - list of SVSegments representing complex SV intervals from CPX_INTERVALS field
+ * @param complexType - Complex SV event type category, from CPX_TYPE field
+ * @return - List of SVSegments representing component of the complex SV (type and interval) to annotate for
+ * protein-coding consequences
*/
@VisibleForTesting
- protected static SVSegment parseCPXIntervalString(final String cpxInterval) {
- final String[] parsed = cpxInterval.split("_");
- final GATKSVVCFConstants.StructuralVariantAnnotationType svTypeForInterval = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(parsed[0]);
- final SimpleInterval interval = new SimpleInterval(parsed[1]);
- return new SVSegment(svTypeForInterval, interval);
+    protected static List<SVSegment> getComplexAnnotationIntervals(final List<SVSegment> cpxIntervals,
+                                                                   final GATKSVVCFConstants.ComplexVariantSubtype complexType) {
+        // INV describes only the orientation of the inserted copy for these subtypes: no impact on the origin site
+        final boolean dropInversion =
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL;
+        // INV interval includes duplicated sequence for these subtypes: it is set aside and trimmed after the loop
+        final boolean trimInversion =
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINV ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.INVdup ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdup ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel;
+        final boolean isInsIDel = complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL;
+
+        final List<SVSegment> segments = new ArrayList<>(cpxIntervals.size());
+        final List<SimpleInterval> dupIntervals = new ArrayList<>(cpxIntervals.size());
+        SimpleInterval inversionIntervalToAdjust = null;
+
+        for (final SVSegment originalSegment : cpxIntervals) {
+            final GATKSVVCFConstants.StructuralVariantAnnotationType segmentType = originalSegment.getIntervalSVType();
+            if (segmentType == GATKSVVCFConstants.StructuralVariantAnnotationType.INV) {
+                if (dropInversion) {
+                    continue;
+                }
+                if (trimInversion) {
+                    // remember the INV interval; the trimmed version is appended once all DUP intervals are known
+                    inversionIntervalToAdjust = originalSegment.getInterval();
+                    continue;
+                }
+            } else if (segmentType == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
+                // DUP segments are kept and also recorded for the INV adjustment below
+                dupIntervals.add(originalSegment.getInterval());
+            } else if (segmentType == GATKSVVCFConstants.StructuralVariantAnnotationType.INS && isInsIDel) {
+                // INS interval in INS_iDEL represents the origin of the inserted sequence;
+                // its status (DUP or DEL) is unknown, so it is not annotated
+                continue;
+            }
+            segments.add(originalSegment);
+        }
+
+        // adjust INV interval for dupINV / INVdup / dupINVdup / dupINVdel / delINVdup
+        // to remove portion that overlaps duplicated sequence (not inverted at sequence origin so no impact)
+        // and keep portion that represents sink site breakpoints
+        if (inversionIntervalToAdjust != null) {
+            for (final SimpleInterval dupInterval : dupIntervals) {
+                inversionIntervalToAdjust = inversionIntervalToAdjust.subtract(dupInterval);
+            }
+            segments.add(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV, inversionIntervalToAdjust));
+        }
+
+        return segments;
+    }
+
/**
* Get SV type to use for annotation for a breakend VCF record
* Breakend may represent BND, CTX, or DEL / DUP if the user specifies {@code SVAnnotate.MAX_BND_LEN_NAME}
@@ -519,11 +636,12 @@ protected static SVSegment parseCPXIntervalString(final String cpxInterval) {
* @return - SV type to use for annotation of breakend record
*/
private static GATKSVVCFConstants.StructuralVariantAnnotationType getAnnotationTypeForBreakend(final VariantContext variant,
- final String complexType,
+ final GATKSVVCFConstants.ComplexVariantSubtype complexType,
final int maxBreakendLen,
final int svLen,
final String chrom, final String chr2) {
- if (complexType != null && complexType.contains("CTX")) {
+ if (complexType != null && (complexType == GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ ||
+ complexType == GATKSVVCFConstants.ComplexVariantSubtype.CTX_PQ_QP)) {
return GATKSVVCFConstants.StructuralVariantAnnotationType.CTX;
} else if (maxBreakendLen > 0 && chr2 != null && chrom.equals(chr2) && svLen <= maxBreakendLen) {
// if maxBreakendLenForOverlapAnnotation argument provided, annotate as DUP or DEL if applicable
@@ -554,26 +672,25 @@ private static GATKSVVCFConstants.StructuralVariantAnnotationType getAnnotationT
@VisibleForTesting
protected static List getSVSegments(final VariantContext variant,
final GATKSVVCFConstants.StructuralVariantAnnotationType overallSVType,
- final int maxBreakendLen) {
+ final int maxBreakendLen,
+ final GATKSVVCFConstants.ComplexVariantSubtype complexType) {
final List intervals;
- final String complexType = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null);
final String chrom = variant.getContig();
final int pos = variant.getStart();
final String chr2 = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null);
final int end2 = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, pos);
if (overallSVType.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.CPX)) {
- final List cpxIntervalsString = variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null);
- if (cpxIntervalsString == null) {
+ final List cpxIntervals = variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null);
+ if (cpxIntervals.isEmpty()) {
throw new UserException("Complex (CPX) variant must contain CPX_INTERVALS INFO field");
}
if (complexType == null) {
throw new UserException("Complex (CPX) variant must contain CPX_TYPE INFO field");
}
- intervals = new ArrayList<>(cpxIntervalsString.size() + 1);
- for (final String cpxInterval : cpxIntervalsString) {
- intervals.add(parseCPXIntervalString(cpxInterval));
- }
- if (complexType.contains("dDUP")) {
+ intervals = getComplexAnnotationIntervals(parseComplexIntervals(cpxIntervals), complexType);
+ // add sink site as INS for dDUP (encoded in CHROM and POS instead of INFO/CPX_INTERVALS)
+ // no need to add sink site INS for INS_iDEL or dDUP_iDEL because DEL coordinates contain sink site
+ if (complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP) {
intervals.add(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
new SimpleInterval(chrom, pos, pos + 1)));
}
@@ -620,7 +737,62 @@ protected static List getSVSegments(final VariantContext variant,
return intervals;
}
+    /**
+     * Returns a list of SVSegments to use for promoter and noncoding annotations.
+     * When {@code isComplex} is false, returns the original list of segments unchanged (same instance).
+     * When {@code isComplex} is true, returns a new list with every DUP segment removed: DUP segments are always
+     * dispersed (never tandem) in the current set of defined CPX subtypes, and so are not considered for
+     * noncoding annotations.
+     * @param svSegments - List of SVSegments used for gene overlap annotations
+     * @param isComplex - boolean: true to drop DUP segments from the returned list, false to return the input as is
+     * @return - List of SVSegments to use for promoter and noncoding annotations
+     */
+    @VisibleForTesting
+    protected static List<SVSegment> getSegmentsForNonCodingAnnotations(final List<SVSegment> svSegments,
+                                                                        final boolean isComplex) {
+        if (!isComplex) {
+            return svSegments;
+        }
+        return svSegments.stream()
+                .filter(seg -> seg.getIntervalSVType() != GATKSVVCFConstants.StructuralVariantAnnotationType.DUP)
+                .collect(Collectors.toList());
+    }
+    /**
+     * Returns a list of SVSegments to use for nearest TSS annotations.
+     * Must apply on the output of getSegmentsForNonCodingAnnotations.
+     * For simple (non-complex) SVs, returns original list of segments.
+     * For deletion-containing CPX events (dDUP_iDEL, INS_iDEL, delINV, INVdel, dupINVdel, delINVdup, delINVdel),
+     * collapses the remaining segments into one DEL-typed segment spanning from the smallest start to the largest
+     * end, so that a single nearest TSS is reported based on the outer breakpoints of the event. The span is taken
+     * with spanWith rather than mergeWithContiguous so that gaps between segments do not cause a failure.
+     * For all other subtypes, or when the list is empty, the input list is returned unchanged.
+     * @param svSegments - List of SVSegments used for promoter and noncoding annotations (DUPs already removed)
+     * @param complexType - Complex SV event type category, or null if the SV is not complex
+     * @return - List of SVSegments to use for nearest TSS annotations
+     */
+    @VisibleForTesting
+    protected static List<SVSegment> getSegmentForNearestTSS(final List<SVSegment> svSegments,
+                                                             final GATKSVVCFConstants.ComplexVariantSubtype complexType) {
+        // == on an enum is null-safe, so a null complexType simply matches none of these
+        final boolean mergeIntoSingleSpan =
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINV ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.INVdel ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdup ||
+                complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdel;
+
+        // if not complex return original list of segments
+        // if complex & dDUP, dupINV, INVdup, dupINVdup --> no further modifications (already adjusted INV, removed DUPs)
+        // an empty list has nothing to merge, so it is returned as is instead of failing on get(0)
+        if (!mergeIntoSingleSpan || svSegments.isEmpty()) {
+            return svSegments;
+        }
+
+        SimpleInterval spanningSegment = svSegments.get(0).getInterval();
+        for (int i = 1; i < svSegments.size(); i++) {
+            spanningSegment = spanningSegment.spanWith(svSegments.get(i).getInterval());
+        }
+        return Collections.singletonList(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+                spanningSegment));
+    }
/**
* Create a copy of the variant consequence dictionary in which the feature names for each consequence are sorted
@@ -639,6 +811,39 @@ protected static Map sortVariantConsequenceDict(final Map> variantConsequenceDict,
+ final Set proteinCodingConsequences) {
+ for (final String consequence : variantConsequenceDict.keySet()) {
+ if (proteinCodingConsequences.contains(consequence)) {
+ // if the SV has protein-coding consequences other than PARTIAL_DISPERSED_DUP then it is not INTERGENIC
+ return false;
+ }
+ }
+ // if the SV has no protein-coding consequences then it is intergenic
+ return true;
+ }
+
+    /**
+     * Checks if a variant includes one or more dispersed duplications
+     * @param complexType - the complex subtype of the SV, or null if the SV has no complex subtype
+     * @param complexSubtypesWithDispersedDup - the set of complex subtypes containing one or more dispersed duplications
+     * @return - boolean: true if complexType is non-null and is in the set of complexSubtypesWithDispersedDup,
+     * false otherwise
+     */
+    @VisibleForTesting
+    protected static boolean includesDispersedDuplication(final GATKSVVCFConstants.ComplexVariantSubtype complexType,
+                                                          final Set<GATKSVVCFConstants.ComplexVariantSubtype> complexSubtypesWithDispersedDup) {
+        if (complexType == null) {
+            return false;
+        }
+        return complexSubtypesWithDispersedDup.contains(complexType);
+    }
+
/**
* Create a consequence -> feature name map and add all annotations for protein-coding, promoter, nearest TSS,
* and noncoding consequences for a variant
@@ -649,40 +854,58 @@ protected static Map sortVariantConsequenceDict(final Map annotateStructuralVariant(final VariantContext variant) {
final Map> variantConsequenceDict = new HashMap<>();
final GATKSVVCFConstants.StructuralVariantAnnotationType overallSVType = getSVType(variant);
- final List svSegments = getSVSegments(variant, overallSVType, maxBreakendLen);
+ final String complexTypeString = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null);
+ GATKSVVCFConstants.ComplexVariantSubtype complexType = null;
+ if (complexTypeString != null) {
+ // replace / in CTX_PP/QQ and CTX_PQ/QP with _ to match ComplexVariantSubtype constants which cannot contain slashes
+ complexType = GATKSVVCFConstants.ComplexVariantSubtype.valueOf(complexTypeString.replace("/", "_"));
+ }
+ final boolean includesDispersedDuplication = includesDispersedDuplication(complexType, COMPLEX_SUBTYPES_WITH_DISPERSED_DUP);
+ final List svSegmentsForGeneOverlaps = getSVSegments(variant, overallSVType, maxBreakendLen, complexType);
+
+ // annotate gene overlaps
if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptIntervalTree() != null) {
- for (SVSegment svSegment : svSegments) {
- annotateGeneOverlaps(svSegment.getInterval(), svSegment.getIntervalSVType(), variantConsequenceDict);
+ for (SVSegment svSegment : svSegmentsForGeneOverlaps) {
+ annotateGeneOverlaps(svSegment.getInterval(), svSegment.getIntervalSVType(),
+ includesDispersedDuplication, variantConsequenceDict);
}
}
- // if variant consequence dictionary is empty (no protein-coding annotations), apply INTERGENIC flag
- final boolean noCodingAnnotations = variantConsequenceDict.isEmpty();
+ // if variant consequence dictionary contains no protein-coding consequences, apply INTERGENIC flag
+ final boolean isIntergenic = isIntergenic(variantConsequenceDict, PROTEIN_CODING_CONSEQUENCES);
+
+ // get SV segments to annotate promoter & noncoding consequences
+ final List svSegmentsForNonCodingAnnotations =
+ getSegmentsForNonCodingAnnotations(svSegmentsForGeneOverlaps, includesDispersedDuplication);
// then annotate promoter overlaps and non-coding feature overlaps
if (gtfIntervalTrees != null && gtfIntervalTrees.getPromoterIntervalTree() != null) {
- for (final SVSegment svSegment : svSegments) {
+ for (final SVSegment svSegment : svSegmentsForNonCodingAnnotations) {
annotatePromoterOverlaps(svSegment.getInterval(), variantConsequenceDict);
}
}
if (nonCodingIntervalTree != null) {
- for (SVSegment svSegment : svSegments) {
+ for (SVSegment svSegment : svSegmentsForNonCodingAnnotations) {
annotateNonCodingOverlaps(svSegment.getInterval(), variantConsequenceDict);
}
}
+ // get list of SV segments to annotate nearest TSS
+ List svSegmentsForNearestTSS =
+ getSegmentForNearestTSS(svSegmentsForNonCodingAnnotations, complexType);
+
// annotate nearest TSS for intergenic variants with no promoter overlaps
if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptionStartSiteTree() != null &&
- !variantConsequenceDict.containsKey(GATKSVVCFConstants.PROMOTER) && noCodingAnnotations) {
- for (SVSegment svSegment : svSegments) {
+ !variantConsequenceDict.containsKey(GATKSVVCFConstants.PROMOTER) && isIntergenic) {
+ for (SVSegment svSegment : svSegmentsForNearestTSS) {
annotateNearestTranscriptionStartSite(svSegment.getInterval(), variantConsequenceDict);
}
}
final Map attributes = sortVariantConsequenceDict(variantConsequenceDict);
if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptIntervalTree() != null) {
- attributes.put(GATKSVVCFConstants.INTERGENIC, noCodingAnnotations);
+ attributes.put(GATKSVVCFConstants.INTERGENIC, isIntergenic);
}
return attributes;
}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java b/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java
index bade66094bf..ebbce974105 100644
--- a/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java
+++ b/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.utils;
+import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.Locatable;
@@ -275,6 +276,25 @@ public SimpleInterval intersect( final Locatable that ) {
Math.min( getEnd(), that.getEnd()) );
}
+    /**
+     * Get section of starting interval (this) that is not overlapped by the other interval (that).
+     * If this starts before that, the result runs from this.getStart() to that.getStart(); otherwise it runs from
+     * that.getEnd() to this.getEnd(). The result therefore keeps the boundary position it shares with that.
+     * If that lies strictly inside this, only the portion to the left of that is returned.
+     * Argument validation fails if the intervals do not overlap or if that contains this.
+     * @param that - interval to subtract from starting interval. Must overlap (but not fully contain) starting interval
+     * @return - SimpleInterval representing the portion of starting interval (this) not overlapped by other interval (that)
+     */
+    @VisibleForTesting
+    public SimpleInterval subtract(final Locatable that) {
+        Utils.validateArg(this.overlaps(that), () ->
+                "SimpleInterval::subtract(): The two intervals need to overlap: " + this + ", " + that);
+        Utils.validateArg(!that.contains(this), () ->
+                "SimpleInterval::subtract(): Interval to subtract " + that + " cannot contain starting interval " + this);
+        if (this.getStart() < that.getStart()) {
+            // this extends to the left of that: keep the left flank up to that's start
+            return new SimpleInterval(this.getContig(), this.getStart(), that.getStart());
+        }
+        // that covers the start of this: keep the right flank from that's end
+        return new SimpleInterval(this.getContig(), that.getEnd(), this.getEnd());
+    }
+
/**
* Returns a new SimpleInterval that represents the entire span of this and that. Requires that
* this and that SimpleInterval are contiguous.
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java
index 867e0222221..80f8312ff0c 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java
@@ -25,12 +25,6 @@
public class SVAnnotateEngineUnitTest extends GATKBaseTest {
private final File TOY_GTF_FILE = new File(getToolTestDataDir().replaceFirst("Engine", "") + "unittest.gtf");
private final File TINY_NONCODING_BED_FILE = new File(getToolTestDataDir().replaceFirst("Engine", "") + "noncoding.unittest.bed");
- private final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF,
- GATKSVVCFConstants.INT_EXON_DUP,
- GATKSVVCFConstants.DUP_PARTIAL,
- GATKSVVCFConstants.PARTIAL_EXON_DUP,
- GATKSVVCFConstants.COPY_GAIN,
- GATKSVVCFConstants.TSS_DUP);
// Pairs of intervals with different relationships to check if first (variant) interval spans second (feature)
@@ -175,14 +169,15 @@ public void testAnnotateIntervalSVTypes(
) {
final GencodeGtfTranscriptFeature toyTranscript = loadToyGtfTranscript();
- final String actualDuplicationConsequence = SVAnnotateEngine.annotateDuplication(toyVariant, toyTranscript);
+ final String actualDuplicationConsequence = SVAnnotateEngine.annotateDuplication(toyVariant, toyTranscript, false);
Assert.assertEquals(actualDuplicationConsequence, expectedDuplicationConsequence);
final String actualDeletionConsequence = SVAnnotateEngine.annotateDeletion(toyVariant, toyTranscript);
Assert.assertEquals(actualDeletionConsequence, expectedDeletionConsequence);
final String actualCopyNumberVariantConsequence =
- SVAnnotateEngine.annotateCopyNumberVariant(toyVariant, toyTranscript, MSV_EXON_OVERLAP_CLASSIFICATIONS);
+ SVAnnotateEngine.annotateCopyNumberVariant(toyVariant, toyTranscript,
+ SVAnnotateEngine.MSV_EXON_OVERLAP_CLASSIFICATIONS);
Assert.assertEquals(actualCopyNumberVariantConsequence, expectedCopyNumberVariantConsequence);
final String actualInversionConsequence = SVAnnotateEngine.annotateInversion(toyVariant, toyTranscript);
@@ -244,21 +239,143 @@ public void testAnnotatePointSVTypes(
Assert.assertEquals(actualTwoBaseTranslocationConsequence, expectedTranslocationVariantConsequence);
}
+    /**
+     * Create list of SV segments that all share the SAME SVTYPE - convenience function for testing getSVSegments
+     * @param svType - SV type for all segments
+     * @param intervals - array of intervals, one per segment
+     * @return - list of SV segments with provided SV type, one for each interval, in input order
+     */
+    private List<SVAnnotateEngine.SVSegment> createListOfSVSegments(final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
+                                                                    final SimpleInterval[] intervals) {
+        final List<SVAnnotateEngine.SVSegment> segments = new ArrayList<>(intervals.length);
+        for (final SimpleInterval interval : intervals) {
+            segments.add(new SVAnnotateEngine.SVSegment(svType, interval));
+        }
+        return segments;
+    }
+
+    /**
+     * Create list of SV segments with different SVTYPEs - convenience function
+     * @param svTypes - array of SV types; must be the same length as intervals
+     * @param intervals - array of intervals; intervals[i] is paired with svTypes[i]
+     * @return - list of SV segments, one per (SV type, interval) pair, in input order
+     */
+    private List<SVAnnotateEngine.SVSegment> createListOfSVSegmentsDifferentTypes(final GATKSVVCFConstants.StructuralVariantAnnotationType[] svTypes,
+                                                                                  final SimpleInterval[] intervals) {
+        Assert.assertEquals(svTypes.length, intervals.length);
+        final List<SVAnnotateEngine.SVSegment> segments = new ArrayList<>(intervals.length);
+        for (int i = 0; i < svTypes.length; i++) {
+            segments.add(new SVAnnotateEngine.SVSegment(svTypes[i], intervals[i]));
+        }
+        return segments;
+    }
+
+ // CPX_TYPE & CPX_INTERVALS INFO fields specifying complex variant intervals, and expected SV segments
+ @DataProvider(name = "complexVariantIntervals")
+ public Object[][] getComplexVariantIntervalsTestData() {
+ return new Object[][] { // each row: complex subtype, comma-separated CPX_INTERVALS string, expected segments in order
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, "DUP_chr1:280-420",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP },
+ new SimpleInterval[]{new SimpleInterval("chr1", 280, 420)}) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINV, "DUP_chr1:44355904-44356327,INV_chr1:44355904-44357498",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV},
+ new SimpleInterval[]{new SimpleInterval("chr1", 44355904, 44356327),
+ new SimpleInterval("chr1", 44356327, 44357498)}) }, // expected INV starts where the DUP ends
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup, "DUP_chr1:247660974-248129213,INV_chr1:247660974-248587216,DUP_chr1:248520217-248587216",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV},
+ new SimpleInterval[]{new SimpleInterval("chr1", 247660974, 248129213),
+ new SimpleInterval("chr1", 248520217, 248587216),
+ new SimpleInterval("chr1", 248129213, 248520217)}) }, // expected INV spans only the region between the two DUPs
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, "DUP_chr2:131488885-131489335,INV_chr2:131488885-131489335,DEL_chr2:130185450-130185720",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL},
+ new SimpleInterval[]{new SimpleInterval("chr2", 131488885,131489335),
+ new SimpleInterval("chr2", 130185450,130185720)}) }, // the INV entry has no expected segment
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, "DUP_chr3:95751919-95752156,DEL_chr3:95746923-95749272",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL},
+ new SimpleInterval[]{new SimpleInterval("chr3", 95751919,95752156),
+ new SimpleInterval("chr3", 95746923,95749272)}) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL, "DEL_chr3:60521333-60521483",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL},
+ new SimpleInterval[]{new SimpleInterval("chr3", 60521333,60521483)}) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINV, "DEL_chr2:120379742-120383130,INV_chr2:120383130-120384123",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV},
+ new SimpleInterval[]{new SimpleInterval("chr2", 120379742,120383130),
+ new SimpleInterval("chr2", 120383130,120384123)}) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INVdel, "INV_chr2:122719025-122719803,DEL_chr2:122719803-122724929",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL},
+ new SimpleInterval[]{new SimpleInterval("chr2", 122719025,122719803),
+ new SimpleInterval("chr2", 122719803,122724929)}) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, "DEL_chr2:54002577-54003019,INV_chr2:54003019-54006204,DUP_chr2:54006057-54006204",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV},
+ new SimpleInterval[]{new SimpleInterval("chr2", 54002577,54003019),
+ new SimpleInterval("chr2", 54006057,54006204),
+ new SimpleInterval("chr2", 54003019,54006057)}) }, // expected INV ends where the DUP starts
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel, "DUP_chr2:4157678-4157846,INV_chr2:4157678-4165085,DEL_chr2:4165085-4175888",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV},
+ new SimpleInterval[]{new SimpleInterval("chr2", 4157678,4157846),
+ new SimpleInterval("chr2", 4165085,4175888),
+ new SimpleInterval("chr2", 4157846,4165085)}) }, // expected INV starts where the DUP ends
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINVdel, "DEL_chr2:62384663-62387814,INV_chr2:62387814-62388322,DEL_chr2:62388322-62390272",
+ createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
+ GATKSVVCFConstants.StructuralVariantAnnotationType.DEL},
+ new SimpleInterval[]{new SimpleInterval("chr2", 62384663,62387814),
+ new SimpleInterval("chr2", 62387814,62388322),
+ new SimpleInterval("chr2", 62388322,62390272)}) }
+ };
+ }
+
+    // Test getComplexAnnotationIntervals() on segments parsed from a comma-separated CPX_INTERVALS string
+    @Test(dataProvider = "complexVariantIntervals")
+    public void testGetSegmentsFromComplexIntervals(
+            final GATKSVVCFConstants.ComplexVariantSubtype complexType,
+            final String cpxIntervalsString,
+            final List<SVAnnotateEngine.SVSegment> expectedSVSegments
+    ) {
+        final List<SVAnnotateEngine.SVSegment> cpxIntervals = SVAnnotateEngine.parseComplexIntervals(Arrays.asList(cpxIntervalsString.split(",")));
+        final List<SVAnnotateEngine.SVSegment> actualSegments = SVAnnotateEngine.getComplexAnnotationIntervals(cpxIntervals,
+                complexType);
+        assertSegmentListEqual(actualSegments, expectedSVSegments);
+    }
+
// CPX_INTERVALS INFO field string specifying complex variant intervals, and expected annotation(s)
@DataProvider(name = "toyComplexVariants")
public Object[][] getToyComplexVariantTestData() {
return new Object[][] {
- { "DUP_chr1:280-420", Sets.newHashSet(GATKSVVCFConstants.INT_EXON_DUP) },
- { "INV_chr1:90-1010,DUP_chr1:890-1010",
- Sets.newHashSet(GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.DUP_PARTIAL) },
- { "DEL_chr1:250-450,INV_chr1:450-650,DUP_chr1:610-650",
- Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.INTRONIC) }
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, "DUP_chr1:280-420", Sets.newHashSet(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INVdup, "INV_chr1:90-1010,DUP_chr1:890-1010",
+ Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) },
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, "DEL_chr1:250-450,INV_chr1:450-650,DUP_chr1:610-650",
+ Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) }
};
}
- // Test annotation of CPX events from CPX_INTERVALS string
+ // Test annotation of CPX events for gene overlaps from CPX_INTERVALS string
@Test(dataProvider = "toyComplexVariants")
public void testAnnotateComplexEvents(
+ final GATKSVVCFConstants.ComplexVariantSubtype complexType,
final String cpxIntervalsString,
final Set expectedConsequences
) {
@@ -272,28 +389,36 @@ public void testAnnotateComplexEvents(
SVAnnotateEngine svAnnotateEngine = new SVAnnotateEngine(gtfTrees, null, sequenceDictionary,
-1);
- final String[] cpxIntervalStrings = cpxIntervalsString.split(",");
- for (String cpxIntervalString : cpxIntervalStrings) {
- SVAnnotateEngine.SVSegment cpxSegment = SVAnnotateEngine.parseCPXIntervalString(cpxIntervalString);
- svAnnotateEngine.annotateGeneOverlaps(cpxSegment.getInterval(), cpxSegment.getIntervalSVType(),
+ final List cpxIntervals = SVAnnotateEngine.parseComplexIntervals(Arrays.asList(cpxIntervalsString.split(",")));
+ final List cpxSegments = SVAnnotateEngine.getComplexAnnotationIntervals(cpxIntervals, complexType);
+ for (final SVAnnotateEngine.SVSegment cpxSegment: cpxSegments) {
+ svAnnotateEngine.annotateGeneOverlaps(cpxSegment.getInterval(), cpxSegment.getIntervalSVType(), true,
variantConsequenceDict);
}
Assert.assertEquals(variantConsequenceDict.keySet(), expectedConsequences);
}
+    // Build a consequence map from parallel arrays: consequences[i] is recorded for features[i]
+    private Map<String, Set<String>> createVariantConsequenceDict(final String[] consequences, final String[] features) {
+        Assert.assertEquals(consequences.length, features.length);
+        final Map<String, Set<String>> map = new HashMap<>();
+        for (int i = 0; i < consequences.length; i++) {
+            SVAnnotateEngine.updateVariantConsequenceDict(map, consequences[i], features[i]);
+        }
+        return map;
+    }
+
// Test sortVariantConsequenceDict() sorts lists of genes in variant consequence map
@Test
public void testSortVariantConsequenceDict() {
- final Map> before = new HashMap<>();
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "NOC2L");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "KLHL17");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "PLEKHN1");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "PERM1");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.DUP_PARTIAL, "SAMD11");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "HES4");
- SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.TSS_DUP, "ISG15");
+ final Map> before = createVariantConsequenceDict(
+ new String[]{GATKSVVCFConstants.LOF, GATKSVVCFConstants.LOF, GATKSVVCFConstants.LOF,
+ GATKSVVCFConstants.LOF, GATKSVVCFConstants.DUP_PARTIAL, GATKSVVCFConstants.LOF,
+ GATKSVVCFConstants.TSS_DUP},
+ new String[]{"NOC2L", "KLHL17", "PLEKHN1", "PERM1", "SAMD11", "HES4", "ISG15"}
+ );
final Map expectedAfter = new HashMap<>();
expectedAfter.put(GATKSVVCFConstants.DUP_PARTIAL, Arrays.asList("SAMD11"));
@@ -303,22 +428,74 @@ public void testSortVariantConsequenceDict() {
Assert.assertEquals(SVAnnotateEngine.sortVariantConsequenceDict(before), expectedAfter);
}
+ @DataProvider(name = "proteinCodingConsequences")
+ public Object[][] getProteinCodingConsequences() {
+ return new Object[][] {
+ // TSS_DUP is present alongside a partial dispersed dup -> expected not intergenic
+ { createVariantConsequenceDict(
+ new String[]{GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP},
+ new String[]{"HES4", "SAMD11"}),
+ false
+ },
+ // partial dispersed dup on its own does not count as a protein-coding consequence -> still intergenic
+ { createVariantConsequenceDict(
+ new String[]{GATKSVVCFConstants.PARTIAL_DISPERSED_DUP},
+ new String[]{"SAMD11"}),
+ true
+ },
+ // noncoding consequences are ignored -> still intergenic
+ { createVariantConsequenceDict(
+ new String[]{GATKSVVCFConstants.NONCODING_BREAKPOINT, GATKSVVCFConstants.PROMOTER},
+ new String[]{"Enhancer", "RP1"}),
+ true
+ },
+ { new HashMap<>(), // no consequences at all -> intergenic
+ true
+ }
+ };
+ }
- /**
- * Create list of SV segments with SAME SVTYPE - convenience function for testing getSVSegments
- * @param svType - SV type for all segments
- * @param intervals - list of intervals
- * @return - list of SV segments with provided SV type, one for each interval
- */
- private List createListOfSVSegments(final GATKSVVCFConstants.StructuralVariantAnnotationType svType,
- final SimpleInterval[] intervals) {
- final List segments = new ArrayList<>(intervals.length);
- for (final SimpleInterval interval : intervals) {
- segments.add(new SVAnnotateEngine.SVSegment(svType, interval));
- }
- return segments;
+    // Each case supplies a consequence map and whether isIntergenic() should report it as intergenic
+    @Test(dataProvider = "proteinCodingConsequences")
+    public void testIsIntergenic(
+            final Map<String, Set<String>> variantConsequenceDict,
+            final boolean expectedIsIntergenic) {
+        final boolean actualIsIntergenic = SVAnnotateEngine.isIntergenic(variantConsequenceDict,
+                SVAnnotateEngine.PROTEIN_CODING_CONSEQUENCES);
+        Assert.assertEquals(actualIsIntergenic, expectedIsIntergenic);
+    }
+
+ @DataProvider(name = "complexSubtypes")
+ public Object[][] getComplexSubtypes() {
+ return new Object[][] { // each row: complex subtype (or null), expected includesDispersedDuplication() result
+ { null, false }, // no complex subtype
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINV, false },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INVdel, false },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINV, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INVdup, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINVdel, false },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, true },
+ { GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL, false },
+ { GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, false },
+ { GATKSVVCFConstants.ComplexVariantSubtype.CTX_PQ_QP, false }
+ };
}
+    // Verify includesDispersedDuplication() for each subtype against COMPLEX_SUBTYPES_WITH_DISPERSED_DUP
+    @Test(dataProvider = "complexSubtypes")
+    public void testIncludesDispersedDuplication(final GATKSVVCFConstants.ComplexVariantSubtype complexType,
+                                                 final boolean expectedIncludesDispersedDuplication) {
+        Assert.assertEquals(
+                SVAnnotateEngine.includesDispersedDuplication(complexType,
+                        SVAnnotateEngine.COMPLEX_SUBTYPES_WITH_DISPERSED_DUP),
+                expectedIncludesDispersedDuplication);
+    }
+
+
/**
* Assert two lists of SVAnnotate.SVSegment objects are equal in contents
* Lists must be same size and contain equal SVSegments in the same order
@@ -384,6 +561,7 @@ public Object[][] getSVTypesAndSegmentsTestData() {
return new Object[][] {
{ createVariantContext("chr2", 86263976, 86263977, "chr19", 424309, "N",
"", null, null, "CTX_PP/QQ", null),
+ GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ,
GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
new SimpleInterval[]{ new SimpleInterval("chr2", 86263976, 86263977),
@@ -391,66 +569,77 @@ public Object[][] getSVTypesAndSegmentsTestData() {
null },
{ createVariantContext("chr2", 86263976, 86263976, null, 424309, "G",
"G]chr19:424309]", null, null,"CTX_PP/QQ", null),
+ GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
new SimpleInterval("chr2", 86263976, 86263976))),
null},
{ createVariantContext("chr2", 86263977, 86263977, null, 424309, "A",
"[chr19:424310[A", null, null, "CTX_PP/QQ", null),
+ GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
new SimpleInterval("chr2", 86263977, 86263977))),
null },
{ createVariantContext("chr2", 205522308, 205522384, "chr2", null, "N",
"", 76, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
new SimpleInterval("chr2", 205522308, 205522384))),
null },
{ createVariantContext("chr19", 424309, 424309, null, 424309, "T",
"T]chr2:86263976]", null, null, "CTX_PP/QQ", null),
+ GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
new SimpleInterval("chr19", 424309, 424309))),
null },
{ createVariantContext("chr19", 424310, 424310, null, 424309, "C",
"[chr2:86263977[C", null, null, "CTX_PP/QQ", null),
+ GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX,
new SimpleInterval("chr19", 424310, 424310))),
null },
{ createVariantContext("chr22", 10510000, 10694100, "chr22", null, "N",
"", 184100, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
new SimpleInterval[]{ new SimpleInterval("chr22", 10510000, 10694100)}),
null},
{ createVariantContext("chr22", 10510000, 10694100, "chr22", null, "N",
"", 184100, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL,
new SimpleInterval[]{ new SimpleInterval("chr22", 10510000, 10694100)}),
null},
{ createVariantContext("chr22", 10524000, 10710000, "chr22", null, "N",
"", 186000, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
new SimpleInterval[]{ new SimpleInterval("chr22", 10524000, 10710000)}),
null },
{ createVariantContext("chr22", 10532563, 10532611, "chr22", null, "N",
"", 245, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
new SimpleInterval[]{ new SimpleInterval("chr22", 10532563, 10532564)}),
null },
{ createVariantContext("chr22", 10572758, 10572788, "chr22", null, "N",
"", 57, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
new SimpleInterval[]{ new SimpleInterval("chr22", 10572758, 10572759)}),
null },
{ createVariantContext("chr22", 10717890, 10717890, "chr22", null, "N",
"", 5170, "-+", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 10717890, 10717890),
@@ -459,12 +648,14 @@ public Object[][] getSVTypesAndSegmentsTestData() {
new SimpleInterval[]{ new SimpleInterval("chr22", 10717890, 10723060)}) },
{ createVariantContext("chr22", 10774600, 10784500, "chr22", null, "N",
"", 9900, null, null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.CNV,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.CNV,
new SimpleInterval[]{ new SimpleInterval("chr22", 10774600, 10784500)}),
null },
{ createVariantContext("chr22", 10930458, 10930458, "chr22", 11564561, "N",
"", 634103, "--", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 10930458, 10930458),
@@ -472,6 +663,7 @@ public Object[][] getSVTypesAndSegmentsTestData() {
null },
{ createVariantContext("chr22", 17636024, 17636024, "chr22", null, "N",
"", 10709, "+-", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 17636024, 17636024),
@@ -481,10 +673,9 @@ public Object[][] getSVTypesAndSegmentsTestData() {
{ createVariantContext("chr22", 18971159, 18971435, "chr22", null, "N",
"", 386, null, "dDUP",
Arrays.asList("INV_chr22:20267228-20267614","DUP_chr22:20267228-20267614")),
+ GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
Arrays.asList(
- new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
- new SimpleInterval("chr22", 20267228, 20267614)),
new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
new SimpleInterval("chr22", 20267228, 20267614)),
new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INS,
@@ -492,6 +683,7 @@ public Object[][] getSVTypesAndSegmentsTestData() {
null },
{ createVariantContext("chr22", 22120897, 22120897, "chrX", 126356858, "N",
"", -1, "++", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 22120897, 22120897),
@@ -499,6 +691,7 @@ public Object[][] getSVTypesAndSegmentsTestData() {
null },
{ createVariantContext("chr22", 22196261, 22196261, "chr22", null, "N",
"", 708725, "+-", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 22196261, 22196261),
@@ -506,12 +699,14 @@ public Object[][] getSVTypesAndSegmentsTestData() {
null },
{ createVariantContext("chr22", 22196261, 22196261, null, null, "A",
"A[chr22:22904986[", null, "+-", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 22196261, 22196261) }),
null },
{ createVariantContext("chr22", 22904986, 22904986, null, null, "T",
"]chr22:22196261]T", null, "+-", null, null),
+ null,
GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND,
new SimpleInterval[]{ new SimpleInterval("chr22", 22904986, 22904986) }),
@@ -519,12 +714,13 @@ public Object[][] getSVTypesAndSegmentsTestData() {
{ createVariantContext("chr22", 36533058, 36538234, "chr22", null, "N",
"", 5176, null, "dupINV",
Arrays.asList("DUP_chr22:36533058-36533299","INV_chr22:36533058-36538234")),
+ GATKSVVCFConstants.ComplexVariantSubtype.dupINV,
GATKSVVCFConstants.StructuralVariantAnnotationType.CPX,
Arrays.asList(
new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP,
new SimpleInterval("chr22", 36533058, 36533299)),
new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV,
- new SimpleInterval("chr22", 36533058, 36538234))),
+ new SimpleInterval("chr22", 36533299, 36538234))),
null }
};
}
@@ -533,6 +729,7 @@ public Object[][] getSVTypesAndSegmentsTestData() {
@Test(dataProvider = "typesAndSegments")
public void testGetSVTypeAndSegments(
final VariantContext variant,
+ final GATKSVVCFConstants.ComplexVariantSubtype complexType,
final GATKSVVCFConstants.StructuralVariantAnnotationType expectedSVType,
final List expectedSVSegments,
final List expectedSVSegmentsWithBNDOverlap
@@ -541,11 +738,11 @@ public void testGetSVTypeAndSegments(
Assert.assertEquals(actualSVType, expectedSVType);
final List actualSegments = SVAnnotateEngine.getSVSegments(variant,
- actualSVType, -1);
+ actualSVType, -1, complexType);
assertSegmentListEqual(actualSegments, expectedSVSegments);
final List actualSegmentsWithBNDOverlap = SVAnnotateEngine.getSVSegments(variant,
- actualSVType, 15000);
+ actualSVType, 15000, complexType);
assertSegmentListEqual(actualSegmentsWithBNDOverlap,
expectedSVSegmentsWithBNDOverlap != null ? expectedSVSegmentsWithBNDOverlap : expectedSVSegments);
}
@@ -581,6 +778,62 @@ public Object[][] getAnnotateStructuralVariantTestData() {
createAttributesMap(
Arrays.asList(GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.INTERGENIC),
Arrays.asList("EMMA1", true)) },
+ // CPX with dDUP CG and promoter annotations
+ { createVariantContext("chr1", 10, 11, null, null, null,
+ "", 1500, null, "dDUP",
+ Collections.singletonList("DUP_chr1:2000-3500")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.COPY_GAIN,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("EMMA1", "EMMA2", false)) },
+ // dupINVdup with CG
+ { createVariantContext("chr1", 20, 3500, null, null, null,
+ "", 3480, null, "dupINVdup",
+ Arrays.asList("DUP_chr1:20-70", "INV_chr1:20-3500", "DUP_chr1:2000-3500")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.COPY_GAIN,
+ GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("EMMA1", "EMMA2", Arrays.asList("DNase", "Enhancer"), false)) },
+ // ignore INV for dDUP; ignore dDUP for promoter; CPX noncoding span; CPX intergenic
+ { createVariantContext("chr1", 1101, 1102, null, null, null,
+ "", 700, null, "dDUP_iDEL",
+ Arrays.asList("DUP_chr1:10-60", "INV_chr1:10-60","DEL_chr1:1100-1700")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.NEAREST_TSS,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("Enhancer", "EMMA1", true)) },
+ // Ignore INV and source INS in CPX_INTERVALS for INS_iDEL
+ { createVariantContext("chr1", 1101, 1102, null, null, null,
+ "", 700, null, "INS_iDEL",
+ Arrays.asList("INV_chr1:10-60","DEL_chr1:1100-1700","INS_chr1:10-60")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.NEAREST_TSS,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("Enhancer", "EMMA1", true)) },
+ // no noncoding breakpoint for DUP segment of CPX; modify INV interval; multiple genes for a consequence
+ { createVariantContext("chr1", 450, 3100, null, null, null,
+ "", 2650, null, "dupINV",
+ Arrays.asList("DUP_chr1:450-2200", "INV_chr1:450-3100")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, GATKSVVCFConstants.LOF,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList(Arrays.asList("EMMA1", "EMMA2"), "EMMA2", false)) },
+ // intergenic with partial dispersed dup
+ { createVariantContext("chr1", 1100, 1100, null, null, null,
+ "", 550, null, "dDUP",
+ Arrays.asList("DUP_chr1:450-1000")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, GATKSVVCFConstants.NEAREST_TSS,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("EMMA1", "EMMA1", true)) },
+ // merge INV + DEL for nearest TSS; noncoding breakpoint for CPX
+ { createVariantContext("chr1", 1400, 1900, null, null, null,
+ "", 500, null, "delINV",
+ Arrays.asList("DEL_chr1:1400-1500", "INV_chr1:1500-1900")),
+ createAttributesMap(
+ Arrays.asList(GATKSVVCFConstants.NONCODING_BREAKPOINT, GATKSVVCFConstants.NEAREST_TSS,
+ GATKSVVCFConstants.INTERGENIC),
+ Arrays.asList("Enhancer", "EMMA2", true)) },
{ createVariantContext("chr1", 50, 450, null, null, null,
"", 400, null, null, null),
createAttributesMap(
@@ -650,6 +903,4 @@ public void testAnnotateStructuralVariant(
Assert.assertEquals(actualAttributes, expectedAttributes);
}
-
-
-}
\ No newline at end of file
+}
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java
index b9c30729acc..f647a4afb37 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java
@@ -28,7 +28,8 @@ public class SVAnnotateIntegrationTest extends CommandLineProgramTest {
GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.COPY_GAIN,
GATKSVVCFConstants.INTERGENIC, GATKSVVCFConstants.NEAREST_TSS, GATKSVVCFConstants.INT_EXON_DUP,
GATKSVVCFConstants.PARTIAL_EXON_DUP, GATKSVVCFConstants.MSV_EXON_OVERLAP, GATKSVVCFConstants.UTR,
- GATKSVVCFConstants.INTRONIC, GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.BREAKEND_EXON);
+ GATKSVVCFConstants.INTRONIC, GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.BREAKEND_EXON,
+ GATKSVVCFConstants.PARTIAL_DISPERSED_DUP);
private void assertVariantAnnotatedAsExpected(final List vcf, final String variantID,
Map expectedAnnotations) {
diff --git a/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java
index e230f6c5375..4cc892dd931 100644
--- a/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java
@@ -260,6 +260,116 @@ public void testContains( final SimpleInterval firstInterval, final SimpleInterv
"contains() returned incorrect result for intervals " + firstInterval + " and " + secondInterval);
}
+ @DataProvider(name = "subtractIntervalData") // rows consumed by testSubtractInterval: { first, second, expected first.subtract(second) }
+ private Object[][] subtractIntervalData() {
+ return new Object[][] {
+ { new SimpleInterval("chr1", 10, 30), // second overlaps the downstream end of first
+ new SimpleInterval("chr1", 20, 40),
+ new SimpleInterval("chr1", 10, 20) }, // expected end (20) equals second's start
+ { new SimpleInterval("chr1", 10, 30), // second overlaps the upstream end of first
+ new SimpleInterval("chr1", 5, 15),
+ new SimpleInterval("chr1", 15, 30) }, // expected start (15) equals second's end
+ { new SimpleInterval("chr1", 10, 30), // second lies within first and shares its start
+ new SimpleInterval("chr1", 10, 20),
+ new SimpleInterval("chr1", 20, 30) }, // expected start (20) equals second's end
+ { new SimpleInterval("chr1", 10, 30), // second lies within first and shares its end
+ new SimpleInterval("chr1", 20, 30),
+ new SimpleInterval("chr1", 10, 20) } // expected end (20) equals second's start
+ };
+ }
+
+ @Test(dataProvider = "subtractIntervalData") // parameters are supplied by the subtractIntervalData provider
+ public void testSubtractInterval( final SimpleInterval firstInterval,
+ final SimpleInterval secondInterval,
+ final SimpleInterval expectedResult ) {
+ Assert.assertEquals(firstInterval.subtract(secondInterval), expectedResult); // first minus second must equal the expected interval
+ }
+
+ @DataProvider(name = "subtractIntervalDataExpectingException") // rows: { first, second } pairs for which first.subtract(second) is expected to throw
+ private Object[][] subtractIntervalDataExpectingException() {
+ return new Object[][] {
+ // different contigs (chr1 vs chr2)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr2", 20, 40) },
+ // non-overlapping intervals on same contig
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 50, 150) },
+ // second interval contains first (same start, second extends past first's end)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 10, 40) }
+ };
+ }
+ @Test(dataProvider = "subtractIntervalDataExpectingException", expectedExceptions = IllegalArgumentException.class) // passes only when the call throws IllegalArgumentException
+ public void testSubtractIntervalExpectingException( final SimpleInterval firstInterval,
+ final SimpleInterval secondInterval) {
+ firstInterval.subtract(secondInterval); // result intentionally unused; only the thrown exception is checked
+ }
+
+ @DataProvider(name = "mergeWithContiguousData") // rows consumed by testMergeWithContiguous: { first, second, expected first.mergeWithContiguous(second) }
+ private Object[][] mergeWithContiguousData() {
+ return new Object[][] {
+ // first is upstream, overlapping
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 20, 40),
+ new SimpleInterval("chr1", 10, 40) },
+ // first is downstream, overlapping
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 5, 15),
+ new SimpleInterval("chr1", 5, 30) },
+ // first contains second (expected result equals first)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 15, 20),
+ new SimpleInterval("chr1", 10, 30) },
+ // second contains first (expected result equals second)
+ { new SimpleInterval("chr1", 20, 30),
+ new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 10, 30) },
+ // first is upstream, overlapping by 1 (first ends at 30, second starts at 30)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 30, 50),
+ new SimpleInterval("chr1", 10, 50) },
+ // first is upstream, adjacent (first ends at 30, second starts at 31)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 31, 50),
+ new SimpleInterval("chr1", 10, 50) },
+ // first is downstream, overlapping by 1 (second ends at 40, first starts at 40)
+ { new SimpleInterval("chr1", 40, 60),
+ new SimpleInterval("chr1", 30, 40),
+ new SimpleInterval("chr1", 30, 60) },
+ // first is downstream, adjacent (second ends at 39, first starts at 40)
+ { new SimpleInterval("chr1", 40, 60),
+ new SimpleInterval("chr1", 30, 39),
+ new SimpleInterval("chr1", 30, 60) }
+ };
+ }
+
+ @Test(dataProvider = "mergeWithContiguousData") // parameters are supplied by the mergeWithContiguousData provider
+ public void testMergeWithContiguous( final SimpleInterval firstInterval,
+ final SimpleInterval secondInterval,
+ final SimpleInterval expectedResult ) {
+ Assert.assertEquals(firstInterval.mergeWithContiguous(secondInterval), expectedResult); // merged interval must equal the expected interval
+ }
+
+ @DataProvider(name = "mergeWithContiguousDataExpectingException") // rows: { first, second } pairs for which first.mergeWithContiguous(second) is expected to throw
+ private Object[][] mergeWithContiguousDataExpectingException() {
+ return new Object[][] {
+ // different contigs (chr1 vs chr2)
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr2", 20, 40) },
+ // non-contiguous intervals on same contig, first is upstream
+ { new SimpleInterval("chr1", 10, 30),
+ new SimpleInterval("chr1", 50, 150) },
+ // non-contiguous intervals on same contig, first is downstream
+ { new SimpleInterval("chr1", 20, 30),
+ new SimpleInterval("chr1", 5, 15) }
+ };
+ }
+ @Test(dataProvider = "mergeWithContiguousDataExpectingException", expectedExceptions = GATKException.class) // expects GATKException, unlike the subtract test which expects IllegalArgumentException
+ public void testMergeWithContiguousExpectingException( final SimpleInterval firstInterval,
+ final SimpleInterval secondInterval) {
+ firstInterval.mergeWithContiguous(secondInterval); // result intentionally unused; only the thrown exception is checked
+ }
+
@Test(expectedExceptions = IllegalArgumentException.class)
public void testNoNullInConstruction() throws Exception {
new SimpleInterval((String)null);