From e796d20a9e0c0d311db56beeb58cae9ff03bb6d7 Mon Sep 17 00:00:00 2001 From: epiercehoffman Date: Tue, 23 Jan 2024 14:26:41 -0500 Subject: [PATCH] Rewrite complex SV functional annotation in SVAnnotate (#8516) --- .../spark/sv/utils/GATKSVVCFConstants.java | 5 +- .../tools/walkers/sv/SVAnnotate.java | 6 + .../tools/walkers/sv/SVAnnotateEngine.java | 293 +++++++++++++-- .../hellbender/utils/SimpleInterval.java | 20 + .../walkers/sv/SVAnnotateEngineUnitTest.java | 345 +++++++++++++++--- .../walkers/sv/SVAnnotateIntegrationTest.java | 3 +- .../utils/SimpleIntervalUnitTest.java | 110 ++++++ 7 files changed, 698 insertions(+), 84 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 95bb083f03b..c7586265990 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -89,7 +89,9 @@ public enum ComplexVariantSubtype { piDUP_RF, dDUP, dDUP_iDEL, - INS_iDEL + INS_iDEL, + CTX_PP_QQ, + CTX_PQ_QP } // not defined in output vcf header but used in internal id that is currently output in the ID column @@ -163,6 +165,7 @@ public enum ComplexVariantSubtype { public static final String NONCODING_BREAKPOINT = "PREDICTED_NONCODING_BREAKPOINT"; public static final String NEAREST_TSS = "PREDICTED_NEAREST_TSS"; public static final String TSS_DUP = "PREDICTED_TSS_DUP"; + public static final String PARTIAL_DISPERSED_DUP = "PREDICTED_PARTIAL_DISPERSED_DUP"; // SVTYPE classes public enum StructuralVariantAnnotationType { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java index bd808781c74..9c9e2f11efc 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotate.java @@ -123,6 +123,11 @@ * duplicated. The partial duplication occurs when a duplication has one breakpoint within the transcript and one * breakpoint after the end of the transcript. When the duplication is in tandem, the result is that there is one * intact copy of the full endogenous gene.

+ *
  • PREDICTED_PARTIAL_DISPERSED_DUP
    + * Gene(s) which are partially overlapped by the duplicated segment involved in an SV's dispersed duplication. + * This annotation is applied to a dispersed (non-tandem) duplication segment that is part of a complex SV if the + * duplicated segment overlaps part of a transcript but not the entire transcript (which would be a + * PREDICTED_COPY_GAIN event).

  • *
  • PREDICTED_INV_SPAN
    * Gene(s) which are entirely spanned by an SV's inversion. A whole-gene inversion occurs when an inversion spans * the entire transcript, from the first base of the 5' UTR to the last base of the 3' UTR.

  • @@ -354,6 +359,7 @@ private void addAnnotationInfoKeysToHeader(final VCFHeader header) { header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NONCODING_SPAN, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Class(es) of noncoding elements spanned by SV.")); header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NONCODING_BREAKPOINT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Class(es) of noncoding elements disrupted by SV breakpoint.")); header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.NEAREST_TSS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Nearest transcription start site to an intergenic variant.")); + header.addMetaDataLine(new VCFInfoHeaderLine(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Gene(s) overlapped partially by the duplicated interval involved in a dispersed duplication event in a complex SV.")); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java index b82da470e65..7c977602303 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngine.java @@ -12,11 +12,13 @@ import org.broadinstitute.hellbender.utils.SVInterval; import org.broadinstitute.hellbender.utils.SVIntervalTree; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature; import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfTranscriptFeature; import org.broadinstitute.hellbender.utils.variant.GATKSVVariantContextUtils; import java.util.*; +import java.util.stream.Collectors; public class SVAnnotateEngine { private final int maxBreakendLen; @@ -24,13 +26,37 @@ public class SVAnnotateEngine { private final SVIntervalTree nonCodingIntervalTree; private final SAMSequenceDictionary sequenceDictionary; - private final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF, + @VisibleForTesting + protected static final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.INT_EXON_DUP, GATKSVVCFConstants.DUP_PARTIAL, GATKSVVCFConstants.PARTIAL_EXON_DUP, GATKSVVCFConstants.COPY_GAIN, GATKSVVCFConstants.TSS_DUP); + @VisibleForTesting + protected static final Set PROTEIN_CODING_CONSEQUENCES = Sets.newHashSet(GATKSVVCFConstants.LOF, + GATKSVVCFConstants.INT_EXON_DUP, + GATKSVVCFConstants.DUP_PARTIAL, + GATKSVVCFConstants.PARTIAL_EXON_DUP, + GATKSVVCFConstants.COPY_GAIN, + GATKSVVCFConstants.TSS_DUP, + GATKSVVCFConstants.INV_SPAN, + GATKSVVCFConstants.MSV_EXON_OVERLAP, + GATKSVVCFConstants.UTR, + GATKSVVCFConstants.INTRONIC, + GATKSVVCFConstants.BREAKEND_EXON); + + @VisibleForTesting + protected static final Set COMPLEX_SUBTYPES_WITH_DISPERSED_DUP = + Sets.newHashSet(GATKSVVCFConstants.ComplexVariantSubtype.dDUP, + GATKSVVCFConstants.ComplexVariantSubtype.dupINV, + GATKSVVCFConstants.ComplexVariantSubtype.INVdup, + GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup, + GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel, + GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, + GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL); + // Mini class to package SV type and interval into one object @VisibleForTesting protected static final class SVSegment { @@ -211,14 +237,21 @@ protected static String annotateDeletion(final SimpleInterval variantInterval, * Get consequence of duplication variant on transcript * @param variantInterval - SimpleInterval representing structural variant * @param gtfTranscript - protein-coding GTF transcript + * @param isDispersedDuplication - boolean: true if duplication segment is dispersed, false otherwise * @return - consequence of duplication variant on transcript */ @VisibleForTesting protected static String annotateDuplication(final SimpleInterval variantInterval, - final GencodeGtfTranscriptFeature gtfTranscript) { + final GencodeGtfTranscriptFeature gtfTranscript, + boolean isDispersedDuplication) { final SimpleInterval transcriptInterval = new SimpleInterval(gtfTranscript); if (variantSpansFeature(variantInterval, transcriptInterval)) { + // return COPY_GAIN immediately because same regardless of tandem or dispersed (isDispersedDuplication) return GATKSVVCFConstants.COPY_GAIN; + } else if (isDispersedDuplication) { + // all DUP segments in CPX events are currently dispersed duplications, not tandem. So + // if not COPY_GAIN, then partial gene overlap --> if complex, immediate PARTIAL_DISPERSED_DUP + return GATKSVVCFConstants.PARTIAL_DISPERSED_DUP; } else if (variantOverlapsTranscriptionStartSite(variantInterval, gtfTranscript)) { return GATKSVVCFConstants.TSS_DUP; } else if (!transcriptInterval.contains(variantInterval)) { @@ -276,7 +309,7 @@ protected static String annotateDuplication(final SimpleInterval variantInterval protected static String annotateCopyNumberVariant(final SimpleInterval variantInterval, final GencodeGtfTranscriptFeature gtfTranscript, final Set MSVExonOverlapClassifications) { - final String consequence = annotateDuplication(variantInterval, gtfTranscript); + final String consequence = annotateDuplication(variantInterval, gtfTranscript, false); if (MSVExonOverlapClassifications.contains(consequence)) { return GATKSVVCFConstants.MSV_EXON_OVERLAP; } else { @@ -338,12 +371,14 @@ protected static String annotateBreakend(final SimpleInterval variantInterval, * Add consequence of structural variant on an overlapping transcript to consequence dictionary for variant * @param variantInterval - SimpleInterval representing structural variant * @param svType - SV type + * @param includesDispersedDuplication - boolean: true if SV type contains dispersed duplication(s), false if not * @param transcript - protein-coding GTF transcript * @param variantConsequenceDict - running map of consequence -> feature name for variant to update */ @VisibleForTesting protected void annotateTranscript(final SimpleInterval variantInterval, final GATKSVVCFConstants.StructuralVariantAnnotationType svType, + final boolean includesDispersedDuplication, final GencodeGtfTranscriptFeature transcript, final Map> variantConsequenceDict) { final String consequence; @@ -355,7 +390,9 @@ protected void annotateTranscript(final SimpleInterval variantInterval, consequence = annotateInsertion(variantInterval, transcript); break; case DUP: - consequence = annotateDuplication(variantInterval, transcript); + // if SV includesDispersedDuplication, every DUP segment in the SV is treated as dispersed + // this assumption holds for the complex subtypes currently resolved by GATK-SV + consequence = annotateDuplication(variantInterval, transcript, includesDispersedDuplication); break; case CNV: consequence = annotateCopyNumberVariant(variantInterval,transcript, MSV_EXON_OVERLAP_CLASSIFICATIONS); @@ -395,6 +432,7 @@ private void annotatePromoterOverlaps(final SimpleInterval variantInterval, for (final Iterator> it = promotersForVariant; it.hasNext(); ) { final SVIntervalTree.Entry promoterEntry = it.next(); final String promoterName = promoterEntry.getValue(); + // only annotate promoter overlap if there is no coding annotation for the gene if (!codingAnnotationGenes.contains(promoterName)) { updateVariantConsequenceDict(variantConsequenceDict, GATKSVVCFConstants.PROMOTER, promoterName); } @@ -477,11 +515,13 @@ protected static GATKSVVCFConstants.StructuralVariantAnnotationType getSVType(fi * Add protein-coding annotations for any transcripts overlapping the variant to the variant consequence dictionary * @param variantInterval - SimpleInterval representing structural variant * @param svType - SV type + * @param includesDispersedDuplication - boolean: true if SV type contains dispersed duplication(s), false otherwise * @param variantConsequenceDict - running map of consequence -> feature name for variant to update */ @VisibleForTesting protected void annotateGeneOverlaps(final SimpleInterval variantInterval, final GATKSVVCFConstants.StructuralVariantAnnotationType svType, + final boolean includesDispersedDuplication, final Map> variantConsequenceDict) { final Iterator> gtfTranscriptsForVariant = gtfIntervalTrees.getTranscriptIntervalTree().overlappers( @@ -489,24 +529,101 @@ protected void annotateGeneOverlaps(final SimpleInterval variantInterval, ); for (Iterator> it = gtfTranscriptsForVariant; it.hasNext(); ) { SVIntervalTree.Entry transcriptEntry = it.next(); - annotateTranscript(variantInterval, svType, transcriptEntry.getValue(), variantConsequenceDict); + annotateTranscript(variantInterval, svType, includesDispersedDuplication, transcriptEntry.getValue(), + variantConsequenceDict); } } + + /** + * Parse CPX_INTERVALS field into a list of SV segments. + * Format of each item in CPX_INTERVALS is "SVTYPE_CHROM:POS-END" + * @param cpxIntervals - list of String elements from CPX_INTERVALS field, each describing one segment of a complex SV + * @return - List of SVSegments representing components of the complex SV represented in CPX_INTERVALS + */ + @VisibleForTesting + protected static List parseComplexIntervals(final List cpxIntervals) { + final List segments = new ArrayList<>(cpxIntervals.size() + 1); + for (final String cpxInterval : cpxIntervals) { + final String[] parsed = cpxInterval.split("_"); + final GATKSVVCFConstants.StructuralVariantAnnotationType svTypeForInterval = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(parsed[0]); + final SimpleInterval interval = new SimpleInterval(parsed[1]); + segments.add(new SVSegment(svTypeForInterval, interval)); + } + return segments; + } + + /** - * Parse one interval string from CPX_INTERVALS INFO field into an SVSegment representing the SV type and - * interval of one of the components of the complex event - * @param cpxInterval - one element from CPX_INTERVALS list, a string representing one component of complex SV - * @return - SVSegment representing one component of the complex SV (type and interval) + * Return modified complex SV intervals list ready for protein-coding annotation. + * Start from SV segments from CPX_INTERVALS and ignore or adjust intervals as needed: + * (1) Ignore INV segments in dDUP, dDUP_iDEL, and INS_iDEL events because they describe an inversion in the inserted + * sequence that has no impact on the source sequence. + * (2) Adjust INV segments in dupINV, INVdup, dupINVdup, dupINVdel, and delINVdup events by subtracting the portion + * of the interval overlapped by a DUP segment, because that describes an inversion in the duplicated copy that has no + * impact on the source sequence. + * (3) Ignore INS segment in INS_iDEL if present because it represents the source sequence and the impact on the + * source (DUP or DEL) is unknown and cannot be annotated. + * @param cpxIntervals - list of SVSegments representing complex SV intervals from CPX_INTERVALS field + * @param complexType - Complex SV event type category, from CPX_TYPE field + * @return - List of SVSegments representing component of the complex SV (type and interval) to annotate for + * protein-coding consequences */ @VisibleForTesting - protected static SVSegment parseCPXIntervalString(final String cpxInterval) { - final String[] parsed = cpxInterval.split("_"); - final GATKSVVCFConstants.StructuralVariantAnnotationType svTypeForInterval = GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(parsed[0]); - final SimpleInterval interval = new SimpleInterval(parsed[1]); - return new SVSegment(svTypeForInterval, interval); + protected static List getComplexAnnotationIntervals(final List cpxIntervals, + final GATKSVVCFConstants.ComplexVariantSubtype complexType) { + final List segments = new ArrayList<>(cpxIntervals.size()); + final List dupIntervals = new ArrayList<>(cpxIntervals.size()); + SimpleInterval inversionIntervalToAdjust = null; + boolean keepSegment; + for (final SVSegment originalSegment : cpxIntervals) { + keepSegment = true; + if (originalSegment.getIntervalSVType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INV) { + // ignore INV segment for dDUP or dDUP_iDEL or INS_iDEL + // because it is an inversion of the inserted sequence relative to the origin + // but has no impact on the origin site + if (complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL) { + keepSegment = false; + } + // save INV interval to adjust later for dupINV / INVdup / dupINVdup / dupINVdel / delINVdup + // because INV interval includes duplicated sequence that is inverted relative to origin when re-inserted + // but there is no inversion at the origin site for this sequence + else if (complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINV || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.INVdup || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdup || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel) { + inversionIntervalToAdjust = originalSegment.getInterval(); + keepSegment = false; + } + } else if (originalSegment.getIntervalSVType() == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) { + dupIntervals.add(originalSegment.getInterval()); + } else if (originalSegment.getIntervalSVType() == GATKSVVCFConstants.StructuralVariantAnnotationType.INS && + complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL) { + // if there is an INS interval in CPX_INTERVALS for INS_iDEL, ignore it + // because it represents the origin of the inserted sequence and the status (DUP or DEL) is unknown + keepSegment = false; + } + if (keepSegment) { + segments.add(originalSegment); + } + } + // adjust INV interval for dupINV / INVdup / dupINVdup / dupINVdel / delINVdup + // to remove portion that overlaps duplicated sequence (not inverted at sequence origin so no impact) + // and keep portion that represents sink site breakpoints + if (inversionIntervalToAdjust != null) { + for (final SimpleInterval dupInterval : dupIntervals) { + inversionIntervalToAdjust = inversionIntervalToAdjust.subtract(dupInterval); + } + segments.add(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV, inversionIntervalToAdjust)); + } + + return segments; } + /** * Get SV type to use for annotation for a breakend VCF record * Breakend may represent BND, CTX, or DEL / DUP if the user specifies {@code SVAnnotate.MAX_BND_LEN_NAME} @@ -519,11 +636,12 @@ protected static SVSegment parseCPXIntervalString(final String cpxInterval) { * @return - SV type to use for annotation of breakend record */ private static GATKSVVCFConstants.StructuralVariantAnnotationType getAnnotationTypeForBreakend(final VariantContext variant, - final String complexType, + final GATKSVVCFConstants.ComplexVariantSubtype complexType, final int maxBreakendLen, final int svLen, final String chrom, final String chr2) { - if (complexType != null && complexType.contains("CTX")) { + if (complexType != null && (complexType == GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.CTX_PQ_QP)) { return GATKSVVCFConstants.StructuralVariantAnnotationType.CTX; } else if (maxBreakendLen > 0 && chr2 != null && chrom.equals(chr2) && svLen <= maxBreakendLen) { // if maxBreakendLenForOverlapAnnotation argument provided, annotate as DUP or DEL if applicable @@ -554,26 +672,25 @@ private static GATKSVVCFConstants.StructuralVariantAnnotationType getAnnotationT @VisibleForTesting protected static List getSVSegments(final VariantContext variant, final GATKSVVCFConstants.StructuralVariantAnnotationType overallSVType, - final int maxBreakendLen) { + final int maxBreakendLen, + final GATKSVVCFConstants.ComplexVariantSubtype complexType) { final List intervals; - final String complexType = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null); final String chrom = variant.getContig(); final int pos = variant.getStart(); final String chr2 = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null); final int end2 = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, pos); if (overallSVType.equals(GATKSVVCFConstants.StructuralVariantAnnotationType.CPX)) { - final List cpxIntervalsString = variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null); - if (cpxIntervalsString == null) { + final List cpxIntervals = variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null); + if (cpxIntervals.isEmpty()) { throw new UserException("Complex (CPX) variant must contain CPX_INTERVALS INFO field"); } if (complexType == null) { throw new UserException("Complex (CPX) variant must contain CPX_TYPE INFO field"); } - intervals = new ArrayList<>(cpxIntervalsString.size() + 1); - for (final String cpxInterval : cpxIntervalsString) { - intervals.add(parseCPXIntervalString(cpxInterval)); - } - if (complexType.contains("dDUP")) { + intervals = getComplexAnnotationIntervals(parseComplexIntervals(cpxIntervals), complexType); + // add sink site as INS for dDUP (encoded in CHROM and POS instead of INFO/CPX_INTERVALS) + // no need to add sink site INS for INS_iDEL or dDUP_iDEL because DEL coordinates contain sink site + if (complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP) { intervals.add(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INS, new SimpleInterval(chrom, pos, pos + 1))); } @@ -620,7 +737,62 @@ protected static List getSVSegments(final VariantContext variant, return intervals; } + /** + * Returns a list of SVSegments to use for promoter & noncoding annotations + * For simple (non-complex) SVs, returns original list of segments + * For complex SVs, returns a new subsetted list of SVSegments without DUP segments, which are always dispersed + * (never tandem) in current set of defined CPX subtypes, and so are not considered for noncoding annotations. + * @param svSegments - List of SVSegments used for gene overlap annotations + * @return - Subsetted list of SVSegments to use for promoter & noncoding annotations for CPX SVs + */ + @VisibleForTesting + protected static List getSegmentsForNonCodingAnnotations(final List svSegments, + final boolean isComplex) { + if (isComplex) { + return svSegments.stream() + .filter(seg -> seg.getIntervalSVType() != GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) + .collect(Collectors.toList()); + } + else { + return svSegments; + } + } + /** + * Returns a list of SVSegments to use for nearest TSS annotations. + * Must apply on the output of getSegmentsForNonCodingAnnotations. + * For simple (non-complex) SVs, returns original list of segments. + * For complex SVs, merges remaining intervals (DEL, INV) into a single interval for deletion-containing CPX events. + * DUP segments are already removed from complex SVs. + * @param svSegments - List of SVSegments used for gene overlap annotations + * @return - List of SVSegments to use for nearest TSS annotations for CPX SVs + */ + @VisibleForTesting + protected static List getSegmentForNearestTSS(final List svSegments, + final GATKSVVCFConstants.ComplexVariantSubtype complexType) { + // for dDUP_iDEL, INS_iDEL, delINV, INVdel, dupINVdel, delINVdup, delINVdel --> merge all remaining SV segments + // which will be INS, DEL, INV types (DUPs already removed) + // so that there is only one nearest TSS based on outer breakpoints of CPX event + if (complexType != null && + (complexType == GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINV || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.INVdel || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdup || + complexType == GATKSVVCFConstants.ComplexVariantSubtype.delINVdel)) { + SimpleInterval spanningSegment = svSegments.get(0).getInterval(); + for (int i = 1; i < svSegments.size(); i++) { + spanningSegment = spanningSegment.mergeWithContiguous(svSegments.get(i).getInterval()); + } + return Collections.singletonList(new SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + spanningSegment)); + } else { + // if not complex return original list of segments + // if complex & dDUP, dupINV, INVdup, dupINVdup --> no further modifications (already adjusted INV, removed DUPs) + return svSegments; + } + } /** * Create a copy of the variant consequence dictionary in which the feature names for each consequence are sorted @@ -639,6 +811,39 @@ protected static Map sortVariantConsequenceDict(final Map> variantConsequenceDict, + final Set proteinCodingConsequences) { + for (final String consequence : variantConsequenceDict.keySet()) { + if (proteinCodingConsequences.contains(consequence)) { + // if the SV has protein-coding consequences other than PARTIAL_DISPERSED_DUP then it is not INTERGENIC + return false; + } + } + // if the SV has no protein-coding consequences then it is intergenic + return true; + } + + /** + * Checks if a variant includes one or more dispersed duplications + * @param complexType - the complex subtype of the SV + * @param complexSubtypesWithDispersedDup - the set of complex subtypes containing one or more dispersed duplications + * @return - boolean: true if the SV's complexType is in the set of complexSubtypesWithDispersedDup, false otherwise + */ + @VisibleForTesting + protected static boolean includesDispersedDuplication(final GATKSVVCFConstants.ComplexVariantSubtype complexType, + final Set complexSubtypesWithDispersedDup) { + return complexType != null && complexSubtypesWithDispersedDup.contains(complexType); + } + /** * Create a consequence -> feature name map and add all annotations for protein-coding, promoter, nearest TSS, * and noncoding consequences for a variant @@ -649,40 +854,58 @@ protected static Map sortVariantConsequenceDict(final Map annotateStructuralVariant(final VariantContext variant) { final Map> variantConsequenceDict = new HashMap<>(); final GATKSVVCFConstants.StructuralVariantAnnotationType overallSVType = getSVType(variant); - final List svSegments = getSVSegments(variant, overallSVType, maxBreakendLen); + final String complexTypeString = variant.getAttributeAsString(GATKSVVCFConstants.CPX_TYPE, null); + GATKSVVCFConstants.ComplexVariantSubtype complexType = null; + if (complexTypeString != null) { + // replace / in CTX_PP/QQ and CTX_PQ/QP with _ to match ComplexVariantSubtype constants which cannot contain slashes + complexType = GATKSVVCFConstants.ComplexVariantSubtype.valueOf(complexTypeString.replace("/", "_")); + } + final boolean includesDispersedDuplication = includesDispersedDuplication(complexType, COMPLEX_SUBTYPES_WITH_DISPERSED_DUP); + final List svSegmentsForGeneOverlaps = getSVSegments(variant, overallSVType, maxBreakendLen, complexType); + + // annotate gene overlaps if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptIntervalTree() != null) { - for (SVSegment svSegment : svSegments) { - annotateGeneOverlaps(svSegment.getInterval(), svSegment.getIntervalSVType(), variantConsequenceDict); + for (SVSegment svSegment : svSegmentsForGeneOverlaps) { + annotateGeneOverlaps(svSegment.getInterval(), svSegment.getIntervalSVType(), + includesDispersedDuplication, variantConsequenceDict); } } - // if variant consequence dictionary is empty (no protein-coding annotations), apply INTERGENIC flag - final boolean noCodingAnnotations = variantConsequenceDict.isEmpty(); + // if variant consequence dictionary contains no protein-coding consequences, apply INTERGENIC flag + final boolean isIntergenic = isIntergenic(variantConsequenceDict, PROTEIN_CODING_CONSEQUENCES); + + // get SV segments to annotate promoter & noncoding consequences + final List svSegmentsForNonCodingAnnotations = + getSegmentsForNonCodingAnnotations(svSegmentsForGeneOverlaps, includesDispersedDuplication); // then annotate promoter overlaps and non-coding feature overlaps if (gtfIntervalTrees != null && gtfIntervalTrees.getPromoterIntervalTree() != null) { - for (final SVSegment svSegment : svSegments) { + for (final SVSegment svSegment : svSegmentsForNonCodingAnnotations) { annotatePromoterOverlaps(svSegment.getInterval(), variantConsequenceDict); } } if (nonCodingIntervalTree != null) { - for (SVSegment svSegment : svSegments) { + for (SVSegment svSegment : svSegmentsForNonCodingAnnotations) { annotateNonCodingOverlaps(svSegment.getInterval(), variantConsequenceDict); } } + // get list of SV segments to annotate nearest TSS + List svSegmentsForNearestTSS = + getSegmentForNearestTSS(svSegmentsForNonCodingAnnotations, complexType); + // annotate nearest TSS for intergenic variants with no promoter overlaps if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptionStartSiteTree() != null && - !variantConsequenceDict.containsKey(GATKSVVCFConstants.PROMOTER) && noCodingAnnotations) { - for (SVSegment svSegment : svSegments) { + !variantConsequenceDict.containsKey(GATKSVVCFConstants.PROMOTER) && isIntergenic) { + for (SVSegment svSegment : svSegmentsForNearestTSS) { annotateNearestTranscriptionStartSite(svSegment.getInterval(), variantConsequenceDict); } } final Map attributes = sortVariantConsequenceDict(variantConsequenceDict); if (gtfIntervalTrees != null && gtfIntervalTrees.getTranscriptIntervalTree() != null) { - attributes.put(GATKSVVCFConstants.INTERGENIC, noCodingAnnotations); + attributes.put(GATKSVVCFConstants.INTERGENIC, isIntergenic); } return attributes; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java b/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java index bade66094bf..ebbce974105 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/SimpleInterval.java @@ -1,6 +1,7 @@ package org.broadinstitute.hellbender.utils; +import com.google.common.annotations.VisibleForTesting; import htsjdk.samtools.SAMSequenceDictionary; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.util.Locatable; @@ -275,6 +276,25 @@ public SimpleInterval intersect( final Locatable that ) { Math.min( getEnd(), that.getEnd()) ); } + /** + * Get section of starting interval (this) that is not overlapped by the other interval (that) + * @param that - interval to subtract from starting interval. Must overlap (but not fully contain) starting interval + * @return - SimpleInterval representing the portion of starting interval (this) not overlapped by other interval (that) + */ + @VisibleForTesting + public SimpleInterval subtract(final Locatable that) { + Utils.validateArg(this.overlaps(that), () -> + "SimpleIntervaL::subtract(): The two intervals need to overlap: " + this + ", " + that); + Utils.validateArg(!that.contains(this), () -> + "SimpleIntervaL::subtract(): Interval to subtract " + that + " cannot contain starting interval " + this); + if (this.getStart() < that.getStart()) { + return new SimpleInterval(this.getContig(), this.getStart(), that.getStart()); + } + else { + return new SimpleInterval(this.getContig(), that.getEnd(), this.getEnd()); + } + } + /** * Returns a new SimpleInterval that represents the entire span of this and that. Requires that * this and that SimpleInterval are contiguous. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java index 867e0222221..80f8312ff0c 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateEngineUnitTest.java @@ -25,12 +25,6 @@ public class SVAnnotateEngineUnitTest extends GATKBaseTest { private final File TOY_GTF_FILE = new File(getToolTestDataDir().replaceFirst("Engine", "") + "unittest.gtf"); private final File TINY_NONCODING_BED_FILE = new File(getToolTestDataDir().replaceFirst("Engine", "") + "noncoding.unittest.bed"); - private final Set MSV_EXON_OVERLAP_CLASSIFICATIONS = Sets.newHashSet(GATKSVVCFConstants.LOF, - GATKSVVCFConstants.INT_EXON_DUP, - GATKSVVCFConstants.DUP_PARTIAL, - GATKSVVCFConstants.PARTIAL_EXON_DUP, - GATKSVVCFConstants.COPY_GAIN, - GATKSVVCFConstants.TSS_DUP); // Pairs of intervals with different relationships to check if first (variant) interval spans second (feature) @@ -175,14 +169,15 @@ public void testAnnotateIntervalSVTypes( ) { final GencodeGtfTranscriptFeature toyTranscript = loadToyGtfTranscript(); - final String actualDuplicationConsequence = SVAnnotateEngine.annotateDuplication(toyVariant, toyTranscript); + final String actualDuplicationConsequence = SVAnnotateEngine.annotateDuplication(toyVariant, toyTranscript, false); Assert.assertEquals(actualDuplicationConsequence, expectedDuplicationConsequence); final String actualDeletionConsequence = SVAnnotateEngine.annotateDeletion(toyVariant, toyTranscript); Assert.assertEquals(actualDeletionConsequence, expectedDeletionConsequence); final String actualCopyNumberVariantConsequence = - SVAnnotateEngine.annotateCopyNumberVariant(toyVariant, toyTranscript, MSV_EXON_OVERLAP_CLASSIFICATIONS); + SVAnnotateEngine.annotateCopyNumberVariant(toyVariant, toyTranscript, + SVAnnotateEngine.MSV_EXON_OVERLAP_CLASSIFICATIONS); Assert.assertEquals(actualCopyNumberVariantConsequence, expectedCopyNumberVariantConsequence); final String actualInversionConsequence = SVAnnotateEngine.annotateInversion(toyVariant, toyTranscript); @@ -244,21 +239,143 @@ public void testAnnotatePointSVTypes( Assert.assertEquals(actualTwoBaseTranslocationConsequence, expectedTranslocationVariantConsequence); } + /** + * Create list of SV segments with SAME SVTYPE - convenience function for testing getSVSegments + * @param svType - SV type for all segments + * @param intervals - list of intervals + * @return - list of SV segments with provided SV type, one for each interval + */ + private List createListOfSVSegments(final GATKSVVCFConstants.StructuralVariantAnnotationType svType, + final SimpleInterval[] intervals) { + final List segments = new ArrayList<>(intervals.length); + for (final SimpleInterval interval : intervals) { + segments.add(new SVAnnotateEngine.SVSegment(svType, interval)); + } + return segments; + } + + /** + * Create list of SV segments with different SVTYPEs - convenience function + * @param svTypes - list of SV types + * @param intervals - list of intervals + * @return - list of SV segments + */ + private List createListOfSVSegmentsDifferentTypes(final GATKSVVCFConstants.StructuralVariantAnnotationType[] svTypes, + final SimpleInterval[] intervals) { + Assert.assertEquals(svTypes.length, intervals.length); + final List segments = new ArrayList<>(intervals.length); + for (int i = 0; i < svTypes.length; i++) { + segments.add(new SVAnnotateEngine.SVSegment(svTypes[i], intervals[i])); + } + return segments; + } + + // CPX_TYPE & CPX_INTERVALS INFO fields specifying complex variant intervals, and expected SV segments + @DataProvider(name = "complexVariantIntervals") + public Object[][] getComplexVariantIntervalsTestData() { + return new Object[][] { + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, "DUP_chr1:280-420", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP }, + new SimpleInterval[]{new SimpleInterval("chr1", 280, 420)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINV, "DUP_chr1:44355904-44356327,INV_chr1:44355904-44357498", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV}, + new SimpleInterval[]{new SimpleInterval("chr1", 44355904, 44356327), + new SimpleInterval("chr1", 44356327, 44357498)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup, "DUP_chr1:247660974-248129213,INV_chr1:247660974-248587216,DUP_chr1:248520217-248587216", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV}, + new SimpleInterval[]{new SimpleInterval("chr1", 247660974, 248129213), + new SimpleInterval("chr1", 248520217, 248587216), + new SimpleInterval("chr1", 248129213, 248520217)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, "DUP_chr2:131488885-131489335,INV_chr2:131488885-131489335,DEL_chr2:130185450-130185720", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL}, + new SimpleInterval[]{new SimpleInterval("chr2", 131488885,131489335), + new SimpleInterval("chr2", 130185450,130185720)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, "DUP_chr3:95751919-95752156,DEL_chr3:95746923-95749272", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL}, + new SimpleInterval[]{new SimpleInterval("chr3", 95751919,95752156), + new SimpleInterval("chr3", 95746923,95749272)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL, "DEL_chr3:60521333-60521483", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL}, + new SimpleInterval[]{new SimpleInterval("chr3", 60521333,60521483)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINV, "DEL_chr2:120379742-120383130,INV_chr2:120383130-120384123", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV}, + new SimpleInterval[]{new SimpleInterval("chr2", 120379742,120383130), + new SimpleInterval("chr2", 120383130,120384123)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.INVdel, "INV_chr2:122719025-122719803,DEL_chr2:122719803-122724929", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.INV, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL}, + new SimpleInterval[]{new SimpleInterval("chr2", 122719025,122719803), + new SimpleInterval("chr2", 122719803,122724929)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, "DEL_chr2:54002577-54003019,INV_chr2:54003019-54006204,DUP_chr2:54006057-54006204", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV}, + new SimpleInterval[]{new SimpleInterval("chr2", 54002577,54003019), + new SimpleInterval("chr2", 54006057,54006204), + new SimpleInterval("chr2", 54003019,54006057)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel, "DUP_chr2:4157678-4157846,INV_chr2:4157678-4165085,DEL_chr2:4165085-4175888", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV}, + new SimpleInterval[]{new SimpleInterval("chr2", 4157678,4157846), + new SimpleInterval("chr2", 4165085,4175888), + new SimpleInterval("chr2", 4157846,4165085)}) }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINVdel, "DEL_chr2:62384663-62387814,INV_chr2:62387814-62388322,DEL_chr2:62388322-62390272", + createListOfSVSegmentsDifferentTypes(new GATKSVVCFConstants.StructuralVariantAnnotationType[]{ + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, + GATKSVVCFConstants.StructuralVariantAnnotationType.INV, + GATKSVVCFConstants.StructuralVariantAnnotationType.DEL}, + new SimpleInterval[]{new SimpleInterval("chr2", 62384663,62387814), + new SimpleInterval("chr2", 62387814,62388322), + new SimpleInterval("chr2", 62388322,62390272)}) } + }; + } + + // Test getComplexAnnotationIntervals() + @Test(dataProvider = "complexVariantIntervals") + public void testGetSegmentsFromComplexIntervals( + final GATKSVVCFConstants.ComplexVariantSubtype complexType, + final String cpxIntervalsString, + final List expectedSVSegments + ) { + final List cpxIntervals = SVAnnotateEngine.parseComplexIntervals(Arrays.asList(cpxIntervalsString.split(","))); + final List actualSegments = SVAnnotateEngine.getComplexAnnotationIntervals(cpxIntervals, + complexType); + assertSegmentListEqual(actualSegments, expectedSVSegments); + } + // CPX_INTERVALS INFO field string specifying complex variant intervals, and expected annotation(s) @DataProvider(name = "toyComplexVariants") public Object[][] getToyComplexVariantTestData() { return new Object[][] { - { "DUP_chr1:280-420", Sets.newHashSet(GATKSVVCFConstants.INT_EXON_DUP) }, - { "INV_chr1:90-1010,DUP_chr1:890-1010", - Sets.newHashSet(GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.DUP_PARTIAL) }, - { "DEL_chr1:250-450,INV_chr1:450-650,DUP_chr1:610-650", - Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.INTRONIC) } + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, "DUP_chr1:280-420", Sets.newHashSet(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) }, + { GATKSVVCFConstants.ComplexVariantSubtype.INVdup, "INV_chr1:90-1010,DUP_chr1:890-1010", + Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, "DEL_chr1:250-450,INV_chr1:450-650,DUP_chr1:610-650", + Sets.newHashSet(GATKSVVCFConstants.LOF, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP) } }; } - // Test annotation of CPX events from CPX_INTERVALS string + // Test annotation of CPX events for gene overlaps from CPX_INTERVALS string @Test(dataProvider = "toyComplexVariants") public void testAnnotateComplexEvents( + final GATKSVVCFConstants.ComplexVariantSubtype complexType, final String cpxIntervalsString, final Set expectedConsequences ) { @@ -272,28 +389,36 @@ public void testAnnotateComplexEvents( SVAnnotateEngine svAnnotateEngine = new SVAnnotateEngine(gtfTrees, null, sequenceDictionary, -1); - final String[] cpxIntervalStrings = cpxIntervalsString.split(","); - for (String cpxIntervalString : cpxIntervalStrings) { - SVAnnotateEngine.SVSegment cpxSegment = SVAnnotateEngine.parseCPXIntervalString(cpxIntervalString); - svAnnotateEngine.annotateGeneOverlaps(cpxSegment.getInterval(), cpxSegment.getIntervalSVType(), + final List cpxIntervals = SVAnnotateEngine.parseComplexIntervals(Arrays.asList(cpxIntervalsString.split(","))); + final List cpxSegments = SVAnnotateEngine.getComplexAnnotationIntervals(cpxIntervals, complexType); + for (final SVAnnotateEngine.SVSegment cpxSegment: cpxSegments) { + svAnnotateEngine.annotateGeneOverlaps(cpxSegment.getInterval(), cpxSegment.getIntervalSVType(), true, variantConsequenceDict); } Assert.assertEquals(variantConsequenceDict.keySet(), expectedConsequences); } + private Map> createVariantConsequenceDict(final String[] consequences, + final String[] features) { + Assert.assertEquals(consequences.length, features.length); + final Map> map = new HashMap<>(); + for (int i = 0; i < consequences.length; i++) { + SVAnnotateEngine.updateVariantConsequenceDict(map, consequences[i], features[i]); + } + return map; + } + // Test sortVariantConsequenceDict() sorts lists of genes in variant consequence map @Test public void testSortVariantConsequenceDict() { - final Map> before = new HashMap<>(); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "NOC2L"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "KLHL17"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "PLEKHN1"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "PERM1"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.DUP_PARTIAL, "SAMD11"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.LOF, "HES4"); - SVAnnotateEngine.updateVariantConsequenceDict(before, GATKSVVCFConstants.TSS_DUP, "ISG15"); + final Map> before = createVariantConsequenceDict( + new String[]{GATKSVVCFConstants.LOF, GATKSVVCFConstants.LOF, GATKSVVCFConstants.LOF, + GATKSVVCFConstants.LOF, GATKSVVCFConstants.DUP_PARTIAL, GATKSVVCFConstants.LOF, + GATKSVVCFConstants.TSS_DUP}, + new String[]{"NOC2L", "KLHL17", "PLEKHN1", "PERM1", "SAMD11", "HES4", "ISG15"} + ); final Map expectedAfter = new HashMap<>(); expectedAfter.put(GATKSVVCFConstants.DUP_PARTIAL, Arrays.asList("SAMD11")); @@ -303,22 +428,74 @@ public void testSortVariantConsequenceDict() { Assert.assertEquals(SVAnnotateEngine.sortVariantConsequenceDict(before), expectedAfter); } + @DataProvider(name = "proteinCodingConsequences") + public Object[][] getProteinCodingConsequences() { + return new Object[][] { + // protein-coding consequence -> not intergenic + { createVariantConsequenceDict( + new String[]{GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.PARTIAL_DISPERSED_DUP}, + new String[]{"HES4", "SAMD11"}), + false + }, + // partial dispersed dup does not count towards protein-coding consequences for this + { createVariantConsequenceDict( + new String[]{GATKSVVCFConstants.PARTIAL_DISPERSED_DUP}, + new String[]{"SAMD11"}), + true + }, + // ignore noncoding consequences + { createVariantConsequenceDict( + new String[]{GATKSVVCFConstants.NONCODING_BREAKPOINT, GATKSVVCFConstants.PROMOTER}, + new String[]{"Enhancer", "RP1"}), + true + }, + { new HashMap<>(), + true + } + }; + } - /** - * Create list of SV segments with SAME SVTYPE - convenience function for testing getSVSegments - * @param svType - SV type for all segments - * @param intervals - list of intervals - * @return - list of SV segments with provided SV type, one for each interval - */ - private List createListOfSVSegments(final GATKSVVCFConstants.StructuralVariantAnnotationType svType, - final SimpleInterval[] intervals) { - final List segments = new ArrayList<>(intervals.length); - for (final SimpleInterval interval : intervals) { - segments.add(new SVAnnotateEngine.SVSegment(svType, interval)); - } - return segments; + @Test(dataProvider = "proteinCodingConsequences") + public void testIsIntergenic( + final Map> variantConsequenceDict, + final boolean expectedIsIntergenic + ){ + final boolean actualIsIntergenic = SVAnnotateEngine.isIntergenic(variantConsequenceDict, + SVAnnotateEngine.PROTEIN_CODING_CONSEQUENCES); + Assert.assertEquals(actualIsIntergenic, expectedIsIntergenic); + } + + @DataProvider(name = "complexSubtypes") + public Object[][] getComplexSubtypes() { + return new Object[][] { + { null, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINV, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.INVdel, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINV, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.INVdup, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINVdel, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdup, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.delINVdup, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.dupINVdel, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL, true }, + { GATKSVVCFConstants.ComplexVariantSubtype.INS_iDEL, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, false }, + { GATKSVVCFConstants.ComplexVariantSubtype.CTX_PQ_QP, false } + }; } + @Test(dataProvider = "complexSubtypes") + public void testIncludesDispersedDuplication( + final GATKSVVCFConstants.ComplexVariantSubtype complexType, + final boolean expectedIncludesDispersedDuplication + ){ + final boolean actualIncludesDispersedDuplication = SVAnnotateEngine.includesDispersedDuplication(complexType, + SVAnnotateEngine.COMPLEX_SUBTYPES_WITH_DISPERSED_DUP); + Assert.assertEquals(actualIncludesDispersedDuplication, expectedIncludesDispersedDuplication); + } + + /** * Assert two lists of SVAnnotate.SVSegment objects are equal in contents * Lists must be same size and contain equal SVSegments in the same order @@ -384,6 +561,7 @@ public Object[][] getSVTypesAndSegmentsTestData() { return new Object[][] { { createVariantContext("chr2", 86263976, 86263977, "chr19", 424309, "N", "", null, null, "CTX_PP/QQ", null), + GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, new SimpleInterval[]{ new SimpleInterval("chr2", 86263976, 86263977), @@ -391,66 +569,77 @@ public Object[][] getSVTypesAndSegmentsTestData() { null }, { createVariantContext("chr2", 86263976, 86263976, null, 424309, "G", "G]chr19:424309]", null, null,"CTX_PP/QQ", null), + GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, new SimpleInterval("chr2", 86263976, 86263976))), null}, { createVariantContext("chr2", 86263977, 86263977, null, 424309, "A", "[chr19:424310[A", null, null, "CTX_PP/QQ", null), + GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, new SimpleInterval("chr2", 86263977, 86263977))), null }, { createVariantContext("chr2", 205522308, 205522384, "chr2", null, "N", "", 76, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.INV, Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV, new SimpleInterval("chr2", 205522308, 205522384))), null }, { createVariantContext("chr19", 424309, 424309, null, 424309, "T", "T]chr2:86263976]", null, null, "CTX_PP/QQ", null), + GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, new SimpleInterval("chr19", 424309, 424309))), null }, { createVariantContext("chr19", 424310, 424310, null, 424309, "C", "[chr2:86263977[C", null, null, "CTX_PP/QQ", null), + GATKSVVCFConstants.ComplexVariantSubtype.CTX_PP_QQ, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, Arrays.asList(new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.CTX, new SimpleInterval("chr19", 424310, 424310))), null }, { createVariantContext("chr22", 10510000, 10694100, "chr22", null, "N", "", 184100, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, new SimpleInterval[]{ new SimpleInterval("chr22", 10510000, 10694100)}), null}, { createVariantContext("chr22", 10510000, 10694100, "chr22", null, "N", "", 184100, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DEL, new SimpleInterval[]{ new SimpleInterval("chr22", 10510000, 10694100)}), null}, { createVariantContext("chr22", 10524000, 10710000, "chr22", null, "N", "", 186000, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval[]{ new SimpleInterval("chr22", 10524000, 10710000)}), null }, { createVariantContext("chr22", 10532563, 10532611, "chr22", null, "N", "", 245, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.INS, new SimpleInterval[]{ new SimpleInterval("chr22", 10532563, 10532564)}), null }, { createVariantContext("chr22", 10572758, 10572788, "chr22", null, "N", "", 57, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.INS, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.INS, new SimpleInterval[]{ new SimpleInterval("chr22", 10572758, 10572759)}), null }, { createVariantContext("chr22", 10717890, 10717890, "chr22", null, "N", "", 5170, "-+", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 10717890, 10717890), @@ -459,12 +648,14 @@ public Object[][] getSVTypesAndSegmentsTestData() { new SimpleInterval[]{ new SimpleInterval("chr22", 10717890, 10723060)}) }, { createVariantContext("chr22", 10774600, 10784500, "chr22", null, "N", "", 9900, null, null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, new SimpleInterval[]{ new SimpleInterval("chr22", 10774600, 10784500)}), null }, { createVariantContext("chr22", 10930458, 10930458, "chr22", 11564561, "N", "", 634103, "--", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 10930458, 10930458), @@ -472,6 +663,7 @@ public Object[][] getSVTypesAndSegmentsTestData() { null }, { createVariantContext("chr22", 17636024, 17636024, "chr22", null, "N", "", 10709, "+-", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 17636024, 17636024), @@ -481,10 +673,9 @@ public Object[][] getSVTypesAndSegmentsTestData() { { createVariantContext("chr22", 18971159, 18971435, "chr22", null, "N", "", 386, null, "dDUP", Arrays.asList("INV_chr22:20267228-20267614","DUP_chr22:20267228-20267614")), + GATKSVVCFConstants.ComplexVariantSubtype.dDUP, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, Arrays.asList( - new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV, - new SimpleInterval("chr22", 20267228, 20267614)), new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval("chr22", 20267228, 20267614)), new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INS, @@ -492,6 +683,7 @@ public Object[][] getSVTypesAndSegmentsTestData() { null }, { createVariantContext("chr22", 22120897, 22120897, "chrX", 126356858, "N", "", -1, "++", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 22120897, 22120897), @@ -499,6 +691,7 @@ public Object[][] getSVTypesAndSegmentsTestData() { null }, { createVariantContext("chr22", 22196261, 22196261, "chr22", null, "N", "", 708725, "+-", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 22196261, 22196261), @@ -506,12 +699,14 @@ public Object[][] getSVTypesAndSegmentsTestData() { null }, { createVariantContext("chr22", 22196261, 22196261, null, null, "A", "A[chr22:22904986[", null, "+-", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 22196261, 22196261) }), null }, { createVariantContext("chr22", 22904986, 22904986, null, null, "T", "]chr22:22196261]T", null, "+-", null, null), + null, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, createListOfSVSegments(GATKSVVCFConstants.StructuralVariantAnnotationType.BND, new SimpleInterval[]{ new SimpleInterval("chr22", 22904986, 22904986) }), @@ -519,12 +714,13 @@ public Object[][] getSVTypesAndSegmentsTestData() { { createVariantContext("chr22", 36533058, 36538234, "chr22", null, "N", "", 5176, null, "dupINV", Arrays.asList("DUP_chr22:36533058-36533299","INV_chr22:36533058-36538234")), + GATKSVVCFConstants.ComplexVariantSubtype.dupINV, GATKSVVCFConstants.StructuralVariantAnnotationType.CPX, Arrays.asList( new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.DUP, new SimpleInterval("chr22", 36533058, 36533299)), new SVAnnotateEngine.SVSegment(GATKSVVCFConstants.StructuralVariantAnnotationType.INV, - new SimpleInterval("chr22", 36533058, 36538234))), + new SimpleInterval("chr22", 36533299, 36538234))), null } }; } @@ -533,6 +729,7 @@ public Object[][] getSVTypesAndSegmentsTestData() { @Test(dataProvider = "typesAndSegments") public void testGetSVTypeAndSegments( final VariantContext variant, + final GATKSVVCFConstants.ComplexVariantSubtype complexType, final GATKSVVCFConstants.StructuralVariantAnnotationType expectedSVType, final List expectedSVSegments, final List expectedSVSegmentsWithBNDOverlap @@ -541,11 +738,11 @@ public void testGetSVTypeAndSegments( Assert.assertEquals(actualSVType, expectedSVType); final List actualSegments = SVAnnotateEngine.getSVSegments(variant, - actualSVType, -1); + actualSVType, -1, complexType); assertSegmentListEqual(actualSegments, expectedSVSegments); final List actualSegmentsWithBNDOverlap = SVAnnotateEngine.getSVSegments(variant, - actualSVType, 15000); + actualSVType, 15000, complexType); assertSegmentListEqual(actualSegmentsWithBNDOverlap, expectedSVSegmentsWithBNDOverlap != null ? expectedSVSegmentsWithBNDOverlap : expectedSVSegments); } @@ -581,6 +778,62 @@ public Object[][] getAnnotateStructuralVariantTestData() { createAttributesMap( Arrays.asList(GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.INTERGENIC), Arrays.asList("EMMA1", true)) }, + // CPX with dDUP CG and promoter annotations + { createVariantContext("chr1", 10, 11, null, null, null, + "", 1500, null, "dDUP", + Collections.singletonList("DUP_chr1:2000-3500")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.COPY_GAIN, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList("EMMA1", "EMMA2", false)) }, + // dupINVdup with CG + { createVariantContext("chr1", 20, 3500, null, null, null, + "", 3480, null, "dupINVdup", + Arrays.asList("DUP_chr1:20-70", "INV_chr1:20-3500", "DUP_chr1:2000-3500")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.COPY_GAIN, + GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.INTERGENIC), + Arrays.asList("EMMA1", "EMMA2", Arrays.asList("DNase", "Enhancer"), false)) }, + // ignore INV for dDUP; ignore dDUP for promoter; CPX noncoding span; CPX intergenic + { createVariantContext("chr1", 1101, 1102, null, null, null, + "", 700, null, "dDUP_iDEL", + Arrays.asList("DUP_chr1:10-60", "INV_chr1:10-60","DEL_chr1:1100-1700")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.NEAREST_TSS, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList("Enhancer", "EMMA1", true)) }, + // Ignore INV and source INS in CPX_INTERVALS for INS_iDEL + { createVariantContext("chr1", 1101, 1102, null, null, null, + "", 700, null, "INS_iDEL", + Arrays.asList("INV_chr1:10-60","DEL_chr1:1100-1700","INS_chr1:10-60")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.NONCODING_SPAN, GATKSVVCFConstants.NEAREST_TSS, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList("Enhancer", "EMMA1", true)) }, + // no noncoding breakpoint for DUP segment of CPX; modify INV interval; multiple genes for a consequence + { createVariantContext("chr1", 450, 3100, null, null, null, + "", 2650, null, "dupINV", + Arrays.asList("DUP_chr1:450-2200", "INV_chr1:450-3100")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, GATKSVVCFConstants.LOF, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList(Arrays.asList("EMMA1", "EMMA2"), "EMMA2", false)) }, + // intergenic with partial dispersed dup + { createVariantContext("chr1", 1100, 1100, null, null, null, + "", 550, null, "dDUP", + Arrays.asList("DUP_chr1:450-1000")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.PARTIAL_DISPERSED_DUP, GATKSVVCFConstants.NEAREST_TSS, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList("EMMA1", "EMMA1", true)) }, + // merge INV + DEL for nearest TSS; noncoding breakpoint for CPX + { createVariantContext("chr1", 1400, 1900, null, null, null, + "", 500, null, "delINV", + Arrays.asList("DEL_chr1:1400-1500", "INV_chr1:1500-1900")), + createAttributesMap( + Arrays.asList(GATKSVVCFConstants.NONCODING_BREAKPOINT, GATKSVVCFConstants.NEAREST_TSS, + GATKSVVCFConstants.INTERGENIC), + Arrays.asList("Enhancer", "EMMA2", true)) }, { createVariantContext("chr1", 50, 450, null, null, null, "", 400, null, null, null), createAttributesMap( @@ -650,6 +903,4 @@ public void testAnnotateStructuralVariant( Assert.assertEquals(actualAttributes, expectedAttributes); } - - -} \ No newline at end of file +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java index b9c30729acc..f647a4afb37 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/sv/SVAnnotateIntegrationTest.java @@ -28,7 +28,8 @@ public class SVAnnotateIntegrationTest extends CommandLineProgramTest { GATKSVVCFConstants.INV_SPAN, GATKSVVCFConstants.PROMOTER, GATKSVVCFConstants.COPY_GAIN, GATKSVVCFConstants.INTERGENIC, GATKSVVCFConstants.NEAREST_TSS, GATKSVVCFConstants.INT_EXON_DUP, GATKSVVCFConstants.PARTIAL_EXON_DUP, GATKSVVCFConstants.MSV_EXON_OVERLAP, GATKSVVCFConstants.UTR, - GATKSVVCFConstants.INTRONIC, GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.BREAKEND_EXON); + GATKSVVCFConstants.INTRONIC, GATKSVVCFConstants.TSS_DUP, GATKSVVCFConstants.BREAKEND_EXON, + GATKSVVCFConstants.PARTIAL_DISPERSED_DUP); private void assertVariantAnnotatedAsExpected(final List vcf, final String variantID, Map expectedAnnotations) { diff --git a/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java index e230f6c5375..4cc892dd931 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/SimpleIntervalUnitTest.java @@ -260,6 +260,116 @@ public void testContains( final SimpleInterval firstInterval, final SimpleInterv "contains() returned incorrect result for intervals " + firstInterval + " and " + secondInterval); } + @DataProvider(name = "subtractIntervalData") + private Object[][] subtractIntervalData() { + return new Object[][] { + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 20, 40), + new SimpleInterval("chr1", 10, 20) }, + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 5, 15), + new SimpleInterval("chr1", 15, 30) }, + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 10, 20), + new SimpleInterval("chr1", 20, 30) }, + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 20, 30), + new SimpleInterval("chr1", 10, 20) } + }; + } + + @Test(dataProvider = "subtractIntervalData") + public void testSubtractInterval( final SimpleInterval firstInterval, + final SimpleInterval secondInterval, + final SimpleInterval expectedResult ) { + Assert.assertEquals(firstInterval.subtract(secondInterval), expectedResult); + } + + @DataProvider(name = "subtractIntervalDataExpectingException") + private Object[][] subtractIntervalDataExpectingException() { + return new Object[][] { + // different contigs + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr2", 20, 40) }, + // non-overlapping intervals on same contig + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 50, 150) }, + // second interval contains first + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 10, 40) } + }; + } + @Test(dataProvider = "subtractIntervalDataExpectingException", expectedExceptions = IllegalArgumentException.class) + public void testSubtractIntervalExpectingException( final SimpleInterval firstInterval, + final SimpleInterval secondInterval) { + firstInterval.subtract(secondInterval); + } + + @DataProvider(name = "mergeWithContiguousData") + private Object[][] mergeWithContiguousData() { + return new Object[][] { + // first is upstream, overlapping + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 20, 40), + new SimpleInterval("chr1", 10, 40) }, + // first is downstream, overlapping + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 5, 15), + new SimpleInterval("chr1", 5, 30) }, + // first contains second + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 15, 20), + new SimpleInterval("chr1", 10, 30) }, + // second contains first + { new SimpleInterval("chr1", 20, 30), + new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 10, 30) }, + // first is upstream, overlapping by 1 + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 30, 50), + new SimpleInterval("chr1", 10, 50) }, + // first is upstream, adjacent + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 31, 50), + new SimpleInterval("chr1", 10, 50) }, + // first is downstream, overlapping by 1 + { new SimpleInterval("chr1", 40, 60), + new SimpleInterval("chr1", 30, 40), + new SimpleInterval("chr1", 30, 60) }, + // first is downstream, adjacent + { new SimpleInterval("chr1", 40, 60), + new SimpleInterval("chr1", 30, 39), + new SimpleInterval("chr1", 30, 60) } + }; + } + + @Test(dataProvider = "mergeWithContiguousData") + public void testMergeWithContiguous( final SimpleInterval firstInterval, + final SimpleInterval secondInterval, + final SimpleInterval expectedResult ) { + Assert.assertEquals(firstInterval.mergeWithContiguous(secondInterval), expectedResult); + } + + @DataProvider(name = "mergeWithContiguousDataExpectingException") + private Object[][] mergeWithContiguousDataExpectingException() { + return new Object[][] { + // different contigs + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr2", 20, 40) }, + // non-contiguous intervals on same contig, first is upstream + { new SimpleInterval("chr1", 10, 30), + new SimpleInterval("chr1", 50, 150) }, + // non-contiguous intervals on same contig, first is downstream + { new SimpleInterval("chr1", 20, 30), + new SimpleInterval("chr1", 5, 15) } + }; + } + @Test(dataProvider = "mergeWithContiguousDataExpectingException", expectedExceptions = GATKException.class) + public void testMergeWithContiguousExpectingException( final SimpleInterval firstInterval, + final SimpleInterval secondInterval) { + firstInterval.mergeWithContiguous(secondInterval); + } + @Test(expectedExceptions = IllegalArgumentException.class) public void testNoNullInConstruction() throws Exception { new SimpleInterval((String)null);