Skip to content

Commit

Permalink
Add SVStratify and GroupedSVCluster tools (#8990)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored Nov 18, 2024
1 parent 18707c6 commit 3bb2b8b
Show file tree
Hide file tree
Showing 42 changed files with 4,368 additions and 543 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ public final class GATKSVVCFConstants {
// VCF standard keys reserved for sv
public static final String SVTYPE = "SVTYPE";
public static final String SVLEN = "SVLEN";
public static final String EVIDENCE = "EVIDENCE";
public static final String IMPRECISE = "IMPRECISE";
public static final String CIPOS = "CIPOS";
public static final String CIEND = "CIEND";
Expand All @@ -31,6 +32,14 @@ public final class GATKSVVCFConstants {
public static final Allele DEL_ALLELE = Allele.create("<DEL>", false);
public static final Allele DUP_ALLELE = Allele.create("<DUP>", false);

// Evidence types
public enum EvidenceTypes {
BAF,
PE,
RD,
SR
}

// GATK-SV specific header lines
// TODO: 10/3/17 the following comment is a goal we are trying to achieve
// applicable to all records all the time
Expand Down Expand Up @@ -136,8 +145,13 @@ public enum ComplexVariantSubtype {
public static final String BND_DELETION_STRANDS = "+-";
public static final String BND_DUPLICATION_STRANDS = "-+";

// SR support
public static final String BOTHSIDES_SUPPORT_ATTRIBUTE = "BOTHSIDES_SUPPORT";
public static final String HIGH_SR_BACKGROUND_ATTRIBUTE = "HIGH_SR_BACKGROUND";

// format block
public static final String COPY_NUMBER_FORMAT = "CN";
public static final String DEPTH_GENOTYPE_COPY_NUMBER_FORMAT = "RD_CN";
public static final String EXPECTED_COPY_NUMBER_FORMAT = "ECN";
public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ";

Expand Down Expand Up @@ -175,6 +189,9 @@ public enum ComplexVariantSubtype {
public static final String TRUTH_ALLELE_NUMBER_INFO = "TRUTH_AN";
public static final String TRUTH_ALLELE_FREQUENCY_INFO = "TRUTH_AF";

// stratification
public static final String STRATUM_INFO_KEY = "STRAT";

// functional annotations
public static final String LOF = "PREDICTED_LOF";
public static final String INT_EXON_DUP = "PREDICTED_INTRAGENIC_EXON_DUP";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.stream.Stream;

import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.COPY_NUMBER_FORMAT;
import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.DEPTH_GENOTYPE_COPY_NUMBER_FORMAT;

public class SVCallRecord implements SVLocatable {

Expand All @@ -31,6 +32,7 @@ public class SVCallRecord implements SVLocatable {
VCFConstants.END_KEY,
GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE,
GATKSVVCFConstants.SVLEN,
GATKSVVCFConstants.EVIDENCE,
GATKSVVCFConstants.CONTIG2_ATTRIBUTE,
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
Expand All @@ -48,6 +50,7 @@ public class SVCallRecord implements SVLocatable {
private final Boolean strandB;
private final GATKSVVCFConstants.StructuralVariantAnnotationType type;
private final Integer length;
private final List<GATKSVVCFConstants.EvidenceTypes> evidence;
private final List<String> algorithms;
private final List<Allele> alleles;
private final Allele refAllele;
Expand All @@ -72,14 +75,15 @@ public SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<GATKSVVCFConstants.EvidenceTypes> evidence,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes,
final Set<String> filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, evidence, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}

Expand All @@ -94,6 +98,7 @@ protected SVCallRecord(final String id,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<GATKSVVCFConstants.EvidenceTypes> evidence,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
Expand All @@ -106,6 +111,7 @@ protected SVCallRecord(final String id,
Utils.nonNull(attributes);
Utils.nonNull(filters);
Utils.nonNull(cpxIntervals);
Utils.nonNull(evidence);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
Expand All @@ -123,6 +129,7 @@ protected SVCallRecord(final String id,
this.genotypes = GenotypesContext.copy(genotypes).immutable();
this.attributes = validateAttributes(attributes);
this.length = inferLength(type, positionA, positionB, length);
this.evidence = evidence;
final Pair<Boolean, Boolean> strands = inferStrands(type, strandA, strandB);
this.strandA = strands.getLeft();
this.strandB = strands.getRight();
Expand Down Expand Up @@ -272,7 +279,8 @@ private boolean isCarrier(final Genotype genotype) {
}

// Otherwise, try to infer status if it's a biallelic CNV with a copy number call
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT, expectedCopyNumber);
final int copyNumber = VariantContextGetters.getAttributeAsInt(genotype, COPY_NUMBER_FORMAT,
VariantContextGetters.getAttributeAsInt(genotype, DEPTH_GENOTYPE_COPY_NUMBER_FORMAT, expectedCopyNumber));
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL) {
return copyNumber < expectedCopyNumber;
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DUP) {
Expand Down Expand Up @@ -370,6 +378,10 @@ public Integer getLength() {
return length;
}

public List<GATKSVVCFConstants.EvidenceTypes> getEvidence() {
return evidence;
}

public List<String> getAlgorithms() {
return algorithms;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static htsjdk.variant.vcf.VCFConstants.MISSING_VALUE_v4;
import static org.broadinstitute.hellbender.tools.sv.SVCallRecord.UNDEFINED_LENGTH;

public final class SVCallRecordUtils {
Expand Down Expand Up @@ -91,6 +92,9 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
if (!record.getEvidence().isEmpty()) {
builder.attribute(GATKSVVCFConstants.EVIDENCE, record.getEvidence());
}
if (!record.getFilters().isEmpty()) {
builder.filters(record.getFilters());
}
Expand Down Expand Up @@ -173,12 +177,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
*/
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getEvidence(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}

Expand Down Expand Up @@ -291,10 +295,10 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
record.getEvidence(), record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}

Expand All @@ -319,8 +323,9 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari

final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = parseComplexIntervals(variant, dictionary);
final List<String> algorithms = getAlgorithms(variant);
final List<GATKSVVCFConstants.EvidenceTypes> evidence = getEvidence(variant);

final String strands;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.DEL
Expand Down Expand Up @@ -375,12 +380,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari

final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
cpxIntervals, length, evidence, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
variant.getFilters(), log10PError);
}

private static List<SVCallRecord.ComplexEventInterval> parseComplexIntervals(final List<String> intervals, final SAMSequenceDictionary dictionary) {
return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
private static List<SVCallRecord.ComplexEventInterval> parseComplexIntervals(final VariantContext variant, final SAMSequenceDictionary dictionary) {
return variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null).stream()
.map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
}

private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {
Expand All @@ -402,6 +408,19 @@ private static Integer getLength(final VariantContext variant, final GATKSVVCFCo
return length;
}

public static List<GATKSVVCFConstants.EvidenceTypes> getEvidence(final VariantContext variant) {
Utils.nonNull(variant);
final List<String> value = variant.getAttributeAsStringList(GATKSVVCFConstants.EVIDENCE, null);
if (value == null) {
return Collections.emptyList();
} else {
return value.stream()
.filter(v -> v != null && !v.equals(MISSING_VALUE_v4))
.map(GATKSVVCFConstants.EvidenceTypes::valueOf)
.collect(Collectors.toList());
}
}

public static List<String> getAlgorithms(final VariantContext variant) {
Utils.nonNull(variant);
Utils.validateArg(variant.hasAttribute(GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE), "Expected " + GATKSVVCFConstants.ALGORITHMS_ATTRIBUTE + " field for variant " + variant.getID());
Expand Down
Loading

0 comments on commit 3bb2b8b

Please sign in to comment.