diff --git a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py
new file mode 100644
index 000000000..8b63e6073
--- /dev/null
+++ b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py
@@ -0,0 +1,1173 @@
+#!/usr/bin/env python
+
+import argparse
+import gzip
+import logging
+import sys
+
+from collections import Counter
+from statistics import median
+from typing import List, Text, Dict, Optional, Set
+
+from pysam import VariantFile
+from intervaltree import IntervalTree
+
+# Parameters
+MIN_SIZE = 1000
+MIN_DIFF = 0.4
+MIN_SIZE_IDEL = 150
+MIN_DDUP_THRESH = 1000000
+
+
+def interval_string(chrom, start, end):
+    return f"{chrom}:{start}-{end}"
+
+
+class VariantInfo:
+
+    def __init__(self, chrom, start, end):
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+        self.genotype_copy_numbers = dict()
+
+    def __str__(self):
+        return str(vars(self))
+
+    def __repr__(self):
+        return str(vars(self))
+
+
+class VariantIntervalRecord:
+
+    def __init__(self, chrom, start, end, merged_vid, carrier_del, carrier_wt, carrier_dup, control_del, control_wt,
+                 control_dup, diff_case_control_del_frac, diff_case_control_dup_frac):
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+        self.merged_vid = merged_vid
+
+        tokens = merged_vid.split(';')
+        if len(tokens) < 4:
+            raise ValueError(f"Encountered record with fewer than 4 semicolon-delimited tokens: {merged_vid}")
+        self.vid = tokens[0]
+        self.cpx_type = tokens[1]
+        self.sv_type = tokens[2]
+        self.intervals_str_list = tokens[3].split(',')
+
+        self.carrier_del = carrier_del
+        self.carrier_wt = carrier_wt
+        self.carrier_dup = carrier_dup
+        self.control_del = control_del
+        self.control_wt = control_wt
+        self.control_dup = control_dup
+        self.diff_case_control_del_frac = diff_case_control_del_frac
+        self.diff_case_control_dup_frac = diff_case_control_dup_frac
+
+    def __str__(self):
+        return str(vars(self))
+
+    def __repr__(self):
+        return str(vars(self))
+
+    def size(self):
+        return self.end - self.start
+
+
+class CleanedVariantIntervalRecord:
+
+    def __init__(self, chrom, start, end, interval_type, vid, variant_sv_type, variant_cpx_type,
+                 carrier_del, carrier_wt, carrier_dup, control_del, control_wt,
+                 control_dup, diff_case_control_del_frac, diff_case_control_dup_frac, cnv_assessment):
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+        self.interval_type = interval_type
+        self.vid = vid
+
+        self.variant_sv_type = variant_sv_type
+        self.variant_cpx_type = variant_cpx_type
+
+        self.carrier_del = carrier_del
+        self.carrier_wt = carrier_wt
+        self.carrier_dup = carrier_dup
+        self.control_del = control_del
+        self.control_wt = control_wt
+        self.control_dup = control_dup
+        self.diff_case_control_del_frac = diff_case_control_del_frac
+        self.diff_case_control_dup_frac = diff_case_control_dup_frac
+        self.cnv_assessment = cnv_assessment
+
+    def __str__(self):
+        return str(vars(self))
+
+    def __repr__(self):
+        return str(vars(self))
+
+    def size(self):
+        return self.end - self.start
+
+
+class VcfRecord:
+
+    def __init__(self, variant_record, end_dict):
+        self.vid = variant_record.id
+
+        # Sink is just the normal coordinates but 0-based / open
+        # TODO despite implementing the same strategy as svtk vcf2bed, small differences in some CPX intervals
+        #  may occur because pysam forces stop to be max(pos, end)
+        self.sink_chrom = variant_record.chrom
+        start, end = sorted([variant_record.pos, end_dict.get(self.vid, variant_record.stop)])
+        self.sink_start = max(0, start - 1)
+        if variant_record.info.get("UNRESOLVED_TYPE", "").startswith("INVERSION_SINGLE_ENDER"):
+            self.sink_end = variant_record.info.get("END2", variant_record.stop)
+        else:
+            self.sink_end = end
+
+        # Source interval
+        if "SOURCE" in variant_record.info:
+            source_tokens = variant_record.info["SOURCE"].replace("_", "\t").replace(":", "\t") \
+                .replace("-", "\t").split("\t")
+            if len(source_tokens) != 4:
+                raise ValueError(f"Record {variant_record.id} has SOURCE field with unexpected format: "
+                                 f"{variant_record.info['SOURCE']}")
+            self.source_chrom = source_tokens[1]
+            self.source_start = int(source_tokens[2])
+            self.source_end = int(source_tokens[3])
+        else:
+            self.source_chrom = None
+            self.source_start = None
+            self.source_end = None
+
+    def __str__(self):
+        return str(vars(self))
+
+    def __repr__(self):
+        return str(vars(self))
+
+    def sink_size(self):
+        return None if self.sink_chrom is None else self.sink_end - self.sink_start
+
+    def source_size(self):
+        return None if self.source_chrom is None else self.source_end - self.source_start
+
+    def source_string(self):
+        return interval_string(self.source_chrom, self.source_start, self.source_end)
+
+    def sink_string(self):
+        return interval_string(self.sink_chrom, self.sink_start, self.sink_end)
+
+    def source_len(self):
+        return self.source_end - self.source_start
+
+    def sink_len(self):
+        return self.sink_end - self.sink_start
+
+
+class VariantAssessment:
+    def __init__(self, vid, modification, reason, new_sv_type, new_cpx_type, new_cpx_intervals, new_svlen,
+                 new_source, new_start, new_end):
+        self.vid = vid
+        self.modification = modification
+        self.reason = reason
+        self.new_sv_type = new_sv_type
+        self.new_cpx_type = new_cpx_type
+        self.new_cpx_intervals = new_cpx_intervals
+        self.new_svlen = new_svlen
+        self.new_source = new_source
+        self.new_start = new_start
+        self.new_end = new_end
+
+    def __str__(self):
+        return str(vars(self))
+
+    def __repr__(self):
+        return str(vars(self))
+
+
+def has_reciprocal_overlap(chrom1, start1, end1, chrom2, start2, end2, min_reciprocal_overlap):
+    overlap = get_overlap(chrom1, start1, end1, chrom2, start2, end2)
+    size_1 = end1 - start1
+    size_2 = end2 - start2
+    return overlap / max(size_1, size_2, 1) >= min_reciprocal_overlap
+
+
+def get_overlap(chrom1, start1, end1, chrom2, start2, end2):
+    if chrom1 != chrom2:
+        return 0
+    if not (start1 <= end2 and start2 <= end1):
+        return 0
+    return min(end1, end2) - max(start1, start2)
+
+
+def get_reciprocal_overlaps(chrom, start, end, tree_dict, min_reciprocal_overlap):
+    if chrom not in tree_dict:
+        return list()
+    return [interval.data for interval in tree_dict[chrom].overlap(start, end)
+            if has_reciprocal_overlap(chrom, start, end, chrom,
+                                      interval.data.start, interval.data.end, min_reciprocal_overlap)]
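+
+
+# Editor's illustration (hypothetical coordinates, not called by the pipeline):
+# two 1 kb intervals offset by 40 bp share 960 bp, so their reciprocal overlap
+# is 960 / 1000 = 0.96 and they pass the 0.95 threshold used throughout below.
+def _example_reciprocal_overlap():
+    assert get_overlap("chr1", 1000, 2000, "chr1", 1040, 2040) == 960
+    assert has_reciprocal_overlap("chr1", 1000, 2000, "chr1", 1040, 2040, 0.95)
+    assert not has_reciprocal_overlap("chr1", 1000, 2000, "chr1", 1500, 2500, 0.95)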
+
+
+def genotype_counts_per_variant(intervals_path: Text,
+                                genotype_dict: Dict,
+                                chr_x: Text,
+                                chr_y: Text,
+                                all_samples: Set[Text],
+                                male_samples: Set[Text],
+                                female_samples: Set[Text]):
+    def _count_del_dup_wt(variants_list, samples, control_cn):
+        num_del = 0
+        num_wt = 0
+        num_dup = 0
+        for var in variants_list:
+            for s in samples:
+                copy_number = var.genotype_copy_numbers.get(s, None)
+                if copy_number is not None:
+                    if copy_number < control_cn:
+                        num_del += 1
+                    elif copy_number == control_cn:
+                        num_wt += 1
+                    else:
+                        num_dup += 1
+        return num_del, num_wt, num_dup
+
+    intervals_list_dict = dict()
+    record_list = list()
+    with gzip.open(intervals_path, 'rt') as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            tokens = line.strip().split('\t')
+            if len(tokens) < 5:
+                raise ValueError(f"Fewer than 5 columns in intervals record: {line.strip()}")
+            chrom = tokens[0]
+            start = int(tokens[1])
+            end = int(tokens[2])
+            vid = tokens[3]
+            carrier_samples = set(tokens[4].split(','))
+
+            # Add to intervals list for referencing later
+            if vid not in intervals_list_dict:
+                intervals_list_dict[vid] = list()
+            intervals_list_dict[vid].append(f"{chrom}:{start}-{end}")
+
+            # Determine carrier and control samples for this variant
+            if chrom == chr_x:
+                female_carriers = carrier_samples.intersection(female_samples)
+                if len(female_carriers) > 0:
+                    # Only use females if there are female carriers
+                    carrier_samples = female_carriers
+                    control_samples = female_samples - carrier_samples
+                    default_median_cn = 2
+                else:
+                    # Otherwise use only male samples
+                    carrier_samples = carrier_samples.intersection(male_samples)
+                    control_samples = male_samples - carrier_samples
+                    default_median_cn = 1
+            elif chrom == chr_y:
+                # Only use male samples
+                carrier_samples = carrier_samples.intersection(male_samples)
+                control_samples = male_samples - carrier_samples
+                default_median_cn = 1
+            else:
+                # Autosomal
+                control_samples = all_samples - carrier_samples
+                default_median_cn = 2
+
+            # Get valid matching variant genotype records
+            variant_info_list = genotype_dict[vid]
+            if end - start < 1000000:
+                matching_variants = [var for var in variant_info_list
+                                     if has_reciprocal_overlap(chrom, start, end, var.chrom, var.start, var.end, 0.95)]
+            else:
+                matching_variants = [var for var in variant_info_list
+                                     if get_overlap(chrom, start, end, var.chrom, var.start, var.end) ==
+                                     (var.end - var.start)]
+
+            # Expected copy number to determine whether each genotype copy number is del/dup/wt
+            control_cn = [var.genotype_copy_numbers[s] for s in control_samples for var in matching_variants
+                          if var.genotype_copy_numbers.get(s, None) is not None]
+            if len(control_cn) == 0:
+                # If no valid controls, assume default
+                median_control_cn = default_median_cn
+            else:
+                median_control_cn = median(control_cn)
+
+            # Count carrier and non-carrier genotypes as del/dup/wt
+            carrier_del, carrier_wt, carrier_dup = _count_del_dup_wt(variants_list=matching_variants,
+                                                                     samples=carrier_samples,
+                                                                     control_cn=median_control_cn)
+            control_del, control_wt, control_dup = _count_del_dup_wt(variants_list=matching_variants,
+                                                                     samples=control_samples,
+                                                                     control_cn=median_control_cn)
+
+            # Compute differences between carrier and control fractions for del/dup
+            n_carrier = carrier_del + carrier_wt + carrier_dup
+            if n_carrier == 0:
+                # Edge case with no valid carriers
+                diff_case_control_del_frac = 0
+                diff_case_control_dup_frac = 0
+            else:
+                n_control = max(control_del + control_wt + control_dup, 1)
+                diff_case_control_del_frac = (carrier_del / n_carrier) - (control_del / n_control)
+                diff_case_control_dup_frac = (carrier_dup / n_carrier) - (control_dup / n_control)
+
+            record_list.append(VariantIntervalRecord(chrom, start, end, vid,
+                                                     carrier_del, carrier_wt, carrier_dup,
+                                                     control_del, control_wt, control_dup,
+                                                     diff_case_control_del_frac, diff_case_control_dup_frac))
+    return intervals_list_dict, record_list
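+
+
+# Editor's note, as a worked example of the fractions above (hypothetical
+# counts): with 10 carrier genotypes of which 8 fall below the control median
+# copy number, and 100 control genotypes of which 5 do,
+# diff_case_control_del_frac = 8/10 - 5/100 = 0.75. That exceeds MIN_DIFF
+# (0.4), so a matching interval is assessed as "DEL" in clean_up_intervals()
+# below, provided its dup fraction difference stays at or below MIN_DIFF.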
+
+
+def clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict):
+    visited_vids = set()  # Only visit each unique extended vid (w/ interval info)
+    cleaned_genotype_counts = dict()
+    for query_record in genotype_counts:
+        if query_record.merged_vid in visited_vids:
+            continue
+        else:
+            visited_vids.add(query_record.merged_vid)
+        intervals = query_record.intervals_str_list
+        if len(intervals) == 0 or (len(intervals) == 1 and intervals[0] == "NA"):
+            # If not available, pull from the intervals list with matching merged vid
+            if query_record.merged_vid not in intervals_list_dict:
+                sys.stderr.write(f"Warning: Could not reference vid {query_record.merged_vid} in intervals\n")
+            else:
+                intervals = ["UNK_" + s for s in intervals_list_dict[query_record.merged_vid]]
+        for interval_str in intervals:
+            tokens = interval_str.replace(":", "\t").replace("_", "\t").replace("-", "\t").split("\t")
+            interval_type = tokens[0]
+            interval_chrom = tokens[1]
+            interval_start = int(tokens[2])
+            interval_end = int(tokens[3])
+            if interval_chrom in genotype_counts_tree_dict:
+                matches = get_reciprocal_overlaps(interval_chrom, interval_start, interval_end,
+                                                  genotype_counts_tree_dict, 0.95)
+                for matching_record in matches:
+                    # Assign CNV type depending on frac diff support
+                    if matching_record.diff_case_control_del_frac > MIN_DIFF:
+                        if matching_record.diff_case_control_dup_frac > MIN_DIFF:
+                            cnv_assessment = "DELDUP"
+                        else:
+                            cnv_assessment = "DEL"
+                    elif matching_record.diff_case_control_dup_frac > MIN_DIFF:
+                        cnv_assessment = "DUP"
+                    elif matching_record.end - matching_record.start < MIN_SIZE:
+                        cnv_assessment = "TOO_SMALL"
+                    else:
+                        cnv_assessment = "WT"
+                    record = CleanedVariantIntervalRecord(interval_chrom, interval_start, interval_end,
+                                                          interval_type, matching_record.vid, matching_record.sv_type,
+                                                          matching_record.cpx_type, matching_record.carrier_del,
+                                                          matching_record.carrier_wt, matching_record.carrier_dup,
+                                                          matching_record.control_del, matching_record.control_wt,
+                                                          matching_record.control_dup,
+                                                          matching_record.diff_case_control_del_frac,
+                                                          matching_record.diff_case_control_dup_frac, cnv_assessment)
+                    if record.vid not in cleaned_genotype_counts:
+                        cleaned_genotype_counts[record.vid] = [record]
+                    else:
+                        cleaned_genotype_counts[record.vid].append(record)
+    return cleaned_genotype_counts
+
+
+def parse_ends(vcf_path: Text) -> Dict[Text, int]:
+    """
+    Since pysam automatically changes invalid END fields (i.e. when less than the start position), they must
+    be parsed manually.
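+
+    For example (hypothetical values): for a record with POS=2000 whose INFO
+    field carries END=1500, pysam reports stop = max(pos, end) = 2000, while
+    this parser preserves END=1500.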
+
+    Parameters
+    ----------
+    vcf_path: Text
+        input vcf path
+
+    Returns
+    -------
+    end_dict: Dict[Text, int]
+        map from variant ID to END position
+    """
+    end_dict = dict()
+    with gzip.open(vcf_path, 'rt') as f:
+        for line in f:
+            if line.startswith('#'):
+                continue
+            columns = line.split('\t', 8)
+            vid = columns[2]
+            info = columns[7]
+            info_tokens = info.split(';')
+            end_field_list = [x for x in info_tokens if x.startswith("END=")]
+            if len(end_field_list) > 0:
+                end = int(end_field_list[0].replace("END=", ""))
+            else:
+                # Special case where END and POS happen to be equal
+                end = int(columns[1])
+            end_dict[vid] = end
+    return end_dict
+
+
+def read_vcf(vcf_path, cleaned_genotype_counts_vids):
+    end_dict = parse_ends(vcf_path)
+    with VariantFile(vcf_path) as f:
+        return {r.id: VcfRecord(r, end_dict) for r in f if r.id in cleaned_genotype_counts_vids}
+
+
+def final_assessment(cleaned_genotype_counts, variants_to_reclassify):
+
+    def _subtract_interval(start1, end1, start2, end2):
+        # Spanning case results in null (shouldn't ever do this in this context)
+        if start2 <= start1 and end2 >= end1:
+            return None, None
+        # Non-overlapping so just return original interval
+        if end2 < start1 or start2 > end1:
+            return start1, end1
+        # Otherwise clip the overlapping end of interval 1
+        if start2 <= start1:
+            return end2, end1
+        else:
+            return start1, start2
+
+    def _evaluate_cpx(r, records_list, new_sv_type, new_cpx_type):
+        cnv_records_list = [m for m in records_list if m.interval_type == "DEL" or m.interval_type == "DUP"]
+        num_cnv_intervals = 0
+        num_failed_cnv_intervals = 0
+        num_validated_cnv_intervals = 0
+        # Check whether each CNV interval larger than min size failed to validate
+        # or if at least one CNV interval validated
+        # or not enough information to make a call, so keep it
+        for m in cnv_records_list:
+            num_cnv_intervals += 1
+            if m.cnv_assessment == "WT":
+                num_failed_cnv_intervals += 1
+            elif m.interval_type == m.cnv_assessment or m.cnv_assessment == "DELDUP":
+                num_validated_cnv_intervals += 1
+        if num_cnv_intervals == 0:
+            mod = "KEEP"
+            reason = "NO_PREDICTED_CNV_INTERVALS"
+        elif num_failed_cnv_intervals > 0:
+            mod = "UNRESOLVED"
+            reason = "AT_LEAST_ONE_CNV_INTERVAL_FAILED"
+        elif num_validated_cnv_intervals > 0:
+            mod = "KEEP"
+            reason = "AT_LEAST_ONE_CNV_VALIDATED"
+        else:
+            mod = "KEEP"
+            reason = "NO_CNVS_LARGER_THAN_MIN_SIZE"
+        return VariantAssessment(
+            vid=r.vid,
+            modification=mod,
+            reason=reason,
+            new_sv_type=new_sv_type,
+            new_cpx_type=new_cpx_type,
+            new_cpx_intervals=None,
+            new_svlen=None,
+            new_source=None,
+            new_start=None,
+            new_end=None
+        )
+
+    # Solve translocational insertions
+    # A2B: A interval inserted into B
+    # B2A: B interval inserted into A
+    # CTX_INS_A2B: A interval inserted into B, interchromosomal, can be inverted
+    # CTX_INS_B2A: B interval inserted into A, interchromosomal, can be inverted
+    # OTHERWISE: Just test for INS-iDEL
+    # Note: CTX_INS_B2A variants are not resolved here, because they are indexed
+    #  onto a different chromosome, and will be dealt with in that shard
+    def _evaluate_ctx_ins(r, records_list, default_sv_type, default_cpx_type):
+        if r.source_chrom is None:
+            return VariantAssessment(
+                vid=r.vid,
+                modification="SKIP",
+                reason="NO_SOURCE_INTERVAL_IN_CURRENT_SHARD",
+                new_sv_type=default_sv_type,
+                new_cpx_type=default_cpx_type,
+                new_cpx_intervals=None,
+                new_svlen=None,
+                new_source=None,
+                new_start=None,
+                new_end=None
+            )
+        source_overlaps = [m for m in records_list
+                           if has_reciprocal_overlap(r.source_chrom, r.source_start, r.source_end,
+                                                     m.chrom, m.start, m.end,
0.95)] + source_is_dup = any(m.cnv_assessment == "DUP" for m in source_overlaps) + # TODO: could overlap against r.sink_* + sinks = [m for m in records_list + if not has_reciprocal_overlap(r.source_chrom, r.source_start, r.source_end, + m.chrom, m.start, m.end, 0.95)] + if len(sinks) > 0: + # TODO Unstable ordering + sink_chrom = sinks[0].chrom + sink_start = sinks[0].start + sink_end = sinks[0].end + sink_size = sinks[0].size() + if sink_size >= MIN_SIZE_IDEL: + sink_is_del = any([s.cnv_assessment == "DEL" for s in sinks]) + else: + sink_is_del = False + else: + sink_chrom = None + sink_start = None + sink_end = None + sink_size = None + sink_is_del = False + + # Determine if the variant needs to be modified + if source_is_dup and sink_is_del: + # If source is duplicated and sink is deleted, reclassify as CPX, dDUP_iDEL + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DISPERSED_DUPLICATION_W_INSERT_SITE_DEL", + new_sv_type="CPX", + new_cpx_type="dDUP_iDEL", + new_cpx_intervals=f"DUP_{r.source_string()},DEL_{interval_string(sink_chrom, sink_start, sink_end)}", + new_svlen=r.source_end - r.source_start + sink_end - sink_start, + new_source=f"DUP_{r.source_string()}", + new_start=None, + new_end=None + ) + elif source_is_dup and not sink_is_del: + # If source is duplicated and sink is not deleted, reclassify as CPX, dDUP + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DISPERSED_DUPLICATION", + new_sv_type="CPX", + new_cpx_type="dDUP", + new_cpx_intervals=f"DUP_{r.source_string()}", + new_svlen=None, + new_source=f"DUP_{r.source_string()}", + new_start=None, + new_end=None + ) + elif (not source_is_dup) and sink_is_del: + # If source is not duplicated but sink is deleted, reclassify as CPX, INS_iDEL + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INSERT_SITE_DEL", + new_sv_type="CPX", + new_cpx_type="INS_iDEL", + new_cpx_intervals=f"DEL_{interval_string(sink_chrom, sink_start, sink_end)}", + new_svlen=r.source_end - r.source_start + sink_end - sink_start, + new_source=f"INS_{r.source_string()}", + new_start=None, + new_end=None + ) + else: + # Otherwise, leave as canonical insertion + return VariantAssessment( + vid=r.vid, + modification="KEEP", + reason="NON_DUPLICATED_INSERTION", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + def _is_ctx_ins(t): + return t == "INS_A2B" \ + or t == "INS_B2A" \ + or (t.startswith("CTX_") and t.endswith("INS_A2B")) \ + or (t.startswith("CTX_") and t.endswith("INS_B2A")) + + # Solve inverted dispersed duplications vs dupINV / dupINVdel or INVdup / delINVdup + # DUP5/INS3 or dupINV / dupINVdel + def _evaluate_dup5_ins3(r, records_list, default_sv_type, default_cpx_type): + # Get duplication/insertion interval + dup_chrom = r.source_chrom + dup_start = r.source_start + dup_end = r.source_end + dup_size = dup_end - dup_start + # Test dup interval for evidence of duplication + # TODO unsure if the check should be == "DUP" + + dup_confirmed = any(m.cnv_assessment != "WT" and + has_reciprocal_overlap(dup_chrom, dup_start, dup_end, m.chrom, m.start, m.end, 0.95) + for m in records_list) + # As long as dup doesn't explicitly fail depth genotyping, proceed + if dup_confirmed: + # If sink length >= MINSIZEiDEL, test sink interval for evidence of deletion + sink_is_del = r.sink_size() >= MIN_SIZE_IDEL and any( + m.cnv_assessment == "DEL" and 
has_reciprocal_overlap(r.sink_chrom, r.sink_start, r.sink_end, m.chrom, + m.start, m.end, 0.95) for m in records_list) + # Get inversion interval. Corresponds to min dup/ins coord and max sink coord + inv_chrom = r.sink_chrom + inv_start = dup_start + inv_end = r.sink_end + inv_size = inv_end - inv_start + # If inversion length < MINdDUPTHRESH, classify as dupINV or dupINVdel + if inv_size < MIN_DDUP_THRESH: + # If sink is deleted, classify as dupINVdel + if sink_is_del: + # Revise inv interval (subtracting del interval) + inv_start, inv_end = _subtract_interval(inv_start, inv_end, r.sink_start, r.sink_end) + if inv_start is None: + raise ValueError(f"Sink interval {r.sink_start}-{r.sink_end} spans inversion interval " + f"{inv_start}-{inv_end} for record {r.vid}") + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DUP_FLANKED_INVERSION_WITH_DEL", + new_sv_type="CPX", + new_cpx_type="dupINVdel", + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(inv_chrom, inv_start, inv_end)}," + f"DEL_{r.sink_string()}", + new_svlen=inv_size, + new_source=None, + new_start=min(dup_start, dup_end, inv_start, inv_end, r.sink_start, r.sink_end), + new_end=max(dup_start, dup_end, inv_start, inv_end, r.sink_start, r.sink_end) + ) + else: + # If sink is not clearly deleted (or too small), classify as dupINV (no del) + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DUP_FLANKED_INVERSION", + new_sv_type="CPX", + new_cpx_type="dupINV", + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(inv_chrom, inv_start, inv_end)}", + new_svlen=inv_size, + new_source=None, + new_start=min(dup_start, dup_end, inv_start, inv_end), + new_end=max(dup_start, dup_end, inv_start, inv_end) + ) + elif sink_is_del: + # If sink is deleted, classify as dDUP_iDEL + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INVERTED_DISPERSED_DUPLICATION_WITH_DELETION", + new_sv_type="CPX", + new_cpx_type="dDUP_iDEL", + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(dup_chrom, dup_start, dup_end)}," + f"DEL_{r.sink_string()}", + new_svlen=dup_size + r.sink_size(), + new_source=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_start=None, + new_end=None + ) + else: + # If sink is not clearly deleted (or too small), classify as dDUP (no iDEL) + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INVERTED_DISPERSED_DUPLICATION", + new_sv_type="CPX", + new_cpx_type="dDUP", + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(dup_chrom, dup_start, dup_end)}", + new_svlen=dup_size, + new_source=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_start=None, + new_end=None + ) + else: + # If dup explicitly fails depth genotyping, mark as unresolved + return VariantAssessment( + vid=r.vid, + modification="UNRESOLVED", + reason="PREDICTED_DUP_INTERVAL_FAILED_GT", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + # DUP3/INS5 or INVdup / delINVdup + def _evaluate_dup3_ins5(r, records_list, default_sv_type, default_cpx_type): + # Get duplication/insertion interval + dup_chrom = r.source_chrom + dup_start = r.source_start + dup_end = r.source_end + dup_size = dup_end - dup_start + # Test dup interval for 
evidence of duplication + # TODO should this check should be == "DUP" ? + dup_confirmed = any( + m.cnv_assessment != "WT" and has_reciprocal_overlap(dup_chrom, dup_start, dup_end, m.chrom, m.start, m.end, + 0.95) for m in records_list) + # As long as dup doesn't explicitly fail depth genotyping, proceed + if dup_confirmed: + # If sink length >= MINSIZEiDEL, test sink interval for evidence of deletion + sink_is_del = r.sink_size() >= MIN_SIZE_IDEL and any( + m.cnv_assessment == "DEL" and has_reciprocal_overlap(r.sink_chrom, r.sink_start, r.sink_end, m.chrom, + m.start, m.end, 0.95) for m in records_list) + # Get inversion interval. Corresponds to min sink coord and max dup/inv coord + # Note this is different from the DUP5/INS3 case + inv_chrom = r.sink_chrom + inv_start = r.sink_start + inv_end = dup_end + inv_size = inv_end - inv_start + # If inversion length < MINdDUPTHRESH, classify as dupINV or dupINVdel + if inv_size < MIN_DDUP_THRESH: + # If sink is deleted, classify as delINVdup + if sink_is_del: + # Revise inv interval (subtracting del interval) + inv_start, inv_end = _subtract_interval(inv_start, inv_end, r.sink_start, r.sink_end) + if inv_start is None: + raise ValueError(f"Sink interval {r.sink_start}-{r.sink_end} spans inversion interval " + f"{inv_start}-{inv_end} for record {r.vid}") + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DUP_FLANKED_INVERSION_WITH_DEL", + new_sv_type="CPX", + new_cpx_type="delINVdup", + new_cpx_intervals=f"DEL_{r.sink_string()}," + f"INV_{interval_string(inv_chrom, inv_start, inv_end)}," + f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_svlen=inv_size, + new_source=None, + new_start=min(dup_start, dup_end, inv_start, inv_end, r.sink_start, r.sink_end), + new_end=max(dup_start, dup_end, inv_start, inv_end, r.sink_start, r.sink_end) + ) + else: + # If sink is not clearly deleted (or too small), classify as INVdup (no del) + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="DUP_FLANKED_INVERSION", + new_sv_type="CPX", + new_cpx_type="INVdup", + new_cpx_intervals=f"INV_{interval_string(inv_chrom, inv_start, inv_end)}," + f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_svlen=inv_size, + new_source=None, + new_start=min(dup_start, dup_end, inv_start, inv_end), + new_end=max(dup_start, dup_end, inv_start, inv_end) + ) + elif sink_is_del: + # If sink is deleted, classify as dDUP_iDEL + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INVERTED_DISPERSED_DUPLICATION_WITH_DELETION", + new_sv_type="CPX", + new_cpx_type="dDUP_iDEL", + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(dup_chrom, dup_start, dup_end)}," + f"DEL_{r.sink_string()}", + new_svlen=dup_size + r.sink_size(), + new_source=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_start=None, + new_end=None + ) + else: + # If sink is not clearly deleted (or too small), classify as dDUP (no iDEL) + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INVERTED_DISPERSED_DUPLICATION", + new_sv_type="CPX", + new_cpx_type="dDUP", + new_cpx_intervals=f"INV_{interval_string(dup_chrom, dup_start, dup_end)}," + f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + + new_svlen=dup_size, + new_source=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}", + new_start=None, + new_end=None + ) + else: + # If dup explicitly fails depth genotyping, mark as unresolved + return VariantAssessment( + 
vid=r.vid, + modification="UNRESOLVED", + reason="PREDICTED_DUP_INTERVAL_FAILED_GT", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + # Default to test for INS-iDEL + def _evaluate_ins_idel(r, records_list, default_sv_type, default_cpx_type): + if r.sink_chrom is not None: + # TODO shell script uses bedtools intersect with -v flag (=>"not") that probably isn't supposed to be used + sinks = [m for m in records_list + if not has_reciprocal_overlap(r.sink_chrom, r.sink_start, r.sink_end, + m.chrom, m.start, m.end, 0.95)] + if len(sinks) > 0: + # TODO unstable ordering + sink_record = sinks[0] + # TODO Both sink_is_del and sink_is_not_del could both be true. The logic here needs to be cleaned up. + if sink_record.size() >= MIN_SIZE_IDEL: + # If insertion site size >= MINSIZEiDEL, check if insertion site is deleted + sink_is_del = any(m.cnv_assessment == "DEL" for m in sinks) + sink_is_not_del = any(m.cnv_assessment == "WT" for m in sinks) + else: + sink_is_del = False + sink_is_not_del = False + # TODO original script checks for source_is_dup but the variable is never assigned. We will assume False + source_is_dup = False + if (not source_is_dup) and sink_is_del: + # If sink is deleted, reclassify as CPX, INS_iDEL + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="INSERT_SITE_DEL", + new_sv_type="CPX", + new_cpx_type="INS_iDEL", + new_cpx_intervals=f"DEL_{r.sink_string()}", + new_svlen=r.sink_size() + r.source_size(), + new_source=f"INS_{r.source_string()}", + new_start=None, + new_end=None + ) + elif sink_is_not_del and sink_record.size() >= MIN_SIZE: + # If sink is explicitly *not* deleted and is >= MINSIZE, reclassify as UNRESOLVED + return VariantAssessment( + vid=r.vid, + modification="UNRESOLVED", + reason="PREDICTED_LARGE_SINK_DEL_INTERVAL_FAILED_GT", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + else: + # Otherwise, leave as canonical insertion + return VariantAssessment( + vid=r.vid, + modification="KEEP", + reason="CANONICAL_INS_NO_SINK_DELETION", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + else: + # No sinks found + return VariantAssessment( + vid=r.vid, + modification="SKIP", + reason="NO_SINK_INTERVAL_IN_CURRENT_SHARD", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + # Salvage inverted duplications from inversion single enders + def _evaluate_bnd(r, records_list, default_sv_type, default_cpx_type): + if not default_cpx_type.startswith("INVERSION_SINGLE_ENDER"): + # Only consider inversion single enders + # TODO original script does not set MOD/REASON for this case + return VariantAssessment( + vid=r.vid, + modification="", + reason="", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + # Get span of inversion + dup_chrom = r.sink_chrom + dup_start = r.sink_start + dup_end = r.sink_end + dup_size = dup_end - dup_start + # Test dup interval for explicit confirmation of duplication + dup_confirmed = any( + m.cnv_assessment == "DUP" and has_reciprocal_overlap(dup_chrom, 
dup_start, dup_end, + m.chrom, m.start, m.end, 0.95) + for m in records_list + ) + if dup_confirmed: + # If dup confirms, resolve as palindromic inverted duplication + # TODO Investigate why these are never being found + new_cpx_type = "piDUP_FR" if default_cpx_type == "INVERSION_SINGLE_ENDER_" else "piDUP_RF" + return VariantAssessment( + vid=r.vid, + modification="RECLASSIFY", + reason="PALINDROMIC_INVERTED_DUPLICATION", + new_sv_type="CPX", + new_cpx_type=new_cpx_type, + new_cpx_intervals=f"DUP_{interval_string(dup_chrom, dup_start, dup_end)}," + f"INV_{interval_string(dup_chrom, dup_start, dup_end)}", + new_svlen=dup_size, + new_source=None, + new_start=dup_start, + new_end=dup_end + ) + else: + # Otherwise, leave as unresolved + return VariantAssessment( + vid=r.vid, + modification="KEEP", + reason="DID_NOT_RESOLVE_AS_piDUP", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + # Default case for other variant types + def _evaluate_irrelevant(r, default_sv_type, default_cpx_type): + return VariantAssessment( + vid=r.vid, + modification="KEEP", + reason="IRRELEVANT_SV_TYPE", + new_sv_type=default_sv_type, + new_cpx_type=default_cpx_type, + new_cpx_intervals=None, + new_svlen=None, + new_source=None, + new_start=None, + new_end=None + ) + + # SV and CPX class each is most common one + for vid, records in cleaned_genotype_counts.items(): + if vid not in variants_to_reclassify: + logging.warning(f"Variant ID {vid} was not found in vcf, ignoring...") + continue + vcf_record = variants_to_reclassify[vid] + sv_type = Counter([r.variant_sv_type for r in records]).most_common(1)[0][0] + cpx_type = Counter([r.variant_cpx_type for r in records]).most_common(1)[0][0] + if sv_type == "CPX": + result = _evaluate_cpx(vcf_record, records, sv_type, cpx_type) + elif sv_type == "INS": + if _is_ctx_ins(cpx_type): + result = _evaluate_ctx_ins(vcf_record, records, sv_type, cpx_type) + elif cpx_type == "DUP5/INS3": + result = _evaluate_dup5_ins3(vcf_record, records, sv_type, cpx_type) + elif cpx_type == "DUP3/INS5": + result = _evaluate_dup3_ins5(vcf_record, records, sv_type, cpx_type) + else: + result = _evaluate_ins_idel(vcf_record, records, sv_type, cpx_type) + elif sv_type == "BND": + result = _evaluate_bnd(vcf_record, records, sv_type, cpx_type) + else: + result = _evaluate_irrelevant(vcf_record, sv_type, cpx_type) + yield result + + +def write_vcf(input_vcf_path, output_path, final_assessment_list): + def _modify_record(record, assessment): + mod = assessment.modification + if mod == "SKIP" or mod == "KEEP": + return + elif mod == "UNRESOLVED": + # Note the filter status is now added during CleanVcf instead + record.info["UNRESOLVED"] = True + record.info["UNRESOLVED_TYPE"] = "POSTHOC_RD_GT_REJECTION" + elif mod == "RECLASSIFY": + if assessment.new_start: + record.pos = assessment.new_start + if assessment.new_end: + record.stop = assessment.new_end + if assessment.new_svlen: + record.info["SVLEN"] = assessment.new_svlen + if assessment.new_sv_type: + record.info["SVTYPE"] = assessment.new_sv_type + record.alts = ("<" + assessment.new_sv_type + ">",) + if assessment.new_cpx_type: + record.info["CPX_TYPE"] = assessment.new_cpx_type + if assessment.new_source: + record.info["SOURCE"] = assessment.new_source + elif "SOURCE" in record.info: + record.info.pop("SOURCE") + if assessment.new_cpx_intervals: + record.info["CPX_INTERVALS"] = assessment.new_cpx_intervals + if "UNRESOLVED" in 
record.info: + record.info.pop("UNRESOLVED") + if "UNRESOLVED_TYPE" in record.info: + record.info.pop("UNRESOLVED_TYPE") + if "EVENT" in record.info: + record.info.pop("EVENT") + + def _strip_cpx(record): + svtype = record.info.get("SVTYPE", "") + if svtype != "CPX" and svtype != "CTX": + if "CPX_TYPE" in record.info: + record.info.pop("CPX_TYPE") + if "CPX_INTERVALS" in record.info: + record.info.pop("CPX_INTERVALS") + + final_assessment_dict = {a.vid: a for a in final_assessment_list} + with VariantFile(input_vcf_path) as fin: + header = fin.header + header.add_line("##CPX_TYPE_delINV=\"Complex inversion with 5' flanking deletion.\"") + header.add_line("##CPX_TYPE_INVdel=\"Complex inversion with 3' flanking deletion.\"") + header.add_line("##CPX_TYPE_dupINV=\"Complex inversion with 5' flanking duplication.\"") + header.add_line("##CPX_TYPE_INVdup=\"Complex inversion with 3' flanking duplication.\"") + header.add_line("##CPX_TYPE_delINVdel=\"Complex inversion with 5' and 3' flanking deletions.\"") + header.add_line("##CPX_TYPE_dupINVdup=\"Complex inversion with 5' and 3' flanking duplications.\"") + header.add_line("##CPX_TYPE_delINVdup=\"Complex inversion with 5' flanking deletion and 3' flanking " + "duplication.\"") + header.add_line("##CPX_TYPE_dupINVdel=\"Complex inversion with 5' flanking duplication and 3' flanking " + "deletion.\"") + header.add_line("##CPX_TYPE_piDUP_FR=\"Palindromic inverted tandem duplication, forward-reverse orientation.\"") + header.add_line("##CPX_TYPE_piDUP_RF=\"Palindromic inverted tandem duplication, reverse-forward orientation.\"") + header.add_line("##CPX_TYPE_dDUP=\"Dispersed duplication.\"") + header.add_line("##CPX_TYPE_dDUP_iDEL=\"Dispersed duplication with deletion at insertion site.\"") + header.add_line("##CPX_TYPE_INS_iDEL=\"Insertion with deletion at insertion site.\"") + with VariantFile(output_path, 'w', header=header) as fout: + for r in fin: + if r.id in final_assessment_dict: + _modify_record(r, final_assessment_dict[r.id]) + _strip_cpx(r) + fout.write(r) + + +def make_genotype_tree(genotype_counts): + tree_dict = dict() + for g in genotype_counts: + if g.chrom not in tree_dict: + tree_dict[g.chrom] = IntervalTree() + tree_dict[g.chrom].addi(g.start, g.end, g) + return tree_dict + + +def parse_genotypes(path: Text) -> Dict: + genotype_list_dict = dict() + with gzip.open(path, 'rt') as f: + # Traverse depth genotypes table, containing one line per genotype per interval + for line in f: + if line.startswith('#') or line == "\n": + continue + tokens = line.strip().split('\t') + chrom = tokens[0] + start = int(tokens[1]) + end = int(tokens[2]) + vid = tokens[3] + sample = tokens[4] + copy_number = None if tokens[5] == "." 
else int(tokens[5]) + if vid not in genotype_list_dict: + genotype_list_dict[vid] = list() + matching_variant_info = None + for variant_info in genotype_list_dict[vid]: + # Find if there is already a variant matching on vid, chrom, start, and end + # Note that one variant may map to multiple intervals + if variant_info.chrom == chrom and variant_info.start == start and variant_info.end == end: + matching_variant_info = variant_info + break + if matching_variant_info is None: + # If match wasn't found, create new object for the site + matching_variant_info = VariantInfo(chrom, start, end) + genotype_list_dict[vid].append(matching_variant_info) + # Add sample copy state + matching_variant_info.genotype_copy_numbers[sample] = copy_number + return genotype_list_dict + + +def parse_ped(path: Text) -> (Set, Set, Set): + all_samples = set() + male_samples = set() + female_samples = set() + with open(path) as f: + for line in f: + tokens = line.strip().split('\t') + if len(tokens) < 5: + raise ValueError(f"Encountered line with fewer than 5 columns: {line.strip()}") + sample = tokens[1] + all_samples.add(sample) + if tokens[4] == "1": + male_samples.add(sample) + elif tokens[4] == "2": + female_samples.add(sample) + return all_samples, male_samples, female_samples + + +def write_reclassification_table(path, assessment): + with open(path, 'w') as f: + f.write("#VID\tMODIFICATION\tREASON\tNEW_SVTYPE\tNEW_CPX_TYPE\tNEW_CPX_INTERVALS\t" + "NEW_SVLEN\tNEW_SOURCE\tNEW_START\tNEW_END\n") + for r in assessment: + new_cpx_intervals = r.new_cpx_intervals if r.new_cpx_intervals else "." + new_source = r.new_source if r.new_source else "." + new_svlen = r.new_svlen if r.new_svlen is not None else "." + new_start = r.new_start if r.new_start is not None else "." + new_end = r.new_end if r.new_end is not None else "." + f.write(f"{r.vid}\t{r.modification}\t{r.reason}\t{r.new_sv_type}\t{r.new_cpx_type}\t{new_cpx_intervals}\t" + f"{new_svlen}\t{new_source}\t{new_start}\t{new_end}\n") + + +def _parse_arguments(argv: List[Text]) -> argparse.Namespace: + # noinspection PyTypeChecker + parser = argparse.ArgumentParser( + description="Reassign complex variant labels based on depth regenotyping", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('--vcf', type=str, help='Input vcf') + parser.add_argument('--intervals', type=str, help='BED file of genotyped intervals (gzipped)') + parser.add_argument('--genotypes', type=str, help='Melted depth genotypes (gzipped)') + parser.add_argument('--ped', type=str, help='PED family file') + parser.add_argument('--out', type=str, help='Output file') + parser.add_argument('--reclassification-table', type=str, help='Output reclassification table path', required=False) + parser.add_argument('--chrx', type=str, help='Chromosome X contig name', default='chrX') + parser.add_argument('--chry', type=str, help='Chromosome Y contig name', default='chrY') + parser.add_argument("-l", "--log-level", required=False, default="INFO", + help="Specify level of logging information, ie. info, warning, error (not case-sensitive). 
" + "Default: INFO") + if len(argv) <= 1: + parser.parse_args(["--help"]) + sys.exit(0) + parsed_arguments = parser.parse_args(argv[1:]) + return parsed_arguments + + +def main(argv: Optional[List[Text]] = None): + if argv is None: + argv = sys.argv + args = _parse_arguments(argv) + + # Set logging level from -l input + log_level = args.log_level + numeric_level = getattr(logging, log_level.upper(), None) + if not isinstance(numeric_level, int): + raise ValueError('Invalid log level: %s' % log_level) + logging.basicConfig(level=numeric_level, format='%(asctime)s - %(levelname)s: %(message)s') + + all_samples, male_samples, female_samples = parse_ped(args.ped) + genotype_dict = parse_genotypes(args.genotypes) + intervals_list_dict, genotype_counts = genotype_counts_per_variant(intervals_path=args.intervals, + genotype_dict=genotype_dict, + chr_x=args.chrx, + chr_y=args.chry, + all_samples=all_samples, + male_samples=male_samples, + female_samples=female_samples) + del genotype_dict + genotype_counts_tree_dict = make_genotype_tree(genotype_counts) + cleaned_genotype_counts = clean_up_intervals(genotype_counts, intervals_list_dict, genotype_counts_tree_dict) + del genotype_counts, intervals_list_dict, genotype_counts_tree_dict + variants_to_reclassify = read_vcf(args.vcf, cleaned_genotype_counts_vids=cleaned_genotype_counts.keys()) + assessment = list(final_assessment(cleaned_genotype_counts, variants_to_reclassify)) + del cleaned_genotype_counts, variants_to_reclassify + if args.reclassification_table: + write_reclassification_table(args.reclassification_table, assessment) + write_vcf(args.vcf, args.out, assessment) + + +if __name__ == "__main__": + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh deleted file mode 100755 index 087448a97..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh +++ /dev/null @@ -1,1079 +0,0 @@ -#!/bin/bash - -# Reassign variant labels based on depth regenotyping in mod04b - -set -eo pipefail -###USAGE -usage(){ -cat < Minimum insertion site size (in bp) to be considered for - distinguishing insertion site deletions [default: 150 bp] - -T Minimum size (in bp) at which to prioritize an inverted - dDUP classification over a dupINV or INVdup classification - [default: 1000000 bp] - -R Path to table containing the final reclassification - decision made per variant. [default: no table output] - -G Path to table containing the raw genotype counts table - per interval per variant. [default: no table output] - - -Notes: - 1. All input files other than FAMFILE must be compressed with bgzip. - 2. OUTVCF will be automatically bgzipped. - 3. Rationale for minimum insertion site size: reference alignment bias - can make insertion sites appear deleted, so -D should be set to a size - at which a confident depth assessment can be made on insertion site. Should - be at least as large as the library fragment size. 
- -EOF -} - - -###PARSE ARGS -MINSIZE=1000 -MINDIFF=0.4 -MINSIZEiDEL=150 -MINdDUPTHRESH=1000000 -unset RTABLE -unset GTCOUNTSTABLE -while getopts ":s:d:D:T:R:G:h" opt; do - case "$opt" in - h) - usage - exit 0 - ;; - s) - MINSIZE=${OPTARG} - ;; - d) - MINDIFF=${OPTARG} - ;; - D) - MINSIZEiDEL=${OPTARG} - ;; - T) - MINdDUPTHRESH=${OPTARG} - ;; - R) - RTABLE=${OPTARG} - ;; - G) - GTCOUNTSTABLE=${OPTARG} - ;; - esac -done -shift $(( ${OPTIND} - 1)) -INVCF=$1 -INTERVALS=$2 -GENOTYPES=$3 -FAMFILE=$4 -OUTVCF=$5 - - -###PROCESS ARGS & OPTIONS -#Check for required input -if [ -z ${INVCF} ]; then - echo -e "\nERROR: input VCF not specified\n" - usage - exit 0 -fi -if ! [ -s ${INVCF} ]; then - echo -e "\nERROR: input VCF either empty or not found\n" - usage - exit 0 -fi -if [[ ${INVCF} != *.gz ]]; then - echo -e "\nERROR: input VCF must be bgzipped\n" - usage - exit 0 -fi -if [ -z ${INTERVALS} ]; then - echo -e "\nERROR: intervals file not specified\n" - usage - exit 0 -fi -if ! [ -s ${INTERVALS} ]; then - echo -e "\nERROR: intervals file either empty or not found\n" - usage - exit 0 -fi -if [[ ${INTERVALS} != *.gz ]]; then - echo -e "\nERROR: intervals file must be bgzipped\n" - usage - exit 0 -fi -if [ -z ${GENOTYPES} ]; then - echo -e "\nERROR: melted genotypes file not specified\n" - usage - exit 0 -fi -if ! [ -s ${GENOTYPES} ]; then - echo -e "\nERROR: melted genotypes file either empty or not found\n" - usage - exit 0 -fi -if [[ ${GENOTYPES} != *.gz ]]; then - echo -e "\nERROR: melted genotypes file must be bgzipped\n" - usage - exit 0 -fi -if [ -z ${FAMFILE} ]; then - echo -e "\nERROR: input .fam file not specified\n" - usage - exit 0 -fi -if ! [ -s ${FAMFILE} ]; then - echo -e "\nERROR: input .fam file either empty or not found\n" - usage - exit 0 -fi -if [ -z ${OUTVCF} ]; then - echo -e "\nERROR: path to output VCF not specified\n" - usage - exit 0 -fi -if [ -z ${MINSIZE} ]; then - echo -e "\nERROR: minimum CNV size not set\n" - usage - exit 0 -fi -if [ -z ${MINDIFF} ]; then - echo -e "\nERROR: minimum CNV carrier freq difference not set\n" - usage - exit 0 -fi -#Set path to execution directory -BIN=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) -#Prepares temporary directory -GTDIR=`mktemp -d` - - -###PREPARE SAMPLE lists -#Master lists of samples extracted from famfile -if [[ ${FAMFILE} == *.gz ]]; then - #All samples - zcat ${FAMFILE} | fgrep -v "#" | cut -f2 | sort | uniq \ - > ${GTDIR}/all.samples.list - #Males - zcat ${FAMFILE} | fgrep -v "#" \ - | awk '{ if ($5=="1") print $2 }' | sort | uniq \ - > ${GTDIR}/male.samples.list - #Females - zcat ${FAMFILE} | fgrep -v "#" \ - | awk '{ if ($5=="2") print $2 }' | sort | uniq \ - > ${GTDIR}/female.samples.list -else - #All samples - fgrep -v "#" ${FAMFILE} | cut -f2 | sort | uniq \ - > ${GTDIR}/all.samples.list - #Males - fgrep -v "#" ${FAMFILE} \ - | awk '{ if ($5=="1") print $2 }' | sort | uniq \ - > ${GTDIR}/male.samples.list - #Females - fgrep -v "#" ${FAMFILE} \ - | awk '{ if ($5=="2") print $2 }' | sort | uniq \ - > ${GTDIR}/female.samples.list -fi - - -###GATHER DATA PER VARIANT -#Header -echo -e "#chr\tstart\tend\tVID\tcarrier_del\tcarrier_wt\tcarrier_dup\ -\tcontrol_del\tcontrol_wt\tcontrol_dup\ -\tdiff_case_control_del_frac\tdiff_case_control_dup_frac" \ - > ${GTDIR}/genotype_counts_per_variant.bed -#Count copy states of carriers & noncarriers per variant -while read chr start end VID samps trash; do - #Wrap one line per variant - for wrapper in 1; do - #Print variant info - echo -e "${chr}\t${start}\t${end}\t${VID}" - - #Clear 
existing files - if [ -e ${GTDIR}/carrier_samples.tmp ]; then - rm ${GTDIR}/carrier_samples.tmp - fi - if [ -e ${GTDIR}/control_samples.tmp ]; then - rm ${GTDIR}/control_samples.tmp - fi - unset medCN - - ###Get list of samples & reference CN to consider, dependent on chr of call - #ChrX: use only diploid females if possible, otherwise use haploid males - if [ ${chr} == "X" ] || [ ${chr} == "chrX" ]; then - - #Try to get female carrier samples - echo -e "${samps}" | sed 's/,/\n/g' \ - | fgrep -wf - ${GTDIR}/female.samples.list \ - > ${GTDIR}/carrier_samples.tmp ||true - - #Use only females if there are any female carriers - if [ $( cat ${GTDIR}/carrier_samples.tmp | wc -l ) -gt 0 ]; then - - #Get non-carrier female controls - fgrep -wvf ${GTDIR}/carrier_samples.tmp \ - ${GTDIR}/female.samples.list \ - > ${GTDIR}/control_samples.tmp ||true - - #Set default medCN - medCNdefault=2 - - #Otherwise, use males - else - - #Get male carrier samples - echo -e "${samps}" | sed 's/,/\n/g' \ - | fgrep -wf - ${GTDIR}/male.samples.list \ - > ${GTDIR}/carrier_samples.tmp ||true - - #Get non-carrier male controls - fgrep -wvf ${GTDIR}/carrier_samples.tmp \ - ${GTDIR}/male.samples.list \ - > ${GTDIR}/control_samples.tmp ||true - - #Set default medCN - medCNdefault=1 - fi - - #ChrY: use only haploid males - else - if [ ${chr} == "Y" ] || [ ${chr} == "chrY" ]; then - - #Get male carrier samples - echo -e "${samps}" | sed 's/,/\n/g' \ - | fgrep -wf - ${GTDIR}/male.samples.list \ - > ${GTDIR}/carrier_samples.tmp ||true - - #Get non-carrier male controls - fgrep -wvf ${GTDIR}/carrier_samples.tmp \ - ${GTDIR}/male.samples.list \ - > ${GTDIR}/control_samples.tmp ||true - - #Set default medCN - medCNdefault=1 - - #All other chromosomes: use all samples & assume diploid locus - else - - #Get all carrier samples - echo "${samps}" | sed 's/,/\n/g' \ - | sort | uniq > ${GTDIR}/carrier_samples.tmp - - #Get all non-carrier controls - fgrep -wvf ${GTDIR}/carrier_samples.tmp \ - ${GTDIR}/all.samples.list \ - > ${GTDIR}/control_samples.tmp ||true - - #Set default medCN - medCNdefault=2 - fi - fi - - - #Gather genotypes corresponding to interval of interest - if [ $(( ${end} - ${start} )) -lt 1000000 ]; then - bedtools intersect -wa -r -f 0.95 \ - -a ${GENOTYPES} \ - -b <( echo -e "${chr}\t${start}\t${end}" ) \ - | awk -v OFS="\t" -v VID=${VID} '{ if ($4==VID) print $0 }' \ - > ${GTDIR}/intersected_genotypes.tmp.bed - else - zcat ${GENOTYPES} \ - | awk -v OFS="\t" -v VID=${VID} '{ if ($4==VID) print $0 }' \ - | bedtools coverage -f 1 \ - -a - \ - -b <( echo -e "${chr}\t${start}\t${end}" ) \ - | cut -f1-7 \ - > ${GTDIR}/intersected_genotypes.tmp.bed - fi - - #Get median copy number across predicted non-carrier controls - if [ $( cat ${GTDIR}/control_samples.tmp | wc -l ) -gt 0 ]; then - medCN=$( fgrep -wf ${GTDIR}/control_samples.tmp \ - ${GTDIR}/intersected_genotypes.tmp.bed \ - | cut -f6 \ - | sort -nk1,1 \ - | perl -e '$d=.5;@l=<>;print $l[int($d*$#l)]' ||true) - fi - #If no non-carriers exist, assume median CN = default - if [ -z ${medCN} ]; then - medCN=${medCNdefault} - fi - - - #For predicted carriers, count number of genotypes lower than, - # equal to, and greater than the overall median - if [ $( cat ${GTDIR}/carrier_samples.tmp | wc -l ) -gt 0 ]; then - fgrep -wf ${GTDIR}/carrier_samples.tmp \ - ${GTDIR}/intersected_genotypes.tmp.bed \ - | awk -v OFS="\t" -v medCN=${medCN} '{ if ($6medCN) print $6 }' | wc -l ||true - else - echo -e "0\n0\n0" - fi - - #For predicted non-carriers, count number of genotypes lower than, 
- # equal to, and greater than the overall median - if [ $( cat ${GTDIR}/control_samples.tmp | wc -l ) -gt 0 ]; then - fgrep -wf ${GTDIR}/control_samples.tmp \ - ${GTDIR}/intersected_genotypes.tmp.bed \ - | awk -v OFS="\t" -v medCN=${medCN} '{ if ($6medCN) print $6 }' | wc -l ||true - else - echo -e "0\n0\n0" - fi - - #Clean up tmp file - rm ${GTDIR}/intersected_genotypes.tmp.bed - done \ - | paste -s \ - | awk -v OFS="\t" \ - '{ ncase=$5+$6+$7; nctrl=$8+$9+$10; if (nctrl==0) nctrl=1; \ - if (ncase>0) print $0, ($5/ncase)-($8/nctrl), ($7/ncase)-($10/nctrl); else print $0, 0, 0 }' -done < <( zcat ${INTERVALS} | fgrep -v "#" ) \ - >> ${GTDIR}/genotype_counts_per_variant.bed - - -###CLEAN UP INTERVALS -#Write header -echo -e "#chr\tstart\tend\tinterval_size\tinterval_type\tVID\tvariant_svtype\tvariant_cpxtype\ -\tcarrier_del\tcarrier_wt\tcarrier_dup\ -\tcontrol_del\tcontrol_wt\tcontrol_dup\ -\tdiff_case_control_del_frac\tdiff_case_control_dup_frac\ -\tCNV_assessment" \ - > ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed -#Match intervals per variant with CNV evidence -while read mergedVID; do - VID=$( echo "${mergedVID}" | sed 's/;/\t/g' | cut -f1 ) - cpxtype=$( echo "${mergedVID}" | sed 's/;/\t/g' | cut -f2 ) - svtype=$( echo "${mergedVID}" | sed 's/;/\t/g' | cut -f3 | head -n1 ) - intervals=$( echo "${mergedVID}" | sed 's/;/\t/g' | cut -f4- | paste -s -d\; ) - if [ ${intervals} == "NA" ]; then - intervals=$( zcat ${INTERVALS} \ - | fgrep -w "${mergedVID}" \ - | awk -v OFS="\t" '{ print "UNK_"$1":"$2"-"$3 }' \ - | paste -s -d, ||true) - fi - echo "${intervals}" | sed -e 's/\_/\t/g' -e 's/\:/\t/g' -e 's/\-/\t/g' -e 's/,/\n/g' \ - | awk -v OFS="\t" -v VID=${VID} -v cpxtype=${cpxtype} -v svtype=${svtype} \ - '{ print $2, $3, $4, $4-$3, $1, VID, svtype, cpxtype }' \ - | bedtools intersect -wb -r -f 0.95 \ - -a - \ - -b ${GTDIR}/genotype_counts_per_variant.bed \ - | cut --complement -f9-12 \ - | awk -v OFS="\t" -v MINSIZE=${MINSIZE} -v MINDIFF=${MINDIFF} \ - '{ if ($(NF-1)>MINDIFF && $NF>MINDIFF) assess="DELDUP"; \ - else if ($(NF-1)>MINDIFF && $NF<=MINDIFF) assess="DEL"; \ - else if ($(NF-1)<=MINDIFF && $NF>MINDIFF) assess="DUP"; \ - else if ($3-$2> ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed - - -###CONVERT RELEVANT VARIANTS FROM VCF TO BED -#Start with all but INV single enders -cat <( zcat ${INVCF} | fgrep "#" ) \ - <( zcat ${INVCF} | fgrep -v "#" \ - | fgrep -wf <( fgrep -v "#" \ - ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | { fgrep -v "INVERSION_SINGLE_ENDER" || true; } \ - | cut -f6 | sort | uniq ||true) ) \ - | svtk vcf2bed --no-samples --info ALL \ - - ${GTDIR}/variants_to_reclassify.vcf2bed.bed -#Add inversion single enders and use modified coordinates -cat <( zcat ${INVCF} | fgrep "#" ) \ - <( zcat ${INVCF} | fgrep -v "#" \ - | fgrep -wf <( fgrep -v "#" \ - ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | { fgrep "INVERSION_SINGLE_ENDER" || true; } \ - | cut -f6 | sort | uniq ||true) ) \ - | svtk vcf2bed --no-samples --info ALL \ - - ${GTDIR}/inv_se_vcf2bed.precut.bed -ENDidx=$( head -n1 ${GTDIR}/inv_se_vcf2bed.precut.bed \ - | sed 's/\t/\n/g' \ - | awk '{ if ($1=="END") print NR }' ) -awk -v ENDidx=${ENDidx} -v OFS="\t" '{ $3=$ENDidx; print }' \ - ${GTDIR}/inv_se_vcf2bed.precut.bed \ - | fgrep -v "#" || true \ - >> ${GTDIR}/variants_to_reclassify.vcf2bed.bed - - -###MAKE FINAL ASSESSMENT FOR EACH VARIANT -#Print header -echo -e 
"#VID\tMODIFICATION\tREASON\tNEW_SVTYPE\tNEW_CPX_TYPE\tNEW_CPX_INTERVALS\tNEW_SVLEN\tNEW_SOURCE\tNEW_START\tNEW_END" \ - > ${GTDIR}/final_variant_reassessment_table.txt -#Get variables required for processing -SOURCEidx=$( head -n1 ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | sed 's/\t/\n/g' \ - | awk '{ if ($1=="SOURCE") print NR }' ) -#Iterate and evaluate each SV -while read VID; do - #Get base variant class - svtype=$( fgrep -w ${VID} ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | cut -f7 | sort | uniq -c | sort -nrk1,1 | head -n1 | awk '{ print $2 }' ||true) - - #Get predicted complex type - cpxtype=$( fgrep -w ${VID} ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | cut -f8 | sort | uniq -c | sort -nrk1,1 | head -n1 | awk '{ print $2 }' ||true) - - #Logic varies based on variant class - case ${svtype} in - #Evaluate existing complex variants - "CPX") - #Check if the CPX type has any DEL or DUP intervals predicted - if [ $( fgrep -w ${VID} ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | cut -f5 | grep -e 'DEL\|DUP' | wc -l ||true) -gt 0 ]; then - #Check if any CNV interval larger than min size failed to validate - nfail=$( awk -v VID=${VID} '{ if ($6==VID && ($5=="DEL" || $5=="DUP") && $NF=="WT" ) print $0 }' \ - ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed | wc -l ) - if [ ${nfail} -gt 0 ]; then - MOD="UNRESOLVED" - REASON="AT_LEAST_ONE_CNV_INTERVAL_FAILED" - - #Otherwise, check if at least one CNV interval validated - else - nval=$( awk -v VID=${VID} '{ if ($6==VID && ($5=="DEL" || $5=="DUP") && ($5==$NF || $NF=="DELDUP") ) print $0 }' \ - ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed | wc -l ) - if [ ${nval} -gt 0 ]; then - MOD="KEEP" - REASON="AT_LEAST_ONE_CNV_VALIDATED" - - #Otherwise, not enough information to make a call, so keep it - else - MOD="KEEP" - REASON="NO_CNVS_LARGER_THAN_MIN_SIZE" - fi - fi - else - MOD="KEEP" - REASON="NO_PREDICTED_CNV_INTERVALS" - fi - ;; - - #Examine CNV evidence for insertion variants - "INS") - case ${cpxtype} in - ###Solve translocational insertions - #A2B: A interval inserted into B - #B2A: B interval inserted into A - #CTX_INS_A2B: A interval inserted into B, interchromosomal, can be inverted - #CTX_INS_B2A: B interval inserted into A, interchromosomal, can be inverted - #OTHERWISE: Just test for INS-iDEL - #Note: CTX_INS_B2A variants are not resolved here, because they are indexed - # onto a different chromosome, and will be dealt with in that shard - INS_A2B|INS_B2A|CTX_*INS_A2B|CTX_*INS_B2A) - #Get inserted sequence coordinates - sourcecoord=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | cut -f${SOURCEidx} | sed -e 's/_/\t/g' | cut -f2 ||true) - #Skip variant if no source coordinate is specified - if ! 
[ -z ${sourcecoord} ]; then - #Check if insertion is duplicated - sourceisdup=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${sourcecoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DUP") print $0 }' | wc -l ||true) - #Get sink sequence coordinates - sinkcoord=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sourcecoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ print $1":"$2"-"$3 }' | head -n1 ||true) - #Get sink sequence size - sinksize=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sourcecoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ print $4 }' | head -n1 ||true) - #If insertion site size >= $MINSIZEiDEL, check if insertion site is deleted - if ! [ -z ${sinksize} ] && [ ${sinksize} -ge ${MINSIZEiDEL} ]; then - sinkisdel=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sourcecoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DEL") print $0 }' | wc -l ||true) - else - sinkisdel=0 - fi - #If source is duplicated and sink is deleted, reclassify as CPX, dDUP_iDEL - if [ ${sourceisdup} -gt 0 ] && [ ${sinkisdel} -gt 0 ]; then - svtype="CPX" - cpxtype="dDUP_iDEL" - MOD="RECLASSIFY" - REASON="DISPERSED_DUPLICATION_W_INSERT_SITE_DEL" - cpxintervals="DUP_${sourcecoord},DEL_${sinkcoord}" - SOURCE="DUP_${sourcecoord}" - SVLEN=$( echo "${cpxintervals}" \ - | sed -e 's/\,/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ sum+=$3-$2 }END{ print sum }' ) - #If source is duplicated and sink is not deleted, reclassify as CPX, dDUP - elif [ ${sourceisdup} -gt 0 ] && [ ${sinkisdel} -eq 0 ]; then - svtype="CPX" - cpxtype="dDUP" - MOD="RECLASSIFY" - REASON="DISPERSED_DUPLICATION" - cpxintervals="DUP_${sourcecoord}" - SOURCE="DUP_${sourcecoord}" - #If source is not duplicated but sink is delted, reclassify as CPX, INS_iDEL - elif [ ${sourceisdup} -eq 0 ] && [ ${sinkisdel} -gt 0 ]; then - svtype="CPX" - cpxtype="INS_iDEL" - MOD="RECLASSIFY" - REASON="INSERT_SITE_DEL" - cpxintervals="DEL_${sinkcoord}" - SOURCE="INS_${sourcecoord}" - SVLEN=$( echo "${cpxintervals},${sourcecoord}" \ - | sed -e 's/\,/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ sum+=$3-$2 }END{ print sum }' ) - #Otherwise, leave as canonical insertion - else - MOD="KEEP" - REASON="NON_DUPLICATED_INSERTION" - fi - else - MOD="SKIP" - REASON="NO_SOURCE_INTERVAL_IN_CURRENT_SHARD" - fi - ;; - - ###Solve inverted dispersed duplications vs dupINV / dupINVdel or INVdup / delINVdup - #DUP5/INS3 or dupINV / dupINVdel - "DUP5/INS3") - #Get duplication/insertion interval - dupinterval=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | cut -f${SOURCEidx} | sed -e 's/_/\t/g' | cut -f2 ||true) - #Get duplication interval length - dupsize=$( echo "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #Test dup interval for evidence of duplication - dupconfirmed=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF!="WT") print $0 }' | wc -l ||true) - #As long as dup doesn't explicitly fail depth 
genotyping, proceed - if [ ${dupconfirmed} -gt 0 ]; then - #Get sink interval - sinkinterval=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | awk '{ print $1":"$2"-"$3 }' ||true) - #Get sink interval length - sinksize=$( echo "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #If sink length >= $MINSIZEiDEL, test sink interval for evidence of deletion - if [ ${sinksize} -ge ${MINSIZEiDEL} ]; then - sinkisdel=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DEL") print $0 }' | wc -l ||true) - else - sinkisdel=0 - fi - #Get inversion interval. Corresponds to min dup/ins coord and max sink coord - invinterval=$( paste \ - <( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed | cut -f1 ||true) \ - <( echo "${dupinterval}" | sed 's/\:/\t/g' | cut -f2 | cut -f1 -d\- ) \ - <( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed | cut -f3 ||true) \ - | awk '{ print $1":"$2"-"$3 }' ) - #Get inversion size - invsize=$( echo "${invinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #If inversion length < $MINdDUPTHRESH, classify as dupINV or dupINVdel - if [ ${invsize} -lt ${MINdDUPTHRESH} ]; then - #If sink is deleted, classify as dupINVdel - if [ ${sinkisdel} -gt 0 ]; then - #Revise inv interval (subtracting del interval) - invinterval=$( bedtools subtract \ - -a <( echo "${invinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - -b <( echo "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | head -n1 | awk '{ print $1":"$2"-"$3 }' ) - #Update other info - svtype="CPX" - cpxtype="dupINVdel" - MOD="RECLASSIFY" - REASON="DUP_FLANKED_INVERSION_WITH_DEL" - cpxintervals="DUP_${dupinterval},INV_${invinterval},DEL_${sinkinterval}" - SVLEN=${invsize} - START=$( echo -e "${dupinterval}\n${invinterval}\n${sinkinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | head -n1 ) - END=$( echo -e "${dupinterval}\n${invinterval}\n${sinkinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | tail -n1 ) - #If sink is not clearly deleted (or too small), classify as dupINV (no del) - else - svtype="CPX" - cpxtype="dupINV" - MOD="RECLASSIFY" - REASON="DUP_FLANKED_INVERSION" - cpxintervals="DUP_${dupinterval},INV_${invinterval}" - SVLEN=${invsize} - START=$( echo -e "${dupinterval}\n${invinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | head -n1 ) - END=$( echo -e "${dupinterval}\n${invinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | tail -n1 ) - fi - #Otherwise, reclassify as dDUP or dDUP_iDEL - else - #If sink is deleted, classify as dDUP_iDEL - if [ ${sinkisdel} -gt 0 ]; then - svtype="CPX" - cpxtype="dDUP_iDEL" - MOD="RECLASSIFY" - REASON="INVERTED_DISPERSED_DUPLICATION_WITH_DELETION" - cpxintervals="DUP_${dupinterval},INV_${dupinterval},DEL_${sinkinterval}" - SOURCE="DUP_${dupinterval}" - SVLEN=$(( ${dupsize}+${sinksize} )) - #If sink is not clearly deleted (or too small), classify as dDUP (no iDEL) - else - svtype="CPX" - cpxtype="dDUP" - MOD="RECLASSIFY" - REASON="INVERTED_DISPERSED_DUPLICATION" - cpxintervals="DUP_${dupinterval},INV_${dupinterval}" - SOURCE="DUP_${dupinterval}" - SVLEN="${dupsize}" - fi - fi - #If dup explicitly fails depth genotyping, mark as unresolved - else - MOD="UNRESOLVED" - REASON="PREDICTED_DUP_INTERVAL_FAILED_GT" - fi - ;; - - #DUP3/INS5 or 
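The DUP5/INS3 case above and the mirrored DUP3/INS5 case that follows differ only in which end carries the duplication: the implied inversion runs from the dup start to the sink end in one orientation, and from the sink start to the dup end in the other, and a call whose inversion span reaches the $MINdDUPTHRESH cutoff is resolved as a dispersed duplication instead. A sketch parametrizing both orientations (helper names are hypothetical):

```python
MIN_DDUP_THRESH = 1_000_000  # dispersal cutoff, i.e. $MINdDUPTHRESH

def implied_inversion(sink_start, sink_end, dup_start, dup_end, dup_is_5prime):
    """DUP5/INS3: inversion spans dup start -> sink end;
    DUP3/INS5: inversion spans sink start -> dup end."""
    if dup_is_5prime:
        return dup_start, sink_end
    return sink_start, dup_end

def is_dispersed_duplication(inv_start, inv_end):
    """At or above the cutoff, resolve as dDUP/dDUP_iDEL rather than as a
    dup-flanked inversion (dupINV/dupINVdel or INVdup/delINVdup)."""
    return inv_end - inv_start >= MIN_DDUP_THRESH
```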
INVdup / delINVdup - "DUP3/INS5") - #Get duplication/insertion interval - dupinterval=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | cut -f${SOURCEidx} | sed -e 's/_/\t/g' | cut -f2 ||true) - #Get duplication interval length - dupsize=$( echo "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #Test dup interval for evidence of duplication - dupconfirmed=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF!="WT") print $0 }' | wc -l ||true) - #As long as dup doesn't explicitly fail depth genotyping, proceed - if [ ${dupconfirmed} -gt 0 ]; then - #Get sink interval - sinkinterval=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | awk '{ print $1":"$2"-"$3 }' ||true) - #Get sink interval length - sinksize=$( echo "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #If sink length >= $MINSIZEiDEL, test sink interval for evidence of deletion - if [ ${sinksize} -ge ${MINSIZEiDEL} ]; then - sinkisdel=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DEL") print $0 }' | wc -l ||true) - else - sinkisdel=0 - fi - #Get inversion interval. Corresponds to min sink coord and max dup/ins coord - invinterval=$( paste \ - <( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed | cut -f1 ||true) \ - <( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed | cut -f2 ||true) \ - <( echo "${dupinterval}" | sed 's/\:/\t/g' | cut -f2 | cut -f2 -d\- ) \ - | awk '{ print $1":"$2"-"$3 }' ) - #Get inversion size - invsize=$( echo "${invinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #If inversion length < $MINdDUPTHRESH, classify as INVdup or delINVdup - if [ ${invsize} -lt ${MINdDUPTHRESH} ]; then - #If sink is deleted, classify as delINVdup - if [ ${sinkisdel} -gt 0 ]; then - #Revise inv interval (subtracting del interval) - invinterval=$( bedtools subtract \ - -a <( echo "${invinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - -b <( echo "${sinkinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | head -n1 | awk '{ print $1":"$2"-"$3 }' ) - #Update other info - svtype="CPX" - cpxtype="delINVdup" - MOD="RECLASSIFY" - REASON="DUP_FLANKED_INVERSION_WITH_DEL" - cpxintervals="DEL_${sinkinterval},INV_${invinterval},DUP_${dupinterval}" - SVLEN=${invsize} - START=$( echo -e "${dupinterval}\n${invinterval}\n${sinkinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | head -n1 ) - END=$( echo -e "${dupinterval}\n${invinterval}\n${sinkinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | tail -n1 ) - #If sink is not clearly deleted (or too small), classify as INVdup (no del) - else - svtype="CPX" - cpxtype="INVdup" - MOD="RECLASSIFY" - REASON="DUP_FLANKED_INVERSION" - cpxintervals="INV_${invinterval},DUP_${dupinterval}" - SVLEN=${invsize} - START=$( echo -e "${dupinterval}\n${invinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | head -n1 ) - END=$( echo -e "${dupinterval}\n${invinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | tail -n1 ) - fi - #Otherwise, reclassify as dDUP or dDUP_iDEL - else - #If sink is deleted, classify as dDUP_iDEL - if [ ${sinkisdel} -gt 0 ]; 
then - svtype="CPX" - cpxtype="dDUP_iDEL" - MOD="RECLASSIFY" - REASON="INVERTED_DISPERSED_DUPLICATION_WITH_DELETION" - cpxintervals="DUP_${dupinterval},INV_${dupinterval},DEL_${sinkinterval}" - SOURCE="DUP_${dupinterval}" - SVLEN=$(( ${dupsize}+${sinksize} )) - #If sink is not clearly deleted (or too small), classify as dDUP (no iDEL) - else - svtype="CPX" - cpxtype="dDUP" - MOD="RECLASSIFY" - REASON="INVERTED_DISPERSED_DUPLICATION" - cpxintervals="INV_${dupinterval},DUP_${dupinterval}" - SOURCE="DUP_${dupinterval}" - SVLEN="${dupsize}" - fi - fi - #If dup explicitly fails depth genotyping, mark as unresolved - else - MOD="UNRESOLVED" - REASON="PREDICTED_DUP_INTERVAL_FAILED_GT" - fi - ;; - - #Otherwise, just test for INS-iDEL - *) - #Get inserted sequence coordinates - sourcecoord=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | cut -f${SOURCEidx} | sed -e 's/_/\t/g' | cut -f2 ||true) - - #Get sink sequence coordinates - sinkcoord=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | awk '{ print $1":"$2"-"$3 }' ||true) - #Skip variant if no sink coordinate is specified - if ! [ -z ${sinkcoord} ]; then - #Get sink sequence size - sinksize=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sinkcoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ print $4 }' | head -n1 ||true) - #If insertion site size >= $MINSIZEiDEL, check if insertion site is deleted - if ! [ -z ${sinksize} ] && [ ${sinksize} -ge ${MINSIZEiDEL} ]; then - sinkisdel=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sinkcoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DEL") print $0 }' | wc -l ||true) - sinkisnotdel=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -v -r -f 0.95 -a - \ - -b <( echo -e "${sinkcoord}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="WT") print $0 }' | wc -l ||true) - else - sinkisdel=0 - sinkisnotdel=0 - fi - #If sink is deleted, reclassify as CPX, INS_iDEL - if [ ${sourceisdup} -eq 0 ] && [ ${sinkisdel} -gt 0 ]; then - svtype="CPX" - cpxtype="INS_iDEL" - MOD="RECLASSIFY" - REASON="INSERT_SITE_DEL" - cpxintervals="DEL_${sinkcoord}" - SOURCE="INS_${sourcecoord}" - SVLEN=$( echo "${cpxintervals},${sourcecoord}" \ - | sed -e 's/\,/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ sum+=$3-$2 }END{ print sum }' ) - #If sink is explicitly *not* deleted and is >= $MINSIZE, reclassify as UNRESOLVED - elif [ ${sinkisnotdel} -gt 0 ] && [ ${sinksize} -ge ${MINSIZE} ]; then - MOD="UNRESOLVED" - REASON="PREDICTED_LARGE_SINK_DEL_INTERVAL_FAILED_GT" - #Otherwise, leave as canonical insertion - else - MOD="KEEP" - REASON="CANONICAL_INS_NO_SINK_DELETION" - fi - else - MOD="SKIP" - REASON="NO_SINK_INTERVAL_IN_CURRENT_SHARD" - fi - ;; - esac - ;; - - #Salvage inverted duplications from inversion single enders - "BND") - #Only consider inversion single enders - if [ ${cpxtype} == "INVERSION_SINGLE_ENDER_--" ] \ - || [ ${cpxtype} == "INVERSION_SINGLE_ENDER_++" ]; then - #Get span of inversion - dupinterval=$( fgrep -w "${VID}" ${GTDIR}/variants_to_reclassify.vcf2bed.bed \ - | awk '{ print $1":"$2"-"$3 }' ||true) - #Get duplication interval length - dupsize=$( echo "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | awk '{ print $3-$2 }' ) - #Test dup interval for explicit 
confirmation of duplication - dupconfirmed=$( fgrep -w "${VID}" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | bedtools intersect -wa -r -f 0.95 -a - \ - -b <( echo -e "${dupinterval}" | sed -e 's/\:/\t/g' -e 's/\-/\t/g' ) \ - | awk '{ if ($NF=="DUP") print $0 }' | wc -l ||true) - #If dup confirms, resolve as palindromic inverted duplication - if [ ${dupconfirmed} -gt 0 ]; then - svtype="CPX" - MOD="RECLASSIFY" - REASON="PALINDROMIC_INVERTED_DUPLICATION" - cpxintervals="DUP_${dupinterval},INV_${dupinterval}" - SVLEN=${dupsize} - START=$( echo -e "${dupinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | head -n1 ) - END=$( echo -e "${dupinterval}" \ - | cut -f2- -d\: | sed 's/\-/\n/g' \ - | sort -nk1,1 | tail -n1 ) - #RR single enders = piDUP_FR - if [ ${cpxtype} == "INVERSION_SINGLE_ENDER_--" ]; then - cpxtype="piDUP_FR" - else - cpxtype="piDUP_RF" - fi - #Otherwise, leave as unresolved - else - MOD="KEEP" - REASON="DID_NOT_RESOLVE_AS_piDUP" - fi - fi - ;; - - *) - MOD="KEEP" - REASON="IRRELEVANT_SV_TYPE" - ;; - esac - - #Leave new intervals and SVLEN as unset if none specified - if [ -z ${cpxintervals} ]; then - cpxintervals="." - fi - if [ -z ${SVLEN} ]; then - SVLEN="." - fi - if [ -z ${SOURCE} ]; then - SOURCE="." - fi - if [ -z ${START} ]; then - START="." - fi - if [ -z ${END} ]; then - END="." - fi - - #Print final result per variant - echo -e "${VID}\t${MOD}\t${REASON}\t${svtype}\t${cpxtype}\t${cpxintervals}\t${SVLEN}\t${SOURCE}\t${START}\t${END}" - - #Unset all intermediate variables - unset nval; unset MOD; unset REASON; unset cpxintervals; unset SVLEN; unset SOURCE; unset START; unset END - -done < <( fgrep -v "#" ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed \ - | cut -f6 | sort | uniq ||true) \ - >> ${GTDIR}/final_variant_reassessment_table.txt -#Copy final reclassification table, if optioned -if ! [ -z ${RTABLE} ]; then - cp ${GTDIR}/final_variant_reassessment_table.txt ${RTABLE} -fi -#Copy genotyping counts, if optioned -if ! 
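The BND salvage above keys entirely on the single-ender orientation and on whether the variant span validated as a duplication: a confirmed '--' single-ender becomes piDUP_FR and a confirmed '++' one becomes piDUP_RF. A compact sketch (function name hypothetical):

```python
def salvage_single_ender(cpx_type, span, dup_confirmed):
    """Return (mod, reason, new_cpx_type, cpx_intervals) for an inversion
    single-ender whose span is 'chrom:start-end'; None if the rule does
    not apply to this BND."""
    if cpx_type not in ("INVERSION_SINGLE_ENDER_--", "INVERSION_SINGLE_ENDER_++"):
        return None
    if not dup_confirmed:
        return ("KEEP", "DID_NOT_RESOLVE_AS_piDUP", None, None)
    new_cpx_type = "piDUP_FR" if cpx_type.endswith("--") else "piDUP_RF"
    return ("RECLASSIFY", "PALINDROMIC_INVERTED_DUPLICATION",
            new_cpx_type, f"DUP_{span},INV_{span}")
```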
[ -z ${GTCOUNTSTABLE} ]; then - cp ${GTDIR}/genotype_counts_per_variant.cleaned_intervals.bed ${GTCOUNTSTABLE} -fi - - -###GENERATE FINAL VCF WITH VARIANT REASSESSMENTS -#Write header to final VCF -zcat ${INVCF} | fgrep "##" \ - > ${GTDIR}/cleaned_output.vcf -#Add lines corresponding to complex types -for wrapper in 1; do - echo -e "##CPX_TYPE_delINV=\"Complex inversion with 5' flanking deletion.\"" - echo -e "##CPX_TYPE_INVdel=\"Complex inversion with 3' flanking deletion.\"" - echo -e "##CPX_TYPE_dupINV=\"Complex inversion with 5' flanking duplication.\"" - echo -e "##CPX_TYPE_INVdup=\"Complex inversion with 3' flanking duplication.\"" - echo -e "##CPX_TYPE_delINVdel=\"Complex inversion with 5' and 3' flanking deletions.\"" - echo -e "##CPX_TYPE_dupINVdup=\"Complex inversion with 5' and 3' flanking duplications.\"" - echo -e "##CPX_TYPE_delINVdup=\"Complex inversion with 5' flanking deletion and 3' flanking duplication.\"" - echo -e "##CPX_TYPE_dupINVdel=\"Complex inversion with 5' flanking duplication and 3' flanking deletion.\"" - echo -e "##CPX_TYPE_piDUP_FR=\"Palindromic inverted tandem duplication, forward-reverse orientation.\"" - echo -e "##CPX_TYPE_piDUP_RF=\"Palindromic inverted tandem duplication, reverse-forward orientation.\"" - echo -e "##CPX_TYPE_dDUP=\"Dispersed duplication.\"" - echo -e "##CPX_TYPE_dDUP_iDEL=\"Dispersed duplication with deletion at insertion site.\"" - echo -e "##CPX_TYPE_INS_iDEL=\"Insertion with deletion at insertion site.\"" -done >> ${GTDIR}/cleaned_output.vcf -#Add samples line to VCF header -zcat ${INVCF} | fgrep "#" | fgrep -v "##" \ - >> ${GTDIR}/cleaned_output.vcf -#Write new vcf without variants in the final reassessment table -zcat ${INVCF} | fgrep -v "#" \ - | fgrep -wvf <( cut -f1 ${GTDIR}/final_variant_reassessment_table.txt | fgrep -v "#" ) \ - >> ${GTDIR}/cleaned_output.vcf ||true -#Get smaller VCF (no header) with all variants in final assessment table -zcat ${INVCF} \ - | fgrep -wf <( cut -f1 ${GTDIR}/final_variant_reassessment_table.txt | fgrep -v "#" ) \ - > ${GTDIR}/variants_to_be_reassessed.vcf||true -#Next, iterate over the final variant reassessment table & take action for each row -while read VID MOD REASON svtype cpxtype cpxintervals SVLEN SOURCE START END; do - #Perform different actions based on $MOD flag - case ${MOD} in - #Emit KEEP or SKIP variants as-is - "SKIP"|"KEEP") - fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf - ;; - - #Reassign UNRESOLVED variants to unresolved & scrub previous info - "UNRESOLVED") - - #Set filter status to UNRESOLVED and add UNRESOLVED - awk -v vid=${VID} \ - '$3 == vid {OFS="\t"; $7 = "UNRESOLVED"; $8 = $8";UNRESOLVED;UNRESOLVED_TYPE=POSTHOC_RD_GT_REJECTION"; print;}' \ - ${GTDIR}/variants_to_be_reassessed.vcf - ;; - - #Revise records for RECLASSIFY variants as appropriate - "RECLASSIFY") - #Get missing SVLEN, START, and END from existing record if needed - if [ ${SVLEN} == "." ]; then - SVLEN=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf \ - | cut -f8 \ - | sed 's/\;/\n/g' \ - | fgrep SVLEN= \ - | cut -f2 -d\= ) - fi - if [ ${START} == "." ]; then - START=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf | cut -f2 ) - fi - if [ ${END} == "." 
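The RECLASSIFY handling here first backfills SVLEN/START/END from the existing record when the table holds '.', then rewrites END/SVTYPE/SVLEN/CPX_TYPE and scrubs the UNRESOLVED/EVENT tags via the layered sed substitutions that follow. In the pysam-based replacement script this presumably collapses to direct field assignment; a sketch against a pysam VariantRecord (how the new script actually integrates this is assumed, not shown here):

```python
def apply_reclassification(record, svtype, cpxtype, svlen, start, end):
    """record: pysam.VariantRecord. '.' table entries mean 'keep existing'."""
    if start != ".":
        record.pos = int(start)
    if end != ".":
        record.stop = int(end)  # updates INFO/END
    record.info["SVTYPE"] = svtype
    if svlen != ".":
        record.info["SVLEN"] = int(svlen)
    if cpxtype is not None:
        record.info["CPX_TYPE"] = cpxtype
    for key in ("UNRESOLVED", "UNRESOLVED_TYPE", "EVENT"):
        if key in record.info.keys():
            record.info.pop(key)
    record.alts = (f"<{svtype}>",)
```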
]; then - END=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf \ - | cut -f8 \ - | sed 's/\;/\n/g' \ - | fgrep END= \ - | cut -f2 -d\= ) - fi - #Modify info as needed - INFO=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf \ - | cut -f8 \ - | sed -r -e "s/^END=[^;]*;/END=$END;/" \ - | sed -r -e "s/;END=[^;]*;/;END=$END;/" \ - | sed -r -e "s/;END=[^;]*$/;END=$END/" \ - | sed -r -e "s/^SVTYPE=[^;]*;/SVTYPE=$svtype;/" \ - | sed -r -e "s/;SVTYPE=[^;]*;/;SVTYPE=$svtype;/" \ - | sed -r -e "s/;SVTYPE=[^;]*$/;SVTYPE=$svtype/" \ - | sed -r -e "s/^SVLEN=[^;]*;/SVLEN=$SVLEN;/" \ - | sed -r -e "s/;SVLEN=[^;]*;/;SVLEN=$SVLEN;/" \ - | sed -r -e "s/;SVLEN=[^;]*$/;SVLEN=$SVLEN/" \ - | sed -r -e "s/^CPX_TYPE=[^;]*;/CPX_TYPE=${cpxtype};/" \ - | sed -r -e "s/;CPX_TYPE=[^;]*;/;CPX_TYPE=${cpxtype};/" \ - | sed -r -e "s/;CPX_TYPE=[^;]*$/;CPX_TYPE=${cpxtype}/" \ - | sed -r -e 's/^UNRESOLVED;//' \ - | sed -r -e 's/;UNRESOLVED;/;/' \ - | sed -r -e 's/;UNRESOLVED$//' \ - | sed -r -e 's/^UNRESOLVED_TYPE=[^;]*;//' \ - | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*;/;/' \ - | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*$//' \ - | sed -r -e 's/^EVENT=[^;]*;//' \ - | sed -r -e 's/;EVENT=[^;]*;/;/' \ - | sed -r -e 's/;EVENT=[^;]*$//' ) - #Add/remove/modify CPX_TYPE, if needed - if [ ${svtype} == "CPX" ]; then - if [ $( echo "${INFO}" | fgrep "CPX_TYPE" | wc -l ) -eq 0 ]; then - INFO=$( echo ${INFO} | sed -e "s/\$/;CPX_TYPE=$cpxtype/" ) - fi - else - INFO=$( echo "${INFO}" \ - | sed -r -e 's/;CPX_TYPE=[^;]*;/;/' \ - | sed -r -e 's/;CPX_TYPE=[^;]*$//' ) - fi - #Add/remove/correct SOURCE, as needed - if [ ${SOURCE} == "." ]; then - INFO=$( echo ${INFO} \ - | sed -r -e "s/;SOURCE=[^;]*;/;/" \ - | sed -r -e "s/;SOURCE=[^;]*$//" ) - else - if [ $( echo ${INFO} | fgrep SOURCE | wc -l ) -gt 0 ]; then - INFO=$( echo ${INFO} \ - | sed -r -e "s/;SOURCE=[^;]*;/;SOURCE=${SOURCE};/" \ - | sed -r -e "s/;SOURCE=[^;]*$/;SOURCE=${SOURCE}/" ) - else - INFO=$( echo ${INFO} \ - | sed -r -e "s/$/;SOURCE=${SOURCE}/" ) - fi - fi - #Add/remove/correct CPX_INTERVALS, as needed - if [ ${cpxintervals} == "." 
]; then - INFO=$( echo ${INFO} \ - | sed -r -e "s/;CPX_INTERVALS=[^;]*;/;/" \ - | sed -r -e "s/;CPX_INTERVALS=[^;]*$//" ) - else - if [ $( echo ${INFO} | fgrep CPX_INTERVALS | wc -l ) -gt 0 ]; then - INFO=$( echo ${INFO} \ - | sed -r -e "s/;CPX_INTERVALS=[^;]*;/;CPX_INTERVALS=${cpxintervals};/" \ - | sed -r -e "s/;CPX_INTERVALS=[^;]*$/;CPX_INTERVALS=${cpxintervals}/" ) - else - INFO=$( echo ${INFO} \ - | sed -r -e "s/$/;CPX_INTERVALS=${cpxintervals}/" ) - fi - fi - #Print record - paste \ - <( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf | cut -f1-4 \ - | awk -v startpos=${START} -v OFS="\t" '{ print $1, startpos, $3, $4 }' ) \ - <( echo "<${svtype}>" ) \ - <( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf | cut -f6-7 ) \ - <( echo "${INFO}" ) \ - <( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf | cut -f9- ) - ;; - esac -done < <( fgrep -v "#" ${GTDIR}/final_variant_reassessment_table.txt ) \ - >> ${GTDIR}/cleaned_output.vcf -#Sort & compress final output -#Also strip CPX_TYPE and CPX_INTERVALS from any non-CPX variants -${BIN}/rm_cpx_info.py ${GTDIR}/cleaned_output.vcf /dev/stdout \ - | vcf-sort | bgzip -c \ - > ${OUTVCF} - - -#Clean up -rm -rf ${GTDIR} - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/rm_cpx_info.py b/src/sv-pipeline/04_variant_resolution/scripts/rm_cpx_info.py deleted file mode 100755 index dd4877dd9..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/rm_cpx_info.py +++ /dev/null @@ -1,58 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# - -""" -Strip CPX INFO fields from non-CPX variants -""" - -import argparse -import sys -import pysam - -# Cleanup function - - -def rmcpx(vcf, fout): - - # Iterate over records - for record in vcf: - - # Get basic info about record - svtype = record.info['SVTYPE'] - - # Clear CPX info fields for non-CPX variants - if svtype != "CPX" and svtype != "CTX": - for info in 'CPX_TYPE CPX_INTERVALS'.split(): - if info in record.info.keys(): - record.info.pop(info) - - # Write record to file - fout.write(record) - - -# Main block -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('vcf') - parser.add_argument('fout') - - args = parser.parse_args() - - if args.vcf in '- stdin'.split(): - vcf = pysam.VariantFile(sys.stdin) - else: - vcf = pysam.VariantFile(args.vcf) - - if args.fout in '- stdout'.split(): - fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header) - else: - fout = pysam.VariantFile(args.fout, 'w', header=vcf.header) - - rmcpx(vcf, fout) - - -if __name__ == '__main__': - main() diff --git a/wdl/GenotypeCpxCnvs.wdl b/wdl/GenotypeCpxCnvs.wdl index 398707f9b..ef327212c 100644 --- a/wdl/GenotypeCpxCnvs.wdl +++ b/wdl/GenotypeCpxCnvs.wdl @@ -106,7 +106,6 @@ workflow GenotypeCpxCnvs { File cpx_depth_gt_resolved_vcf = ParseGenotypes.cpx_depth_gt_resolved_vcf File cpx_depth_gt_resolved_vcf_idx = ParseGenotypes.cpx_depth_gt_resolved_vcf_idx File reclassification_table = ParseGenotypes.reclassification_table - File interval_genotype_counts_table = ParseGenotypes.gt_counts_table } } @@ -178,8 +177,8 @@ task ParseGenotypes { # be held in memory or disk while working, potentially in a form that takes up more space) Float input_size = size([vcf, intervals, genotypes, ped_file], "GiB") RuntimeAttr runtime_default = object { - mem_gb: 2.0 + 5.0 * input_size, - disk_gb: ceil(5 + input_size * 60), + mem_gb: 3.75, + disk_gb: ceil(10 + input_size * 4), cpu_cores: 1, preemptible_tries: 3, 
max_retries: 1, @@ -198,22 +197,30 @@ task ParseGenotypes { command <<< set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh \ - -R ~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt \ - -G ~{prefix}.CPXregenotyping_raw_genotype_counts_table.~{contig}.txt \ - ~{vcf} \ - ~{intervals} \ - ~{genotypes} \ - ~{ped_file} \ - ~{prefix}.postCPXregenotyping.~{contig}.vcf.gz + python /opt/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.py \ + --vcf ~{vcf} \ + --intervals ~{intervals} \ + --genotypes ~{genotypes} \ + --ped ~{ped_file} \ + --out out.vcf.gz \ + --reclassification-table ~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt + + # Compress for storage + gzip ~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt + + # Re-sort and index since coordinates may have changed + mkdir temp + bcftools sort \ + --temp-dir temp \ + --output-type z \ + --output-file ~{prefix}.postCPXregenotyping.~{contig}.vcf.gz \ + out.vcf.gz tabix ~{prefix}.postCPXregenotyping.~{contig}.vcf.gz >>> output { File cpx_depth_gt_resolved_vcf = "~{prefix}.postCPXregenotyping.~{contig}.vcf.gz" File cpx_depth_gt_resolved_vcf_idx = "~{prefix}.postCPXregenotyping.~{contig}.vcf.gz.tbi" - File reclassification_table = "~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt" - File gt_counts_table = "~{prefix}.CPXregenotyping_raw_genotype_counts_table.~{contig}.txt" + File reclassification_table = "~{prefix}.CPXregenotyping_reclassification_table.~{contig}.txt.gz" } }
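For reference, an argparse skeleton matching the flags the updated ParseGenotypes command passes to the new script; only the option names come from the WDL command block above, everything else is illustrative:

```python
import argparse

def parse_args(argv=None):
    """CLI for process_posthoc_cpx_depth_regenotyping.py as invoked by WDL."""
    parser = argparse.ArgumentParser(
        description="Parse depth regenotyping results for complex SVs")
    parser.add_argument("--vcf", required=True)
    parser.add_argument("--intervals", required=True)
    parser.add_argument("--genotypes", required=True)
    parser.add_argument("--ped", required=True)
    parser.add_argument("--out", required=True)
    parser.add_argument("--reclassification-table")
    return parser.parse_args(argv)
```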