From 80d9a8db47732fb5fc5ecb1807d8898c3b837b22 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Thu, 17 Oct 2024 16:49:17 -0400
Subject: [PATCH 01/28] port RefineComplexVariants from ManualReview/FilterComplex

---
 .../reformat_CPX_bed_and_generate_script.py | 474 ++++++++++
 wdl/CollectLargeCNVSupportForCPX.wdl        | 287 ++++++
 wdl/CollectPEMetricsForCPX.wdl              | 102 ++
 wdl/CollectPEMetricsPerBatchCPX.wdl         | 205 +++++
 wdl/RefineComplexVariants.wdl               | 870 ++++++++++++++++++
 5 files changed, 1938 insertions(+)
 create mode 100644 src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
 create mode 100644 wdl/CollectLargeCNVSupportForCPX.wdl
 create mode 100644 wdl/CollectPEMetricsForCPX.wdl
 create mode 100644 wdl/CollectPEMetricsPerBatchCPX.wdl
 create mode 100644 wdl/RefineComplexVariants.wdl

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
new file mode 100644
index 000000000..f05e52d9b
--- /dev/null
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -0,0 +1,474 @@
+#!/usr/bin/env python
+# This script takes in CPX SVs in bed format, reformats them into svelter format,
+# and generates the script used to collect PE metrics.
+
+def header_pos_readin(input_bed):
+    fin=os.popen(r'''zcat %s'''%(input_bed))
+    pin=fin.readline().strip().split()
+    out = {}
+    for i in range(len(pin)):
+        out[pin[i]] = i
+    fin.close()
+    return out
+
+def sample_pe_readin(sample_pe_file):
+    # sample_pe_file is a 2-column file: sample ID, then the path to that sample's PE metrics file
+    out = {}
+    fin=open(sample_pe_file)
+    for line in fin:
+        pin=line.strip().split()
+        if not pin[0] in out.keys():
+            out[pin[0]] = pin[1]
+    fin.close()
+    return out
+
+def SVID_sample_readin(input_bed,header_pos):
+    out={}
+    fin=os.popen(r'''zcat %s'''%(input_bed))
+    for line in fin:
+        pin=line.strip().split('\t')
+        if not pin[0][0]=='#':
+            out[pin[header_pos['name']]] = pin[header_pos['samples']]
+    fin.close()
+    return out
+
+def parse_segment(segment):
+    svtype, interval = segment.split('_')
+    chrom, coords = interval.split(':')
+    coord1, coord2 = coords.split('-')
+    return svtype, chrom, int(coord1), int(coord2)
+
+# segments: CPX_INTERVALS split by ','
+def extract_bp_list_for_inv_cnv_events(segments, cpx_type):
+    s0_svtype, s0_chrom, s0_c1, s0_c2 = parse_segment(segments[0])
+    breakpoints = [s0_chrom]+[s0_c1, s0_c2]
+    if not 'INVdup' in cpx_type:
+        for seg in segments[1:]:
+            seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg)
+            breakpoints.append(seg_c2)
+    else:
+        if cpx_type=='INVdup':
+            s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1])
+            if s1_c1 < breakpoints[2]:
+                breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]]
+        elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup':
+            s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2])
+            breakpoints += [s2_c1, s2_c2]
+    return breakpoints
+
+def extract_bp_list_for_ddup_events(coordinates, segments, small_sv_size_threshold):
+    #example of coordinates: ["chr1",1154129,1154133]
+    #example of segments: DUP_chr1:229987763-230011157
+    del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])]
+    # CW: this looks like it only gets the coordinates from the first DUP segment?
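+    # A worked trace (hypothetical values, reusing the examples in the comments
+    # above): with coordinates = ["chr1",1154129,1154133] and
+    # segments = ['DUP_chr1:229987763-230011157'], del_bp starts as
+    # ['chr1', 1154129, 1154133] and the loop below sets dup_bp to
+    # [229987763, 230011157] with INV_flag == 0. Since del_bp[2] < dup_bp[0]
+    # and the del_bp span (4 bp) is under small_sv_size_threshold, the event
+    # is encoded as ref 'ab' / alt 'bab': segment b (the duplicated interval)
+    # is also inserted at the del_bp site.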
+ #dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') + #dup_bp = [int(i) for i in dup_seg[1].split('-')] + #INV_flag = len([i for i in segments if "INV_" in i]) + # rewrite as: + INV_flag = 0 + for segment in segments: + if 'DUP_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + dup_bp = [seg_c1, seg_c2] + if 'INV_' in segment: + INV_flag = INV_flag + 1 + if 'DEL_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + del_bp = [seg_chrom, seg_c1, seg_c2] + #INV_flag == 0 : no INV involved in the insertion + #INV_flag > 0: INV involved + + if INV_flag==0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','cbc'] + else: + structures = ['ab','bab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','aba'] + else: + structures = ['ab','aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa'] + else: + return 'unresolved' + elif INV_flag>0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa^'] + else: + return 'unresolved' + return [breakpints, structures] + +def extract_bp_list_for_ins_idel(coordinates, segments, small_sv_size_threshold): + #example of coordinates: ["chr1",1154129,1154133] + #example of segments: DUP_chr1:229987763-230011157 + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INS_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + INV_flag = len([i for i in segments if "INV_" in i]) + #INV_flag == 0 : no INV involved in the insertion + #INV_flag > 0: INV involved + if INV_flag==0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','cbc'] + else: + structures = ['ab','bab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba'] + else: + structures = ['ab','aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa'] + elif INV_flag>0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + 
elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa^'] + return [breakpints, structures] + +def extract_bp_list_V4(coordinates, segments, small_sv_size_threshold): + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INV_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + INV_flag = 1 + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + return [breakpoints, structures] + +def is_interchromosomal(pin, header_pos): + chrom = pin[0] + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + return True + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if seg2[0] != chrom: + return True + return False + +def cpx_SV_readin(input_bed, header_pos, unresolved): + unresolved_svids = [] + fin=os.popen(r'''zcat %s'''%(input_bed)) + out = [] + small_sv_size_threshold = 250 + for line in fin: + pin=line.strip().split('\t') + # label pidups as unresolved since manual inspection indicated low quality + if not pin[0][0]=="#": + if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: + unresolved_svids.append(pin[header_pos['name']]) + elif not is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV','INVdup','delINVdel', 'delINVdup','dupINVdel','dupINVdup']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) + if pin[header_pos['CPX_TYPE']] == 'delINV': + ref_alt = ['ab','b^'] + if pin[header_pos['CPX_TYPE']] == 'INVdel': + ref_alt = ['ab','a^'] + if pin[header_pos['CPX_TYPE']] == 'dupINV': + ref_alt = ['ab','aba^'] + if pin[header_pos['CPX_TYPE']] == 'INVdup': + ref_alt = ['ab','b^ab'] + if pin[header_pos['CPX_TYPE']] == 'delINVdel': + ref_alt = ['abc','b^'] + if pin[header_pos['CPX_TYPE']] == 'delINVdup': + ref_alt = ['abc','c^bc'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdel': + ref_alt = ['abc','aba^'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdup': + ref_alt = ['abc','ac^b^a^c'] + elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) + if cpx_info == 'unresolved': + unresolved_svids.append(pin[header_pos['name']]) + continue + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + elif 
pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + else: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_V4(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + out.append([breakpoints, ref_alt,pin[3]]) + else: + continue + fin.close() + with open(unresolved, 'w') as unres: + for svid in unresolved_svids: + unres.write(svid + "\n") + return out + +def cpx_inter_chromo_SV_readin(input_bed, header_pos): + fin=os.popen(r'''zcat %s'''%(input_bed)) + out = [] + chr_list = ['chr'+str(i) for i in range(1,23)]+['chrX','chrY'] + for line in fin: + pin=line.strip().split('\t') + if not pin[0][0]=="#": + if is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + seg1 = pin[:3] + seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]] + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + ref_alt = [pin[header_pos['CPX_TYPE']]] + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'c^b_c'] + else: + ref_alt = ['a_b', 'b^a_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba^'] + else: + ref_alt = 
['a_b', 'a_ba^'] + out.append([bp, ref_alt, pin[3]]) + fin.close() + return out + +def cpx_sample_batch_readin(cpx_SV, SVID_sample,sample_pe, PE_evidence, out_file): + out = [] + flank_back = 1000 + flank_front = 100 + fo=open(out_file,'w') + for info in cpx_SV: + breakpints = info[0] + if info[1][0] == 'ab' and info[1][1]=='b^': #delINV + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='b^': #delINVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='c^bc': #delINVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aba^': #dupINV + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='ac^b^a^c': #dupINVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', 
'$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='aba^': #dupINVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='a^': #INVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='b^ab': #INVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aba': #dupINS + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='aba': #dupINSdel 
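+            # Note on the pattern used throughout this chain: each branch builds
+            # two shell commands (common_1/common_2, one per junction) that tabix
+            # a window around each breakpoint in the sample's PE metrics file,
+            # keep read pairs whose orientation and mate position match the
+            # expected junction (+/- then -/+ for the duplication-type junctions
+            # of this dupINSdel case), and append the hits, tagged with the SVID,
+            # to the PE evidence file. The 'PE_metrics' and 'sample' placeholders
+            # are substituted per carrier sample before the commands are written.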
+ common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aa': #dupdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='bab': #INSdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='cbc': #delINSdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='bb': #deldup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', 
'$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b','ba_b'] or info[1] == ['ab_c','cb_c']: #ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b','a_ba'] or info[1] == ['a_bc','a_ba']: #ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: #inverted insertion, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc','a_ba^']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', 
breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['CTX_PQ/QP']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['CTX_PP/QQ']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + else: + print(info) + samples = SVID_sample[info[2]] + if not samples =='' and not samples=='NA': + sample_list = samples.split(',') + pe_metrics_list = [sample_pe[samp] for samp in sample_list] + for num in range(len(sample_list)): + common_1[1] = pe_metrics_list[num] + common_2[1] = pe_metrics_list[num] + common_1[4] = sample_list[num] + common_2[4] = sample_list[num] + write_1 = ' '.join(common_1) + write_2 = ' '.join(common_2) + print(write_1, file=fo) + print(write_2, file=fo) + + fo.close() + return out + +def write_cpx_command(cpx_command, out_file): + fo=open(out_file,'w') + for i in cpx_command: + print(i, file=fo) + fo.close() + +def write_cpx_SVs(cpx_SV, out_file): + fo=open(out_file,'w') + for i in cpx_SV: + out = [i[2],':'.join([str(j) for j in i[0]])]+i[1] + print('\t'.join(out), file=fo) + fo.close() + +def main(): + """ + Command-line main block + """ + + # Parse command line arguments and options + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-i','--input', required=True, help='input file of CPX events in bed format') + parser.add_argument('-s','--sample_pe', required=True, help='2 column file with sample ID and PE metrics information') + parser.add_argument('-p','--pe_evidence', required=True, help='name of file to store collected PE metrics') + parser.add_argument('-c','--command_script', 
required=True, help='name of file for the script to collect PE evidence')
+    parser.add_argument('-r','--reformat_SV', required=True, help='reformatted SV in svelter format')
+    parser.add_argument('-u','--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround)')  # TODO
+    args = parser.parse_args()
+
+    input_bed = args.input
+    sample_pe_file = args.sample_pe
+    PE_evidence = args.pe_evidence
+    command_script = args.command_script
+    reformatted_SV = args.reformat_SV
+
+    header_pos = header_pos_readin(input_bed)
+    sample_pe = sample_pe_readin(sample_pe_file)
+    SVID_sample = SVID_sample_readin(input_bed, header_pos)
+
+    cpx_SV = cpx_SV_readin(input_bed, header_pos, args.unresolved)
+    cpx_inter_chromo_SV = cpx_inter_chromo_SV_readin(input_bed, header_pos)
+    write_cpx_SVs(cpx_SV+cpx_inter_chromo_SV, reformatted_SV)
+
+    cpx_sample_batch_readin(cpx_SV+cpx_inter_chromo_SV, SVID_sample, sample_pe, PE_evidence, command_script)
+
+import os
+import sys
+import argparse
+if __name__ == '__main__':
+    main()
+
diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl
new file mode 100644
index 000000000..ba925c9b1
--- /dev/null
+++ b/wdl/CollectLargeCNVSupportForCPX.wdl
@@ -0,0 +1,287 @@
+version 1.0
+
+import "Structs.wdl"
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+
+workflow CollectLargeCNVSupportForCPX {
+    input {
+        File cpx_ctx_bed
+        String prefix
+
+        File sample_depth_calls
+        Array[String] batch_name_list
+        Array[File] Depth_DEL_beds
+        Array[File] Depth_DUP_beds
+
+        String sv_base_mini_docker
+        String sv_pipeline_docker
+
+        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
+        RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
+        RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
+        RuntimeAttr? runtime_attr_concat_bed_Step1
+        RuntimeAttr? 
runtime_attr_concat_bed_Step2 + + } + + call GenerateCnvSegmentFromCpx { + input: + bed = cpx_ctx_bed, #input file including cpx calls in bed format; + sample_depth_calls = sample_depth_calls, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_generate_cnv_segments_from_cpx + } + + scatter (i in range(length(batch_name_list))){ + call ExtractCpxLgCnvByBatch { + input: + bed_gz = GenerateCnvSegmentFromCpx.cpx_lg_cnv, + batch_name = batch_name_list[i], + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_extract_cpx_lg_cnv_by_batch + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_del { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_del, + raw_depth_bed = Depth_DEL_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_dup { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_dup, + raw_depth_bed = Depth_DUP_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call MiniTasks.ConcatBeds as concat_beds_svtype { + input: + shard_bed_files = [seek_depth_supp_for_cpx_del.cpx_cnv_depth_supp, seek_depth_supp_for_cpx_dup.cpx_cnv_depth_supp], + prefix = "~{batch_name_list[i]}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step1 + } + } + + + call MiniTasks.ConcatBeds as concat_beds_across_batches { + input: + shard_bed_files = concat_beds_svtype.merged_bed_file, + prefix = "~{prefix}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step2 + } + + output { + File lg_cnv_depth_supp = concat_beds_across_batches.merged_bed_file + } +} + +task SeekDepthSuppForCpx { + input { + File cpx_lg_cnv + File raw_depth_bed + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    String prefix = basename(cpx_lg_cnv, '.bed.gz')
+
+    output {
+        File cpx_cnv_depth_supp = "~{prefix}.depth_supp.bed.gz"
+    }
+    command <<<
+
+        set -eo pipefail
+
+        zcat ~{cpx_lg_cnv} | awk '{print $6}' | sort | uniq > sample_list.tsv
+
+        echo -e '#chr\tpos\tend\tSVTYPE\tSVID\tsample\tbatch\tdepth_cov' > ~{prefix}.depth_supp.bed
+
+        while read sample_name; do
+            zcat ~{cpx_lg_cnv} | awk -v sample="$sample_name" '{if ($6==sample) print}' > query.bed
+            zcat ~{raw_depth_bed} | awk -v sample="$sample_name" '{if ($5==sample) print}' > ref.bed
+            bedtools coverage -a query.bed -b ref.bed \
+                | awk '{print $1,$2,$3,$4,$5,$6,$7,$NF}' \
+                | sed -e 's/ /\t/g' \
+                >> ~{prefix}.depth_supp.bed
+        done < sample_list.tsv
+
+        bgzip ~{prefix}.depth_supp.bed
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+
+task ExtractCpxLgCnvByBatch {
+    input {
+        File bed_gz
+        String batch_name
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File lg_cnv_del = "~{batch_name}.lg_cnv.DEL.bed.gz"
+        File lg_cnv_dup = "~{batch_name}.lg_cnv.DUP.bed.gz"
+    }
+    command <<<
+
+        set -eo pipefail
+
+        zcat ~{bed_gz} | awk '{if ($4=="DEL" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DEL.bed
+        zcat ~{bed_gz} | awk '{if ($4=="DUP" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DUP.bed
+        bgzip ~{batch_name}.lg_cnv.DEL.bed
+        bgzip ~{batch_name}.lg_cnv.DUP.bed
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task GenerateCnvSegmentFromCpx {
+    input {
+        File bed
+        # sample depth calls is a map of sample name to batch ID
+        File sample_depth_calls
+        String sv_pipeline_docker
+        Int min_depth_size = 5000
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: 10,
+        boot_disk_gb: 30,
+        preemptible_tries: 1,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+
+    String prefix = basename(bed, ".bed.gz")
+
+    command <<<
+        set -euo pipefail
+
+        python <<CODE
+import os
+
+def sample_depth_calls_readin(sample_depth):
+    # sample_depth is a 2-column file mapping sample ID to batch name; values
+    # are kept as lists so callers can take element [0]
+    # (body is an assumed implementation reconstructed from its call sites)
+    out = {}
+    fin = open(sample_depth)
+    for line in fin:
+        pin = line.strip().split()
+        if not pin[0] in out.keys():
+            out[pin[0]] = pin[1:]
+    fin.close()
+    return out
+
+def readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size):
+    # Collect the DEL/DUP segments of each CPX call as
+    # [chrom, start, end, svtype, SVID, sample, batch]; the segment parsing is
+    # an assumed implementation reconstructed to match the loop below
+    CPX_CNV = []
+    f_bed = os.popen(r'''zcat %s'''%(bed_input))
+    for line in f_bed:
+        pin = line.strip().split('\t')
+        if pin[0][0]=='#':
+            pos_name = pin.index('name')
+            pos_SAMPLES = pin.index('samples')
+            pos_CPX_INTERVALS = pin.index('CPX_INTERVALS')
+        else:
+            interval = []
+            if not pin[pos_CPX_INTERVALS] in ['NA', '']:
+                for seg in pin[pos_CPX_INTERVALS].split(','):
+                    svtype = seg.split('_')[0]
+                    if svtype in ['DEL', 'DUP']:
+                        chrom = seg.split('_')[1].split(':')[0]
+                        coords = seg.split('_')[1].split(':')[1].split('-')
+                        interval.append([chrom, int(coords[0]), int(coords[1]), svtype, pin[pos_name]])
+            if len(interval)>0:
+                for i in interval:
+                    if i[2]-i[1]>=min_depth_size:
+                        sample_names = pin[pos_SAMPLES].split(',')
+                        if i[3]=="DEL":
+                            for j in sample_names:
+                                CPX_CNV.append(i+[j, sample_depth_hash[j][0]])
+                        if i[3]=="DUP":
+                            for j in sample_names:
+                                CPX_CNV.append(i+[j, sample_depth_hash[j][0]])
+    f_bed.close()
+    return CPX_CNV
+
+def write_cpx_cnv(CPX_CNV, fileout):
+    fo=open(fileout, 'w')
+    for info in CPX_CNV:
+        print('\t'.join([str(i) for i in info]), file=fo)
+    fo.close()
+
+bed_input = "~{bed}"
+fileout = "~{prefix}.lg_CNV.bed"
+sample_depth = "~{sample_depth_calls}"
+min_depth_size = ~{min_depth_size}
+sample_depth_hash = sample_depth_calls_readin(sample_depth)
+CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size)
+write_cpx_cnv(CPX_CNV, fileout)
+CODE
+
+        bgzip "~{prefix}.lg_CNV.bed"
+    >>>
+
+    output {
+        File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
diff --git a/wdl/CollectPEMetricsForCPX.wdl b/wdl/CollectPEMetricsForCPX.wdl
new file mode 100644
index 000000000..746f6a907
--- /dev/null
+++ b/wdl/CollectPEMetricsForCPX.wdl
@@ -0,0 +1,102 @@
+version 1.0
+
+import "Structs.wdl"
+import "CollectPEMetricsPerBatchCPX.wdl" as per_batch
+
+workflow CollectPEMetricsForCPX {
+    input {
+        Array[String] batch_name_list
+        Array[File] PE_metrics
+        Array[File] PE_metrics_idxes
+        File PE_collect_script
+        String prefix
+        Int n_per_split
+        String sv_pipeline_docker
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_collect_pe
+        RuntimeAttr? runtime_attr_split_script
+        RuntimeAttr? runtime_attr_calcu_pe_stat
+        RuntimeAttr? 
runtime_attr_concat_evidence + } + + scatter (i in range(length(batch_name_list))) { + call per_batch.CollectPEMetricsPerBatchCPX { + input: + n_per_split = n_per_split, + prefix = "~{prefix}.~{batch_name_list[i]}", + batch_name = batch_name_list[i], + PE_metric = PE_metrics[i], + PE_metrics_idx = PE_metrics_idxes[i], + PE_collect_script = PE_collect_script, + sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_collect_pe = runtime_attr_collect_pe, + runtime_attr_split_script = runtime_attr_split_script, + runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat, + runtime_attr_concat_evidence = runtime_attr_concat_evidence + } + } + + call per_batch.ConcatEvidences { + input: + evidences = CollectPEMetricsPerBatchCPX.evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_evidence + } + + call CalcuPEStat { + input: + evidence = ConcatEvidences.concat_evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_calcu_pe_stat + } + + output { + File evidence = ConcatEvidences.concat_evidence + File evi_stat = CalcuPEStat.evi_stat + } +} + + +task CalcuPEStat { + input { + File evidence + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + zcat ~{evidence} | cut -f3,6- | uniq -c > ~{prefix}.evi_stat + bgzip ~{prefix}.evi_stat + >>> + + output { + File evi_stat = "~{prefix}.evi_stat.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} diff --git a/wdl/CollectPEMetricsPerBatchCPX.wdl b/wdl/CollectPEMetricsPerBatchCPX.wdl new file mode 100644 index 000000000..655959748 --- /dev/null +++ b/wdl/CollectPEMetricsPerBatchCPX.wdl @@ -0,0 +1,205 @@ +version 1.0 + +import "Structs.wdl" +workflow CollectPEMetricsPerBatchCPX { + input { + Int n_per_split + String batch_name + File PE_metric + File PE_metrics_idx + File PE_collect_script + String prefix + String sv_base_mini_docker + String sv_pipeline_docker + RuntimeAttr? runtime_attr_collect_pe + RuntimeAttr? runtime_attr_concat_evidence + RuntimeAttr? runtime_attr_calcu_pe_stat + RuntimeAttr? 
runtime_attr_split_script
+    }
+
+    call SplitScripts {
+        input:
+            script = PE_collect_script,
+            n_per_split = n_per_split,
+            batch_name = batch_name,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_override = runtime_attr_split_script
+    }
+
+    scatter (script in SplitScripts.script_splits) {
+        call CollectPEMetrics {
+            input:
+                batch_name = batch_name,
+                PE_metric = PE_metric,
+                PE_metrics_idx = PE_metrics_idx,
+                PE_collect_script = script,
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_collect_pe
+        }
+    }
+
+    call ConcatEvidences {
+        input:
+            evidences = CollectPEMetrics.evidence,
+            prefix = prefix,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_override = runtime_attr_concat_evidence
+    }
+
+    output {
+        File evidence = ConcatEvidences.concat_evidence
+    }
+}
+
+
+
+# collect PE metrics
+task CollectPEMetrics {
+    input {
+        String batch_name
+        File PE_metric
+        File PE_metrics_idx
+        File PE_collect_script
+        String sv_pipeline_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: ceil(30.0 + size(PE_metric, "GiB") * 3),
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    command <<<
+        set -euo pipefail
+        mkdir PE_metrics/
+        gsutil cp ~{PE_metric} ./
+        gsutil cp ~{PE_metrics_idx} ./
+        echo "Metrics successfully downloaded."
+        echo "Starting the evidence collection ..."
+
+        if [ $(wc -c < "~{PE_collect_script}") -gt 0 ]; then
+            bash ~{PE_collect_script}
+        fi
+
+        touch ~{batch_name}.evidence
+        touch ~{batch_name}.0.PE_evidences
+        for peEvFile in *.PE_evidences
+        do
+            cat ${peEvFile} >> ~{batch_name}.evidence
+        done
+
+        bgzip ~{batch_name}.evidence
+
+    >>>
+
+    output {
+        File evidence = "~{batch_name}.evidence.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task ConcatEvidences {
+    input {
+        Array[File] evidences
+        String prefix
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    command <<<
+        set -euo pipefail
+
+        while read SPLIT; do
+            zcat $SPLIT
+        done < ~{write_lines(evidences)} \
+            | bgzip -c \
+            > ~{prefix}.evidence.gz
+
+    >>>
+
+    output {
+        File concat_evidence = "~{prefix}.evidence.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task SplitScripts {
+    input {
+        File script
+        String batch_name
+        Int n_per_split
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        Array[File] script_splits = glob("collect_PE_evidences.*")
+    }
+    command <<<
+
+        set -eo pipefail
+
+        awk '{if ($2~"~{batch_name}") print}' ~{script} > tmp_metrics.sh
+
+        if [ $(wc -c < "tmp_metrics.sh") -gt 0 ]; then
+            split --additional-suffix ".sh" -l ~{n_per_split} -a 6 tmp_metrics.sh collect_PE_evidences.
+        else
+            touch collect_PE_evidences.aaaaaa.sh
+        fi
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
new file mode 100644
index 000000000..7d773430a
--- /dev/null
+++ b/wdl/RefineComplexVariants.wdl
@@ -0,0 +1,870 @@
+version 1.0
+
+import "Structs.wdl"
+import "HailMerge.wdl" as hail
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+import "CollectPEMetricsForCPX.wdl" as collect_pe_metrics_for_cpx
+import "CollectLargeCNVSupportForCPX.wdl" as collect_lg_cnv_supp_for_cpx
+import "Utils.wdl" as util
+
+workflow RefineComplexVariants {
+    input {
+        File vcf
+        File vcf_idx
+
+        Array[String] batch_name_list
+        Array[File] PE_metrics
+        Array[File] PE_metrics_idxes
+        Array[File] Depth_DEL_beds
+        Array[File] Depth_DUP_beds
+
+        String prefix
+        Int n_per_split
+        File sample_PE_metrics
+        File sample_depth_calls
+        File? script_generate_cpx_review_script
+
+        Boolean use_hail = false
+        String? gcs_project # required if use_hail = true
+
+        String sv_base_mini_docker
+        String sv_pipeline_docker
+        String linux_docker
+
+        RuntimeAttr? runtime_attr_vcf2bed
+        RuntimeAttr? runtime_attr_collect_pe
+        RuntimeAttr? runtime_attr_scatter_vcf
+        RuntimeAttr? runtime_attr_split_script
+        RuntimeAttr? runtime_attr_calcu_pe_stat
+        RuntimeAttr? runtime_attr_split_cpx_ctx
+        RuntimeAttr? runtime_attr_concat_evidence
+        RuntimeAttr? runtime_attr_concat
+        RuntimeAttr? runtime_attr_preconcat
+        RuntimeAttr? runtime_attr_hail_merge
+        RuntimeAttr? runtime_attr_fix_header
+        RuntimeAttr? runtime_attr_generate_cpx_review_script
+        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
+        RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
+        RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
+        RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
+        RuntimeAttr? runtime_attr_concat_bed_Step1
+        RuntimeAttr? runtime_attr_concat_bed_Step2
+        RuntimeAttr? runtime_attr_calcu_cpx_evidences
+    }
+
+
+    call MiniTasks.ScatterVcf {
+        input:
+            vcf = vcf,
+            prefix = prefix,
+            records_per_shard = n_per_split,
+            sv_pipeline_docker = sv_pipeline_docker,
+            runtime_attr_override = runtime_attr_scatter_vcf
+    }
+
+    scatter (i in range(length(ScatterVcf.shards))) {
+
+        call util.VcfToBed {
+            input:
+                vcf_file = ScatterVcf.shards[i],
+                args = "-i ALL --include-filters",
+                variant_interpretation_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_vcf2bed
+        }
+
+        call SplitCpxCtx {
+            input:
+                bed = VcfToBed.bed_output,
+                sv_base_mini_docker = sv_base_mini_docker,
+                runtime_attr_override = runtime_attr_split_cpx_ctx
+        }
+
+        call collect_lg_cnv_supp_for_cpx.CollectLargeCNVSupportForCPX {
+            input:
+                cpx_ctx_bed = SplitCpxCtx.cpx_ctx_bed,
+                prefix = "~{prefix}.~{i}",
+
+                sample_depth_calls = sample_depth_calls,
+                batch_name_list = batch_name_list,
+                Depth_DEL_beds = Depth_DEL_beds,
+                Depth_DUP_beds = Depth_DUP_beds,
+
+                sv_base_mini_docker = sv_base_mini_docker,
+                sv_pipeline_docker = sv_pipeline_docker,
+
+                runtime_attr_concat_bed_Step1 = runtime_attr_concat_bed_Step1,
+                runtime_attr_concat_bed_Step2 = runtime_attr_concat_bed_Step2,
+                runtime_attr_seek_depth_supp_for_cpx = runtime_attr_seek_depth_supp_for_cpx,
+                runtime_attr_extract_cpx_lg_cnv_by_batch = runtime_attr_extract_cpx_lg_cnv_by_batch,
+                runtime_attr_generate_cnv_segments_from_cpx = runtime_attr_generate_cnv_segments_from_cpx
+        }
+
+        call GenerateCpxReviewScript {
+            input:
+                bed = SplitCpxCtx.cpx_ctx_bed,
+                sample_PE_metrics = sample_PE_metrics,
+                script = script_generate_cpx_review_script,
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_generate_cpx_review_script
+        }
+
+        call collect_pe_metrics_for_cpx.CollectPEMetricsForCPX {
+            input:
+                batch_name_list = batch_name_list,
+                PE_metrics = PE_metrics,
+                PE_metrics_idxes = PE_metrics_idxes,
+                PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script,
+                prefix = "~{prefix}.~{i}",
+                n_per_split = n_per_split,
+                sv_pipeline_docker = sv_pipeline_docker,
+                sv_base_mini_docker = sv_base_mini_docker,
+                runtime_attr_collect_pe = runtime_attr_collect_pe,
+                runtime_attr_split_script = runtime_attr_split_script,
+                runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat,
+                runtime_attr_concat_evidence = runtime_attr_concat_evidence
+        }
+
+        call CalculateCpxEvidences {
+            input:
+                PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script,
+                PE_supp = CollectPEMetricsForCPX.evi_stat,
+                depth_supp = CollectLargeCNVSupportForCPX.lg_cnv_depth_supp,
+                prefix = "~{prefix}.~{i}",
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_calcu_cpx_evidences
+        }
+
+        call CalculateCtxEvidences {
+            input:
+                PE_collect_script = 
GenerateCpxReviewScript.pe_evidence_collection_script, + PE_supp = CollectPEMetricsForCPX.evi_stat, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_calcu_cpx_evidences + } + + call ReviseVcf { + input: + vcf_file = ScatterVcf.shards[i], + CPX_manual = CalculateCpxEvidences.manual_revise_CPX_results, + CTX_manual = CalculateCtxEvidences.manual_revise_CTX_results, + unresolved_svids = GenerateCpxReviewScript.unresolved_svids, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker + } + } + + if (use_hail) { + call hail.HailMerge { + input: + vcfs=ReviseVcf.revised_vcf, + prefix="~{prefix}.cpx_filtered", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_attr_preconcat, + runtime_override_hail_merge=runtime_attr_hail_merge, + runtime_override_fix_header=runtime_attr_fix_header + } + } + + if (!use_hail) { + call MiniTasks.ConcatVcfs { + input: + vcfs=ReviseVcf.revised_vcf, + vcfs_idx=ReviseVcf.revised_vcf_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.cpx_filtered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat + } + } + + call MiniTasks.ConcatHeaderedTextFiles { + input: + text_files = CalculateCpxEvidences.manual_revise_CPX_results, + output_filename = "~{prefix}.CPX_evidence.txt" , + linux_docker = linux_docker + + } + + File reviewed_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File reviewed_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) + + output { + File revised_output_vcf = reviewed_vcf + File revised_output_vcf_idx = reviewed_vcf_idx + File cpx_evidences = ConcatHeaderedTextFiles.out + } +} + + +task ReviseVcf { + input { + File vcf_file + File CPX_manual + File CTX_manual + File unresolved_svids + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7.5, + disk_gb: ceil(5.0 + size(vcf_file, "GB")*3), + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + set -euo pipefail + + python < 0: + return int(median(quals)) + +def CPX_manual_readin(CPX_manual): + out={} + fin=open(CPX_manual) + for line in fin: + pin=line.strip().split() + if not pin[0] in out.keys(): + out[pin[0]] = {} + out[pin[0]][pin[1]] = pin[2:] + fin.close() + return out + +def CTX_manual_readin(CTX_manual): + fin=open(CTX_manual) + out={} + for line in fin: + pin=line.strip().split('\t') + if not pin[0] in out.keys(): + out[pin[0]] = {} + if not pin[1] in out[pin[0]].keys(): + out[pin[0]][pin[1]] = pin[2:] + fin.close() + return out + + +def unresolved_readin(unresolved_svids): + svids = set() + with open(unresolved_svids, 'r') as inp: + for line in inp: + svids.add(line.strip()) + return svids + + +def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CTX_manual): + fin=pysam.VariantFile(vcf_input) + #revise vcf header + header = fin.header + fo=pysam.VariantFile(vcf_output, 'w', header = header) + for record in fin: + if record.id in unresolved_svids: + if 'PASS' in record.filter: + record.filter.clear() + record.filter.add('UNRESOLVED') + #label CPX with manual review results: + elif record.id in hash_CPX_manual.keys(): + unresolve_rec = 0 + for sample in hash_CPX_manual[record.id].keys(): + if sample in record.samples.keys(): + if hash_CPX_manual[record.id][sample][0] == 'no_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + elif hash_CPX_manual[record.id][sample][0] == 'low_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + elif hash_CPX_manual[record.id][sample][0] == 'partial_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + unresolve_rec+=1 + elif hash_CPX_manual[record.id][sample][0] == 'high_PE': + if hash_CPX_manual[record.id][sample][1] in ["lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + if not unresolve_rec/len(hash_CPX_manual[record.id].keys())<.5: + if 'PASS' in record.filter: + record.filter.clear() + record.filter.add('UNRESOLVED') + #label CTX with manual review results: + elif record.id in hash_CTX_manual.keys(): + if 'NA' in hash_CTX_manual[record.id].keys(): + record.filter.add('UNRESOLVED') + else: + for sample in hash_CTX_manual[record.id].keys(): + if sample in record.samples.keys(): + if hash_CTX_manual[record.id][sample][1] == 'Not_Enough_PE_Pairs': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'PARTIAL_PE': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'unbalanced_paternal_transmission_of_tloc,2nd_junction_is_missing': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'NON_SPECIFIC': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'Not_Enough_Coordinate_Span': + 
record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'Interrupted_PE_Patterns_when_sorted_on_2nd_breakpoint': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'CTX_with_DEL': + record.stop = int(hash_CTX_manual[record.id][sample][0].split(':')[1].split('-')[1]) + # revisions to insertion-type complex events + if record.info['SVTYPE']=="CPX" and record.info['CPX_TYPE'] in ['dDUP','dDUP_iDEL','INS_iDEL']: + if record.info['CPX_TYPE']=='INS_iDEL': + record.info['CPX_INTERVALS'] = ','.join([x for x in record.info['CPX_INTERVALS']] + [record.info['SOURCE']]) + del record.info['SOURCE'] + elif record.info['SVTYPE']=="INS" and 'SOURCE' in record.info.keys() and "INV" in record.info['SOURCE']: + record.info['SVTYPE']="CPX" + record.alts = ('',) + del_section = record.stop - record.pos + if del_section<50: + record.info['CPX_TYPE']="dDUP" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE'] + else: + record.info['CPX_TYPE']="dDUP_iDEL" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE']+','+"DEL_"+record.chrom+":"+str(record.pos)+'-'+str(record.stop) + del record.info['SOURCE'] + fo.write(record) # write out every record that was in the input - NCR will remove ones with no carriers left + fin.close() + fo.close() + +import os +import sys +from numpy import median +import pysam +import argparse + +#Define global variables +NULL_GTs = [(None, None), (None, )] +REF_GTs = [(0, 0), (0, ), (None, 2)] +NULL_and_REF_GTs = NULL_GTs + REF_GTs +HET_GTs = [(0, 1), (None, 1), (None, 3)] + +unresolved_svids = unresolved_readin("~{unresolved_svids}") +hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") +hash_CTX_manual = CTX_manual_readin("~{CTX_manual}") +print(len(hash_CPX_manual.keys())) +revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids) + +CODE + + tabix ~{prefix}.Manual_Revised.vcf.gz + >>> + + output { + File revised_vcf = "~{prefix}.Manual_Revised.vcf.gz" + File revised_vcf_idx = "~{prefix}.Manual_Revised.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task SplitCpxCtx { + input { + File bed + String sv_base_mini_docker + RuntimeAttr? 
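
The per-sample branches in revise_vcf above collapse to a small decision table: the weaker PE tiers (no_PE, low_PE, partial_PE) lose the genotype whenever depth support is absent or unknown, while high_PE survives an unknown (NA) depth label. A distilled sketch of that rule, with the label strings copied from the code above (the partial_PE contribution to the record-level UNRESOLVED fraction is omitted here):

    # Strings mirroring ReviseVcf: depth labels that fail to corroborate a call.
    LACKS_DEPTH = {"NA", "lack_depth", "lack_depth,lack_depth",
                   "lack_depth,depth", "depth,lack_depth"}

    def should_null_gt(pe_supp: str, depth_supp: str) -> bool:
        """Return True if the sample's GT should be reset to ./. ."""
        if pe_supp in ("no_PE", "low_PE", "partial_PE"):
            return depth_supp in LACKS_DEPTH
        if pe_supp == "high_PE":
            # strong PE support survives an NA depth label, unlike the weaker tiers
            return depth_supp in (LACKS_DEPTH - {"NA"})
        return False

    assert should_null_gt("no_PE", "NA")
    assert not should_null_gt("high_PE", "NA")
    assert should_null_gt("high_PE", "lack_depth,depth")
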
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7.5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -eu + + zcat ~{bed} | head -1 > ~{prefix}.cpx_ctx.bed + + filterColumn=$(zcat ~{bed} | head -1 | tr "\t" "\n" | awk '$1=="FILTER" {print NR}') + + set -o pipefail + + zcat ~{bed} | awk 'NR > 1' | { grep CPX || true; } | awk -v filter_column=${filterColumn} '$filter_column !~ /UNRESOLVED/' >> ~{prefix}.cpx_ctx.bed + + zcat ~{bed} | awk 'NR > 1' | { grep CTX || true; } >> ~{prefix}.cpx_ctx.bed + + # INS with INV in SOURCE - will be converted to CPX later so need to evaluate evidence + zcat ~{bed} | awk 'NR > 1' | { grep INS || true; } | { grep INV || true; } >> ~{prefix}.cpx_ctx.bed + + bgzip ~{prefix}.cpx_ctx.bed + >>> + + output { + File cpx_ctx_bed = "~{prefix}.cpx_ctx.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCpxReviewScript { + input { + File bed + File sample_PE_metrics + File? script + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + python ~{default="/opt/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py" script} \ + -i ~{bed} \ + -s ~{sample_PE_metrics} \ + -p CPX_CTX_disINS.PASS.PE_evidences \ + -c collect_PE_evidences.CPX_CTX_disINS.PASS.sh \ + -r ~{prefix}.svelter \ + -u ~{prefix}.unresolved_svids.txt + + >>> + + output { + File pe_evidence_collection_script = "collect_PE_evidences.CPX_CTX_disINS.PASS.sh" + File svelter = "~{prefix}.svelter" + File unresolved_svids = "~{prefix}.unresolved_svids.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCnvSegmentFromCpx { + input { + File bed + File sample_depth_calls + String sv_pipeline_docker + RuntimeAttr? 
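
SplitCpxCtx above keeps three classes of records for PE review: CPX calls not already filtered as UNRESOLVED, all CTX calls, and INS records whose line mentions INV (the candidates ReviseVcf converts to CPX). A rough Python equivalent of the awk/grep chain; note the original makes three passes over the bed, so this single-pass sketch assumes no row matches more than one class:

    import gzip

    def select_cpx_ctx(bed_gz):
        # yield the header, then rows needing PE review, per the shell logic above
        with gzip.open(bed_gz, "rt") as fin:
            header = fin.readline().rstrip("\n").split("\t")
            yield header
            filter_idx = header.index("FILTER")
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if "CPX" in line and "UNRESOLVED" not in fields[filter_idx]:
                    yield fields  # resolved complex events
                elif "CTX" in line:
                    yield fields  # translocations
                elif "INS" in line and "INV" in line:
                    yield fields  # INS with an inverted SOURCE, recast as CPX later
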
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + python <0: + for i in interval: + if i[2]-i[1]>4999: + sample_names = pin[5].split(',') + if i[3]=="DEL": + for j in sample_names: + CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + if i[3]=="DUP": + for j in sample_names: + CPX_CNV.append(i+[j, sample_depth_hash[j][1]]) + f_bed.close() + return CPX_CNV + + def write_cpx_cnv(CPX_CNV, fileout): + fo=open(fileout, 'w') + for info in CPX_CNV: + print('\t'.join(info), file=fo) + fo.close() + + bed_input = "~{bed}" + fileout = "~{prefix}.lg_CNV.bed" + + sample_depth_hash = sample_depth_calls_readin(sample_depth) + CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash) + write_cpx_cnv(CPX_CNV, fileout) + CODE + + bgzip "~{prefix}.lg_CNV.bed" + >>> + + output { + File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCpxEvidences { + input { + File PE_collect_script + File PE_supp + File depth_supp + String prefix + String sv_pipeline_docker + RuntimeAttr? 
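
Parts of the GenerateCnvSegmentFromCpx heredoc above were lost in extraction, but the surviving loop pins down the rule: rearranged DEL/DUP intervals longer than 4999 bp fan out into one row per carrier sample, keyed to that sample's batch-specific depth calls. A compact sketch of the transform, with the field layout and helper name assumed:

    def cpx_intervals_to_cnv_rows(cpx_intervals, samples, min_size=5000):
        # cpx_intervals like "DUP_chr1:100-200,DEL_chr2:1000-90000" (CPX_INTERVALS)
        rows = []
        for seg in cpx_intervals.split(","):
            svtype, interval = seg.split("_", 1)
            chrom, coords = interval.split(":")
            start, end = (int(x) for x in coords.split("-"))
            if svtype in ("DEL", "DUP") and end - start >= min_size:
                rows.extend([chrom, start, end, svtype, s] for s in samples)
        return rows

    rows = cpx_intervals_to_cnv_rows("DUP_chr1:100-200,DEL_chr2:1000-90000", ["s1", "s2"])
    assert len(rows) == 2 and all(r[3] == "DEL" for r in rows)  # only the 89 kb DEL survives
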
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -eu + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep -v "_CTX_"| uniq > sample_SVID.tsv + + set -euo pipefail + + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>1 and PE_supp[svid][sample][1][0]>1: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def add_depth_supp(sample_svid_pe, Depth_supp): + for svid in sample_svid_pe.keys(): + if svid in Depth_supp.keys(): + for sample in sample_svid_pe[svid].keys(): + if sample in Depth_supp[svid].keys(): + depth_supp = 'lack_depth' + if Depth_supp[svid][sample][0]>.5: + depth_supp = 'depth' + if len(Depth_supp[svid][sample])>1: + if not Depth_supp[svid][sample][1]>.5: + depth_supp+=',lack_depth' + else: + if len(Depth_supp[svid][sample])>1: + if Depth_supp[svid][sample][1]>.5: + depth_supp+=',depth' + sample_svid_pe[svid][sample].append(depth_supp) + else: + sample_svid_pe[svid][sample].append('NA') + else: + for sample in sample_svid_pe[svid].keys(): + sample_svid_pe[svid][sample].append('NA') + return sample_svid_pe + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts','depth_supp']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("~{PE_supp}") + Depth_supp = Depth_supp_readin("~{depth_supp}") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + sample_svid_pe_depth = add_depth_supp(sample_svid_pe, Depth_supp) + write_pe_depth_supp(sample_svid_pe_depth, "~{prefix}.manual_revise.CPX_results") + + CODE + >>> + + output { + File manual_revise_CPX_results = "~{prefix}.manual_revise.CPX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCtxEvidences { + input { + File PE_collect_script + File PE_supp + String prefix + String sv_pipeline_docker + RuntimeAttr? 
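
CalculateCpxEvidences reduces the PE counts collected at an event's two junctions to a four-level label, then appends a comma-joined depth verdict per CNV interval, where a copy-state fraction above 0.5 counts as support. The CTX task that follows applies the same scheme with a stricter high_PE cutoff in this version (> 2 rather than > 1). A sketch of both labelers:

    def pe_label(first_junction, second_junction, high_cutoff=1):
        # high_cutoff is 1 for CPX and 2 for CTX in this patch
        if first_junction > high_cutoff and second_junction > high_cutoff:
            return "high_PE"
        if first_junction > 0 and second_junction > 0:
            return "low_PE"
        if first_junction > 0 or second_junction > 0:
            return "partial_PE"
        return "no_PE"

    def depth_label(fractions):
        # one copy-state fraction per rearranged CNV interval
        return ",".join("depth" if f > 0.5 else "lack_depth" for f in fractions)

    assert pe_label(3, 0) == "partial_PE"
    assert depth_label([0.9, 0.2]) == "depth,lack_depth"
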
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + + set -eu + + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep "_CTX_"| uniq > sample_SVID.tsv + + set -eu + + zcat ~{PE_supp} | grep "_CTX_"| uniq > PE_supp.tsv + + set -euo pipefail + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>2 and PE_supp[svid][sample][1][0]>2: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("PE_supp.tsv") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + write_pe_depth_supp(sample_svid_pe, "~{prefix}.manual_revise.CTX_results") + + CODE + >>> + + output { + File manual_revise_CTX_results = "~{prefix}.manual_revise.CTX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + From a550faa05c013d9cdd986e38ef359384a8028c0a Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 17:35:16 -0400 Subject: [PATCH 02/28] remove dependency on manually created files --- wdl/CollectLargeCNVSupportForCPX.wdl | 33 +++-- wdl/RefineComplexVariants.wdl | 186 +++++++++++---------------- 2 files changed, 92 insertions(+), 127 deletions(-) diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl index ba925c9b1..9675bac4d 100644 --- a/wdl/CollectLargeCNVSupportForCPX.wdl +++ b/wdl/CollectLargeCNVSupportForCPX.wdl @@ -8,7 +8,7 @@ workflow CollectLargeCNVSupportForCPX { File cpx_ctx_bed String prefix - File sample_depth_calls + File sample_batch_pe_map Array[String] batch_name_list Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds @@ -27,7 +27,7 @@ workflow CollectLargeCNVSupportForCPX { call GenerateCnvSegmentFromCpx { input: bed = cpx_ctx_bed, #input file including cpx calls in bed format; - sample_depth_calls = sample_depth_calls, + sample_batch_pe_map = sample_batch_pe_map, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cnv_segments_from_cpx } @@ -181,8 +181,7 @@ task ExtractCpxLgCnvByBatch { task GenerateCnvSegmentFromCpx { input { File bed - # sample depth calls is map of sample name to batch ID - File sample_depth_calls + File sample_batch_pe_map String sv_pipeline_docker Int min_depth_size = 5000 
RuntimeAttr? runtime_attr_override @@ -214,16 +213,15 @@ task GenerateCnvSegmentFromCpx { out = [info.split('_')[0], info.split('_')[1].split(':')[0]] + [int(i) for i in info.split('_')[1].split(':')[1].split('-')] return out[1:4]+[out[0]] - def sample_depth_calls_readin(sample_depth): - out = {} - fin=open(sample_depth) - for line in fin: - pin=line.strip().split() - out[pin[0]] = pin[1:] - fin.close() - return out + def get_sample_batch_map(sample_batch_pe_map): + sample_to_batch = {} + with open(sample_batch_pe_map, 'r') as inp: + for line in inp: + sample, batch, pe_file = line.strip().split("\t") + sample_to_batch[sample] = batch + return sample_to_batch - def readin_cpx_cnv(bed_input,sample_depth_hash,min_depth_size): + def readin_cpx_cnv(bed_input,sample_to_batch,min_depth_size): CPX_CNV = [] f_bed = os.popen(r'''zcat %s'''%(bed_input)) for line in f_bed: @@ -245,10 +243,10 @@ task GenerateCnvSegmentFromCpx { sample_names = pin[pos_SAMPLES].split(',') if i[3]=="DEL": for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + CPX_CNV.append(i+[j, sample_to_batch[j]]) if i[3]=="DUP": for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + CPX_CNV.append(i+[j, sample_to_batch[j]]) f_bed.close() return CPX_CNV @@ -260,10 +258,9 @@ task GenerateCnvSegmentFromCpx { bed_input = "~{bed}" fileout = "~{prefix}.lg_CNV.bed" - sample_depth = "~{sample_depth_calls}" min_depth_size = ~{min_depth_size} - sample_depth_hash = sample_depth_calls_readin(sample_depth) - CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size) + sample_to_batch = get_sample_batch_map("~{sample_batch_pe_map}") + CPX_CNV = readin_cpx_cnv(bed_input, sample_to_batch, min_depth_size) write_cpx_cnv(CPX_CNV, fileout) CODE diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 7d773430a..33e57ee26 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -17,11 +17,10 @@ workflow RefineComplexVariants { Array[File] PE_metrics_idxes Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds + Array[File] batch_sample_lists String prefix Int n_per_split - File sample_PE_metrics - File sample_depth_calls File? script_generate_cpx_review_script Boolean use_hail = false @@ -31,6 +30,7 @@ workflow RefineComplexVariants { String sv_pipeline_docker String linux_docker + RuntimeAttr? runtime_attr_sample_batch RuntimeAttr? runtime_attr_vcf2bed RuntimeAttr? runtime_attr_collect_pe RuntimeAttr? runtime_attr_scatter_vcf @@ -53,6 +53,14 @@ workflow RefineComplexVariants { RuntimeAttr? 
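
The generated map is a three-column TSV (sample, batch, PE metrics path): get_sample_batch_map above keeps only the batch column, and GenerateCpxReviewScript later in this patch cuts columns 1 and 3 back out for the PE script. A sketch reading the same file into both lookups at once (an illustrative consolidation, not pipeline code):

    def read_sample_batch_pe_map(path):
        sample_to_batch, sample_to_pe = {}, {}
        with open(path) as fin:
            for line in fin:
                sample, batch, pe_file = line.rstrip("\n").split("\t")
                sample_to_batch[sample] = batch
                sample_to_pe[sample] = pe_file
        return sample_to_batch, sample_to_pe
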
runtime_attr_calcu_cpx_evidences } + call GetSampleBatchPEMap { + input: + batch_name_list = batch_name_list, + batch_sample_lists = batch_sample_lists, + batch_pe_files = write_lines(PE_metrics), + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_sample_batch + } call MiniTasks.ScatterVcf { input: @@ -85,7 +93,7 @@ workflow RefineComplexVariants { cpx_ctx_bed = SplitCpxCtx.cpx_ctx_bed, prefix = "~{prefix}.~{i}", - sample_depth_calls = sample_depth_calls, + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, batch_name_list = batch_name_list, Depth_DEL_beds = Depth_DEL_beds, Depth_DUP_beds = Depth_DUP_beds, @@ -103,7 +111,7 @@ workflow RefineComplexVariants { call GenerateCpxReviewScript { input: bed = SplitCpxCtx.cpx_ctx_bed, - sample_PE_metrics = sample_PE_metrics, + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, script = script_generate_cpx_review_script, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cpx_review_script @@ -200,6 +208,67 @@ workflow RefineComplexVariants { } +task GetSampleBatchPEMap { + input { + Array[File] batch_sample_lists + Array[String] batch_name_list + File batch_pe_files + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10 + 2*ceil(size(flatten([batch_sample_lists]), "GB")), + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + python <>> + + output { + File sample_batch_pe_map = "~{prefix}.sample_batch_pe_map.tsv" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + task ReviseVcf { input { File vcf_file @@ -444,7 +513,7 @@ task SplitCpxCtx { task GenerateCpxReviewScript { input { File bed - File sample_PE_metrics + File sample_batch_pe_map File? script String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -466,9 +535,11 @@ task GenerateCpxReviewScript { command <<< set -euo pipefail + cut -f1,3 ~{sample_batch_pe_map} > sample_PE_metrics.tsv + python ~{default="/opt/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py" script} \ -i ~{bed} \ - -s ~{sample_PE_metrics} \ + -s sample_PE_metrics.tsv \ -p CPX_CTX_disINS.PASS.PE_evidences \ -c collect_PE_evidences.CPX_CTX_disINS.PASS.sh \ -r ~{prefix}.svelter \ @@ -493,109 +564,6 @@ task GenerateCpxReviewScript { } } -task GenerateCnvSegmentFromCpx { - input { - File bed - File sample_depth_calls - String sv_pipeline_docker - RuntimeAttr? 
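
The GetSampleBatchPEMap heredoc did not survive extraction above, but its inputs and consumers fix the contract: pair batch_name_list, batch_sample_lists, and the write_lines(PE_metrics) file positionally, then emit one sample/batch/PE-path row per sample. A hypothetical reconstruction under that assumption (the positional pairing of the three arrays is implied by the call site, not verified):

    # Hypothetical sketch of GetSampleBatchPEMap's embedded script (original elided).
    # Assumes batch_names, batch_sample_lists, and the PE paths share one ordering.
    def write_sample_batch_pe_map(batch_names, batch_sample_lists, pe_paths_file, out_path):
        with open(pe_paths_file) as fin:
            pe_paths = [line.strip() for line in fin]
        with open(out_path, "w") as out:
            for batch, sample_list, pe_path in zip(batch_names, batch_sample_lists, pe_paths):
                with open(sample_list) as samples:
                    for sample in samples:
                        print(f"{sample.strip()}\t{batch}\t{pe_path}", file=out)
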
runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 5, - disk_gb: 10, - boot_disk_gb: 30, - preemptible_tries: 1, - max_retries: 1 - } - - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - - String prefix = basename(bed, ".bed.gz") - - command <<< - set -euo pipefail - - python <0: - for i in interval: - if i[2]-i[1]>4999: - sample_names = pin[5].split(',') - if i[3]=="DEL": - for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) - if i[3]=="DUP": - for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][1]]) - f_bed.close() - return CPX_CNV - - def write_cpx_cnv(CPX_CNV, fileout): - fo=open(fileout, 'w') - for info in CPX_CNV: - print('\t'.join(info), file=fo) - fo.close() - - bed_input = "~{bed}" - fileout = "~{prefix}.lg_CNV.bed" - - sample_depth_hash = sample_depth_calls_readin(sample_depth) - CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash) - write_cpx_cnv(CPX_CNV, fileout) - CODE - - bgzip "~{prefix}.lg_CNV.bed" - >>> - - output { - File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - task CalculateCpxEvidences { input { File PE_collect_script From b1496861b30941fdbea35e6ffd0fd6a6112b8651 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 18:20:00 -0400 Subject: [PATCH 03/28] add json templates --- .../RefineComplexVariants.json.tmpl | 17 +++++++++++++++++ .../RefineComplexVariants.json.tmpl | 17 +++++++++++++++++ wdl/RefineComplexVariants.wdl | 10 +++++----- 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl create mode 100644 inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..fb95e1ce4 --- /dev/null +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": "${this.cleaned_vcf}", + "RefineComplexVariants.prefix": "${this.sample_set_set_id}", + + "RefineComplexVariants.batch_name_list": "${this.sample_sets.sample_set_id}", + "RefineComplexVariants.batch_sample_lists": "${this.sample_sets.filtered_batch_samples_file}", + "RefineComplexVariants.PE_metrics": "${this.sample_sets.merged_PE}", + "RefineComplexVariants.PE_metrics_indexes": "${this.sample_sets.merged_PE_index}", + "RefineComplexVariants.Depth_DEL_beds": "${this.sample_sets.merged_dels}", + "RefineComplexVariants.Depth_DUP_beds": "${this.sample_sets.merged_dups}", + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", + "RefineComplexVariants.sv_pipeline_docker": 
"${workspace.sv_pipeline_docker}", + "RefineComplexVariants.linux_docker": "${workspace.linux_docker}" +} diff --git a/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..c9feb45a2 --- /dev/null +++ b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": {{ test_batch.clean_vcf | tojson }}, + "RefineComplexVariants.prefix": {{ test_batch.name | tojson }}, + + "RefineComplexVariants.batch_name_list": [{{ test_batch.name | tojson }}], + "RefineComplexVariants.batch_sample_lists": [ {{ test_batch.final_sample_list | tojson }} ], + "RefineComplexVariants.PE_metrics": [ {{ test_batch.merged_disc_file | tojson }} ], + "RefineComplexVariants.PE_metrics_indexes": [ {{ test_batch.merged_disc_file_index | tojson }} ], + "RefineComplexVariants.Depth_DEL_beds": [ {{ test_batch.del_bed | tojson }} ], + "RefineComplexVariants.Depth_DUP_beds": [ {{ test_batch.dup_bed | tojson }} ], + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, + "RefineComplexVariants.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "RefineComplexVariants.linux_docker": {{ dockers.linux_docker | tojson }} +} diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 33e57ee26..1e189eb9b 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -10,16 +10,15 @@ import "Utils.wdl" as util workflow RefineComplexVariants { input { File vcf - File vcf_idx + String prefix Array[String] batch_name_list + Array[File] batch_sample_lists Array[File] PE_metrics - Array[File] PE_metrics_idxes + Array[File] PE_metrics_indexes Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds - Array[File] batch_sample_lists - String prefix Int n_per_split File? 
script_generate_cpx_review_script @@ -58,6 +57,7 @@ workflow RefineComplexVariants { batch_name_list = batch_name_list, batch_sample_lists = batch_sample_lists, batch_pe_files = write_lines(PE_metrics), + prefix = prefix, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_sample_batch } @@ -121,7 +121,7 @@ workflow RefineComplexVariants { input: batch_name_list = batch_name_list, PE_metrics = PE_metrics, - PE_metrics_idxes = PE_metrics_idxes, + PE_metrics_idxes = PE_metrics_indexes, PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, prefix = "~{prefix}.~{i}", n_per_split = n_per_split, From b3f31b532a674576cc85d3cc96a124f3438c26fe Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:00:32 -0400 Subject: [PATCH 04/28] rescue ME dels in CleanVcf --- .../CleanVcf.json.tmpl | 2 + .../cohort_mode/workspace.tsv.tmpl | 2 + .../test/MakeCohortVcf/CleanVcf.json.tmpl | 2 + inputs/values/resources_hg38.json | 2 + wdl/CleanVcf.wdl | 7 ++ wdl/CleanVcfChromosome.wdl | 111 +++++++++++++++++- wdl/RefineComplexVariants.wdl | 7 +- 7 files changed, 127 insertions(+), 6 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl index e7c1bc7a0..3d895e35b 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", "CleanVcf.allosome_fai": "${workspace.allosome_file}", + "CleanVcf.HERVK_reference": "${workspace.HERVK_reference", + "CleanVcf.LINE1_reference": "${workspace.LINE1_reference}", "CleanVcf.chr_x": "${workspace.chr_x}", "CleanVcf.chr_y": "${workspace.chr_y}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index b9d5f26d5..6ee7a177d 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -29,6 +29,8 @@ exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exclude_inter external_af_ref_bed {{ reference_resources.external_af_ref_bed }} external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} genome_file {{ reference_resources.genome_file }} +HERVK_reference {{ reference_resources.hervk_reference }} +LINE1_reference {{ reference_resources.line1_reference }} manta_region_bed {{ reference_resources.manta_region_bed }} manta_region_bed_index {{ reference_resources.manta_region_bed_index }} mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl index 8606a4a50..196505724 100644 --- a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": {{ reference_resources.primary_contigs_fai | tojson }}, "CleanVcf.allosome_fai": {{ reference_resources.allosome_file | tojson }}, + "CleanVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "CleanVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "CleanVcf.chr_x": {{ reference_resources.chr_x | tojson }}, "CleanVcf.chr_y": {{ reference_resources.chr_y | tojson }}, 
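
Patch 4 threads the two new reference files through several JSON templates, where a malformed placeholder only surfaces once a workspace is configured. An illustrative check (not part of the repo) that every ${...} placeholder closes before the value's quote; it flags the CleanVcf HERVK line above, which patch 6 in this series corrects:

    import re

    # Illustrative validator: "${..." placeholders never closed before the quote.
    _UNCLOSED = re.compile(r'\$\{[^}"]*"')

    def unclosed_placeholders(template_text):
        return [m.group(0) for m in _UNCLOSED.finditer(template_text)]

    line = '"CleanVcf.HERVK_reference": "${workspace.HERVK_reference",'
    assert unclosed_placeholders(line) == ['${workspace.HERVK_reference"']
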
diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json index 73484d5b3..545d4b2b0 100644 --- a/inputs/values/resources_hg38.json +++ b/inputs/values/resources_hg38.json @@ -21,6 +21,8 @@ "external_af_ref_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/gnomad_AF/gnomad_v2.1_sv.sites.GRCh38.bed.gz", "external_af_ref_bed_prefix" : "gnomad_v2.1_sv", "genome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/hg38.genome", + "hervk_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/HERVK.sorted.bed.gz", + "line1_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/LINE1.sorted.bed.gz", "manta_region_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz", "manta_region_bed_index" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz.tbi", "mei_bed" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.repeatmasker.mei.with_SVA.pad_50_merged.bed.gz", diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index c86e88500..ab228ccf7 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -24,6 +24,9 @@ workflow CleanVcf { Int clean_vcf1b_records_per_shard Int clean_vcf5_records_per_shard + File HERVK_reference + File LINE1_reference + String chr_x String chr_y @@ -67,6 +70,7 @@ workflow CleanVcf { RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b @@ -133,6 +137,8 @@ workflow CleanVcf { clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, linux_docker=linux_docker, @@ -170,6 +176,7 @@ workflow CleanVcf { runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, runtime_attr_format=runtime_attr_format, + runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } } diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 29b3bcc8e..45258f304 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -25,6 +25,9 @@ workflow CleanVcfChromosome { File? outlier_samples_list Int? max_samples_per_shard_step3 + File HERVK_reference + File LINE1_reference + File ploidy_table String chr_x String chr_y @@ -49,6 +52,7 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? 
runtime_attr_override_subset_large_cnvs_1b @@ -244,9 +248,19 @@ workflow CleanVcfChromosome { runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } + call RescueMobileElementDeletions { + input: + vcf = CleanVcf5.polished, + prefix = "~{prefix}.~{contig}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + call DropRedundantCnvs { input: - vcf=CleanVcf5.polished, + vcf=RescueMobileElementDeletions.out, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -595,6 +609,101 @@ task CleanVcf4 { } } +task RescueMobileElementDeletions { + input { + File vcf + String prefix + File LINE1 + File HERVK + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 3.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + + python <',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) +fin.close() +fo.close() +CODE + >>> + + output { + File out = "~{prefix}.vcf.gz" + } +} + # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 1e189eb9b..f694d4f21 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -197,12 +197,9 @@ workflow RefineComplexVariants { } - File reviewed_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) - File reviewed_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) - output { - File revised_output_vcf = reviewed_vcf - File revised_output_vcf_idx = reviewed_vcf_idx + File cpx_refined_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File cpx_refined_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) File cpx_evidences = ConcatHeaderedTextFiles.out } } From b556ba2f7b50f178f140d300ad43fc6d73f003c1 Mon 
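
Both heredocs in RescueMobileElementDeletions were elided in extraction, but the surviving shell lines fix the shape of the logic: BND records flagged as deletions are written to a bed, bedtools coverage keeps those with more than half their span covered by the LINE1 or HERVK reference intervals, and the second script relabels the matching records. A hedged sketch of the overlap test and the relabeling; the symbolic ALT strings are an assumption inferred from the overlap_LINE1/overlap_HERVK tags, since anything bracketed by angle brackets was lost:

    # Sketch only: fraction of [start, end) covered by reference ME intervals,
    # mirroring the `bedtools coverage ... | awk '{if ($NF>.5) print}'` filter above.
    def covered_fraction(start, end, me_intervals):
        covered, cursor = 0, start
        for s, e in sorted(me_intervals):
            s, e = max(s, cursor), min(e, end)
            if s < e:
                covered += e - s
                cursor = e
        return covered / (end - start)

    def relabel(source):
        # source tag from the tsv above: "overlap_LINE1" or "overlap_HERVK"
        alt = "<DEL:ME:LINE1>" if source == "overlap_LINE1" else "<DEL:ME:HERVK>"
        return "DEL", alt  # SVTYPE reassignment assumed; the original lines were elided

    assert covered_fraction(100, 200, [(90, 160), (150, 180)]) == 0.8
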
Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:16:01 -0400 Subject: [PATCH 05/28] recal qual after gt filtering --- src/sv-pipeline/scripts/apply_sl_filter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index ce0151e92..4e8ecc678 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -78,6 +78,24 @@ def _fails_filter(sl, gt_is_ref, cutoff): return sl < cutoff +def recal_qual_score(record): + """ + Recalibrate quality score for a single variant + """ + quals = [] + for s in [s for s in record.samples]: + gt = record.samples[s]['GT'] + if _is_hom_ref(gt) or _is_no_call(gt): + continue + elif _is_hom_var(gt): + quals.append(99) + else: + quals.append(record.samples[s]['GQ']) + + if len(quals) > 0: + return int(median(quals)) + + def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshold, keep_gq, gq_scale_factor, upper_sl_cap, lower_sl_cap, sl_shift, max_gq): record.info['MINSL'] = sl_threshold @@ -118,6 +136,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol n_sl = len(sl_list) record.info['SL_MEAN'] = sum(sl_list) / n_sl if n_sl > 0 else None record.info['SL_MAX'] = max(sl_list) if n_sl > 0 else None + record.qual = recal_qual_score(record) # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: if field in record.info: From 7ddf9e9e66c0a0bf123145866b93af9fef84bbb2 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:44:27 -0400 Subject: [PATCH 06/28] use source for ins_idel not cpx_intervals --- .../cohort_mode/workflow_configurations/CleanVcf.json.tmpl | 2 +- .../scripts/reformat_CPX_bed_and_generate_script.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl index 3d895e35b..93c172d8c 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl @@ -1,7 +1,7 @@ { "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", "CleanVcf.allosome_fai": "${workspace.allosome_file}", - "CleanVcf.HERVK_reference": "${workspace.HERVK_reference", + "CleanVcf.HERVK_reference": "${workspace.HERVK_reference}", "CleanVcf.LINE1_reference": "${workspace.LINE1_reference}", "CleanVcf.chr_x": "${workspace.chr_x}", "CleanVcf.chr_y": "${workspace.chr_y}", diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py index f05e52d9b..06cac1a83 100644 --- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py +++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py @@ -196,7 +196,7 @@ def is_interchromosomal(pin, header_pos): if seg2[0] != chrom: return True elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') if seg2[0] != chrom: return True elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: @@ -248,7 +248,7 @@ def 
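
Patch 5's recal_qual_score sets QUAL to the median GQ over non-reference carriers, pegging hom-alt genotypes at 99; when every genotype is reference or no-call it falls through and returns None, which pysam renders as a missing QUAL. A standalone sketch of the same computation with that fall-through made explicit (pure Python, no pysam record required):

    from statistics import median

    def recal_qual(genotype_gq_pairs):
        """genotype_gq_pairs: iterable of (gt_tuple, gq). Returns int QUAL or None."""
        quals = []
        for gt, gq in genotype_gq_pairs:
            alleles = [a for a in gt if a is not None]
            if not alleles or all(a == 0 for a in alleles):
                continue                      # no-call or hom-ref: not a carrier
            quals.append(99 if all(a != 0 for a in alleles) else gq)  # hom-var pegged at 99
        return int(median(quals)) if quals else None  # None -> missing QUAL ('.')

    assert recal_qual([((0, 1), 31), ((1, 1), 12), ((0, 0), 99)]) == 65
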
cpx_SV_readin(input_bed, header_pos, unresolved): ref_alt = cpx_info[1] breakpoints = cpx_info[0] elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - segments = pin[header_pos['CPX_INTERVALS']].split(',') + segments = pin[header_pos['SOURCE']].split(',') cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) ref_alt = cpx_info[1] breakpoints = cpx_info[0] @@ -292,7 +292,7 @@ def cpx_inter_chromo_SV_readin(input_bed, header_pos): ref_alt = ['a_b', 'a_ba'] elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: seg1 = pin[:3] - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') seg2 = [seg2[0]]+seg2[1].split('-') if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] From 3e61822625fe6f557399415c4b3ff2de3fa5f438 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 14:02:02 -0400 Subject: [PATCH 07/28] pass ctx revisions dict to revise --- wdl/RefineComplexVariants.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index f694d4f21..44ed0ecac 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -430,7 +430,7 @@ unresolved_svids = unresolved_readin("~{unresolved_svids}") hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") hash_CTX_manual = CTX_manual_readin("~{CTX_manual}") print(len(hash_CPX_manual.keys())) -revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids) +revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids, hash_CTX_manual) CODE From 85817c0ef7a8eaef39fd83369416483c3d78e27b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 14:44:13 -0400 Subject: [PATCH 08/28] set filter to pass if empty --- src/sv-pipeline/scripts/apply_sl_filter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index 4e8ecc678..664648031 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -133,6 +133,8 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol record.info['NCR'] = n_no_call / n_samples if n_samples > 0 else None if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: record.filter.add(_HIGH_NCR_FILTER) + if len(record.filter) == 0: + record.filter.add('PASS') n_sl = len(sl_list) record.info['SL_MEAN'] = sum(sl_list) / n_sl if n_sl > 0 else None record.info['SL_MAX'] = max(sl_list) if n_sl > 0 else None From 755c364de63efdf572d6f68353e294c9b098411b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 16:28:53 -0400 Subject: [PATCH 09/28] import median, move me del rescue to end of cleanvcf --- src/sv-pipeline/scripts/apply_sl_filter.py | 1 + wdl/CleanVcfChromosome.wdl | 24 +++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index 664648031..74187eb01 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -4,6 +4,7 @@ import sys import pysam import math +from numpy import median from typing import 
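
Patch 8's two-line change rests on a pysam detail: a record whose filter set is empty serializes with a missing ('.') FILTER, so a record that clears every check after the NCR test needs PASS added explicitly. A minimal sketch of the intended post-condition (filter_name must already be declared in the VCF header):

    def finalize_filters(record, high_ncr, filter_name="HIGH_NCR"):
        # record: a pysam VariantRecord partway through apply_sl_filter
        if high_ncr:
            record.filter.add(filter_name)
        if len(record.filter) == 0:
            record.filter.add("PASS")  # evaluated and passing, never unevaluated ('.')
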
List, Text, Dict, Optional _gt_no_call_map = dict() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 45258f304..957d2fdc0 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -248,19 +248,9 @@ workflow CleanVcfChromosome { runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } - call RescueMobileElementDeletions { - input: - vcf = CleanVcf5.polished, - prefix = "~{prefix}.~{contig}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } - call DropRedundantCnvs { input: - vcf=RescueMobileElementDeletions.out, + vcf=CleanVcf5.polished, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -299,9 +289,19 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_stitch_fragmented_cnvs } + call RescueMobileElementDeletions { + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.~{contig}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + call FinalCleanup { input: - vcf=StitchFragmentedCnvs.stitched_vcf_shard, + vcf=RescueMobileElementDeletions.out, contig=contig, prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, From d32de60afb2f34dd1dec56ac5710e541e108ad82 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 16:56:33 -0400 Subject: [PATCH 10/28] align ctx revise with input file; cleanup revise --- wdl/RefineComplexVariants.wdl | 46 +++-------------------------------- 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 44ed0ecac..98eba802d 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -167,7 +167,7 @@ workflow RefineComplexVariants { call hail.HailMerge { input: vcfs=ReviseVcf.revised_vcf, - prefix="~{prefix}.cpx_filtered", + prefix="~{prefix}.cpx_refined", gcs_project=gcs_project, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, @@ -183,7 +183,7 @@ workflow RefineComplexVariants { vcfs=ReviseVcf.revised_vcf, vcfs_idx=ReviseVcf.revised_vcf_idx, allow_overlaps=true, - outfile_prefix="~{prefix}.cpx_filtered", + outfile_prefix="~{prefix}.cpx_refined", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_attr_concat } @@ -292,23 +292,6 @@ task ReviseVcf { python < 0: - return int(median(quals)) - def CPX_manual_readin(CPX_manual): out={} fin=open(CPX_manual) @@ -320,6 +303,7 @@ def CPX_manual_readin(CPX_manual): fin.close() return out + def CTX_manual_readin(CTX_manual): fin=open(CTX_manual) out={} @@ -380,20 +364,8 @@ def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CT else: for sample in hash_CTX_manual[record.id].keys(): if sample in record.samples.keys(): - if hash_CTX_manual[record.id][sample][1] == 'Not_Enough_PE_Pairs': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'PARTIAL_PE': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'unbalanced_paternal_transmission_of_tloc,2nd_junction_is_missing': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'NON_SPECIFIC': + if hash_CTX_manual[record.id][sample][0] in ['no_PE', 'low_PE', 'partial_PE']: 
record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'Not_Enough_Coordinate_Span': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'Interrupted_PE_Patterns_when_sorted_on_2nd_breakpoint': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'CTX_with_DEL': - record.stop = int(hash_CTX_manual[record.id][sample][0].split(':')[1].split('-')[1]) # revisions to insertion-type complex events if record.info['SVTYPE']=="CPX" and record.info['CPX_TYPE'] in ['dDUP','dDUP_iDEL','INS_iDEL']: if record.info['CPX_TYPE']=='INS_iDEL': @@ -414,17 +386,7 @@ def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CT fin.close() fo.close() -import os -import sys -from numpy import median import pysam -import argparse - -#Define global variables -NULL_GTs = [(None, None), (None, )] -REF_GTs = [(0, 0), (0, ), (None, 2)] -NULL_and_REF_GTs = NULL_GTs + REF_GTs -HET_GTs = [(0, 1), (None, 1), (None, 3)] unresolved_svids = unresolved_readin("~{unresolved_svids}") hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") From 64cce6e6e51851ce0dd6fe779fd3b7ff735966a9 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 11:43:20 -0400 Subject: [PATCH 11/28] cleanvcf me del rescue fixes --- wdl/CleanVcfChromosome.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 957d2fdc0..41df87262 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -292,7 +292,7 @@ workflow CleanVcfChromosome { call RescueMobileElementDeletions { input: vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.~{contig}.rescue_me_dels", + prefix = "~{prefix}.rescue_me_dels", LINE1 = LINE1_reference, HERVK = HERVK_reference, sv_pipeline_docker = sv_pipeline_docker, @@ -694,6 +694,7 @@ for record in fin: record.alts = ('',) if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': record.alts = ('',) + fo.write(record) fin.close() fo.close() CODE From 608908c60a3ac958a69ab6ff852c89976aa514d4 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 12:30:06 -0400 Subject: [PATCH 12/28] parametrize min pe; remove top-level script override input --- wdl/RefineComplexVariants.wdl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 98eba802d..2867d902c 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -20,7 +20,8 @@ workflow RefineComplexVariants { Array[File] Depth_DUP_beds Int n_per_split - File? script_generate_cpx_review_script + Int min_pe_cpx = 3 + Int min_pe_ctx = 3 Boolean use_hail = false String? 
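
Patch 12's new knobs do more than expose the cutoffs: in the hunks below, high_PE moves from strict greater-than comparisons (> 1 for CPX, > 2 for CTX) to >= min_pe with both defaults at 3, so CPX now needs at least three supporting pairs at each junction instead of two, and the CPX and CTX tiers become uniform. The labeler sketched earlier, reparametrized:

    def pe_label(first_junction, second_junction, min_pe=3):
        # >= min_pe at both junctions, per patch 12 (was > 1 for CPX, > 2 for CTX)
        if first_junction >= min_pe and second_junction >= min_pe:
            return "high_PE"
        if first_junction > 0 and second_junction > 0:
            return "low_PE"
        if first_junction > 0 or second_junction > 0:
            return "partial_PE"
        return "no_PE"

    assert pe_label(2, 2) == "low_PE"      # would have been high_PE under the old CPX cutoff
    assert pe_label(3, 3) == "high_PE"
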
gcs_project # required if use_hail = true @@ -112,7 +113,6 @@ workflow RefineComplexVariants { input: bed = SplitCpxCtx.cpx_ctx_bed, sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, - script = script_generate_cpx_review_script, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cpx_review_script } @@ -138,6 +138,7 @@ workflow RefineComplexVariants { PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, PE_supp = CollectPEMetricsForCPX.evi_stat, depth_supp = CollectLargeCNVSupportForCPX.lg_cnv_depth_supp, + min_pe_cpx = min_pe_cpx, prefix = "~{prefix}.~{i}", sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_calcu_cpx_evidences @@ -147,6 +148,7 @@ workflow RefineComplexVariants { input: PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, PE_supp = CollectPEMetricsForCPX.evi_stat, + min_pe_ctx = min_pe_ctx, prefix = "~{prefix}.~{i}", sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_calcu_cpx_evidences @@ -528,6 +530,7 @@ task CalculateCpxEvidences { File PE_collect_script File PE_supp File depth_supp + Int min_pe_cpx String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -610,7 +613,7 @@ task CalculateCpxEvidences { pe_supp = 'partial_PE' if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: pe_supp = 'low_PE' - if PE_supp[svid][sample][0][0]>1 and PE_supp[svid][sample][1][0]>1: + if PE_supp[svid][sample][0][0]>=~{min_pe_cpx} and PE_supp[svid][sample][1][0]>=~{min_pe_cpx}: pe_supp = 'high_PE' out[svid][sample] = [pe_supp] else: @@ -681,6 +684,7 @@ task CalculateCtxEvidences { input { File PE_collect_script File PE_supp + Int min_pe_ctx String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -753,7 +757,7 @@ task CalculateCtxEvidences { pe_supp = 'partial_PE' if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: pe_supp = 'low_PE' - if PE_supp[svid][sample][0][0]>2 and PE_supp[svid][sample][1][0]>2: + if PE_supp[svid][sample][0][0]>=~{min_pe_ctx} and PE_supp[svid][sample][1][0]>=~{min_pe_ctx}: pe_supp = 'high_PE' out[svid][sample] = [pe_supp] else: From 0d0069533e98619c4cefdc72b49309f6ecb1f465 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 14:33:50 -0400 Subject: [PATCH 13/28] add hervk/line1 reference inputs; add RefineComplexVariants to docs --- README.md | 2 +- .../cohort_mode_workspace_dashboard.md.tmpl | 13 +++++++------ .../workflow_configurations/MakeCohortVcf.json.tmpl | 2 ++ .../GATKSVPipelineSingleSample.json.tmpl | 3 +++ .../single_sample/workspace.tsv.tmpl | 2 ++ ...GATKSVPipelineBatch.FromSampleEvidence.json.tmpl | 2 ++ .../GATKSVPipelineBatch.json.tmpl | 2 ++ .../GATKSVPipelineSingleSample.json.tmpl | 3 +++ .../GATKSVPipelineSingleSample.melt.json.tmpl | 4 ++++ .../test/MakeCohortVcf/MakeCohortVcf.json.tmpl | 2 ++ scripts/test/terra_validation.py | 2 +- wdl/MakeCohortVcf.wdl | 4 ++++ 12 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index cb8808221..09146ef12 100644 --- a/README.md +++ b/README.md @@ -27,4 +27,4 @@ continuous integration (CI) and continuous delivery (CD). GATK-SV CI/CD is developed as a set of Github Actions workflows that are available under the `.github/workflows` directory. Please refer to the [workflow's README](.github/workflows/README.md) -for their current coverage and setup. +for their current coverage and setup. 
\ No newline at end of file diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 224f83af9..c22618380 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -61,10 +61,11 @@ The following workflows are included in this workspace, to be executed in this o 13. `13-ResolveComplexVariants`: Complex variant resolution 14. `14-GenotypeComplexVariants`: Complex variant re-genotyping 15. `15-CleanVcf`: VCF cleanup -16. `16-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches -17. `17-SVConcordance`: Annotates variants with genotype concordance against raw calls -18. `18-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots -19. `19-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets +16. `16-RefineComplexVariants`: Complex variant and translocation refinement +17. `17-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches +18. `18-SVConcordance`: Annotates variants with genotype concordance against raw calls +19. `19-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots +20. `20-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets Additional downstream modules, such as those for visualization, are under development. They are not included in this workspace at this time, but the source code can be found in the [GATK-SV GitHub repository](https://github.com/broadinstitute/gatk-sv). See **Downstream steps** towards the bottom of this page for more information. @@ -205,9 +206,9 @@ Read the full MergeBatchSites documentation [here](https://github.com/broadinsti Read the full GenotypeBatch documentation [here](https://github.com/broadinstitute/gatk-sv#genotype-batch). * Use the same `sample_set` definitions you used for `03-TrainGCNV` through `08-FilterBatchSamples`. -#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-JoinRawCalls, 17-SVConcordance, 18-FilterGenotypes, and 19-AnnotateVcf +#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-RefineComplexVariants, 17-JoinRawCalls, 18-SVConcordance, 19-FilterGenotypes, and 20-AnnotateVcf -Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. 
+Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`RefineComplexVariants`](https://github.com/broadinstitute/gatk-sv#refine-complex), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. * Use the same cohort `sample_set_set` you created and used for `09-MergeBatchSites`. #### Downstream steps diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl index ccab739da..f3fbfb018 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": "${workspace.depth_exclude_list}", "MakeCohortVcf.empty_file" : "${workspace.empty_file}", "MakeCohortVcf.ref_dict": "${workspace.reference_dict}", + "MakeCohortVcf.HERVK_reference": "${workspace.HERVK_reference}", + "MakeCohortVcf.LINE1_reference": "${workspace.LINE1_reference}", "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl index e5a920a0c..88c735b2d 100644 --- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl @@ -97,6 +97,9 @@ "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": "${workspace.reference_HERVK}", + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": "${workspace.reference_LINE1}", + "GATKSVPipelineSingleSample.protein_coding_gtf" : "${workspace.reference_protein_coding_gtf}", "GATKSVPipelineSingleSample.noncoding_bed" : "${workspace.reference_noncoding_bed}", "GATKSVPipelineSingleSample.external_af_ref_bed" : "${workspace.reference_external_af_ref_bed}", diff --git a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl index 25e13568f..2bcef0ad2 100644 --- a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -55,6 +55,8 @@ reference_exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exc reference_external_af_ref_bed {{ reference_resources.external_af_ref_bed }} reference_external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} reference_genome_file {{ reference_resources.genome_file }} +reference_HERVK {{ reference_resources.hervk_reference }} +reference_LINE1 {{ reference_resources.line1_reference }} reference_manta_region_bed {{ reference_resources.manta_region_bed }} 
reference_manta_region_bed_index {{ reference_resources.manta_region_bed_index }} reference_mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl index 2f5ba41c4..2bfbc4f1b 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl @@ -134,6 +134,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl index 5412fd79e..9ba04d229 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl @@ -127,6 +127,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl index f08b38902..870b5b82f 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl @@ -101,6 +101,9 @@ "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }}, "GATKSVPipelineSingleSample.noncoding_bed" : {{ reference_resources.noncoding_bed | tojson }}, "GATKSVPipelineSingleSample.external_af_ref_bed" : {{ reference_resources.external_af_ref_bed | tojson }}, diff --git 
a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl index 4d35ca4ae..8352403df 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl @@ -90,6 +90,10 @@ "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : 2, + + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.clean_vcf_min_sr_background_fail_batches": 0.5, "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, diff --git a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl index 8c27a74cf..cad2a9e03 100644 --- a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, "MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "MakeCohortVcf.ref_dict": {{ reference_resources.reference_dict | tojson }}, + "MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/scripts/test/terra_validation.py b/scripts/test/terra_validation.py index 2e9a64756..aa9079d19 100644 --- a/scripts/test/terra_validation.py +++ b/scripts/test/terra_validation.py @@ -113,7 +113,7 @@ def main(): parser.add_argument("-j", "--womtool-jar", help="Path to womtool jar", required=True) parser.add_argument("-n", "--num-input-jsons", help="Number of Terra input JSONs expected", - required=False, default=24, type=int) + required=False, default=25, type=int) parser.add_argument("--log-level", help="Specify level of logging information, ie. 
info, warning, error (not case-sensitive)", required=False, default="INFO") diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index 5ea66ea50..f3891fc8d 100644 --- a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -44,6 +44,8 @@ workflow MakeCohortVcf { File pe_exclude_list File depth_exclude_list File ref_dict + File HERVK_reference + File LINE1_reference Int max_shard_size_resolve Int max_shards_per_chrom_clean_vcf_step1 Int min_records_per_shard_clean_vcf_step1 @@ -378,6 +380,8 @@ workflow MakeCohortVcf { allosome_fai=allosome_fai, chr_x=chr_x, chr_y=chr_y, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, From 395ded1e37a7d513c452949999ed0bf77183bd93 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 14:39:51 -0400 Subject: [PATCH 14/28] add downstream steps to dockstore.yml --- .github/.dockstore.yml | 72 +++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 3bcad4198..974fdeee6 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -4,7 +4,7 @@ workflows: name: GatherSampleEvidence primaryDescriptorPath: /wdl/GatherSampleEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -13,7 +13,7 @@ workflows: name: EvidenceQC primaryDescriptorPath: /wdl/EvidenceQC.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -22,7 +22,7 @@ workflows: name: TrainGCNV primaryDescriptorPath: /wdl/TrainGCNV.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -31,7 +31,7 @@ workflows: name: GatherBatchEvidence primaryDescriptorPath: /wdl/GatherBatchEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -40,7 +40,7 @@ workflows: name: ClusterBatch primaryDescriptorPath: /wdl/ClusterBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -49,7 +49,7 @@ workflows: name: GenerateBatchMetrics primaryDescriptorPath: /wdl/GenerateBatchMetrics.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -58,7 +58,7 @@ workflows: name: FilterBatchSites primaryDescriptorPath: /wdl/FilterBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -67,7 +67,7 @@ workflows: name: PlotSVCountsPerSample primaryDescriptorPath: /wdl/PlotSVCountsPerSample.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -76,7 +76,7 @@ workflows: name: FilterBatchSamples primaryDescriptorPath: /wdl/FilterBatchSamples.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -85,7 +85,7 @@ workflows: name: MergeBatchSites primaryDescriptorPath: /wdl/MergeBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -94,7 +94,7 @@ workflows: name: GenotypeBatch primaryDescriptorPath: /wdl/GenotypeBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -103,7 +103,7 @@ workflows: name: RegenotypeCNVs primaryDescriptorPath: /wdl/RegenotypeCNVs.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -112,7 +112,7 @@ workflows: name: CombineBatches primaryDescriptorPath: /wdl/CombineBatches.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -121,7 +121,7 @@ workflows: name: ResolveComplexVariants primaryDescriptorPath: /wdl/ResolveComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -130,7 +130,7 @@ workflows: name: GenotypeComplexVariants 
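(Aside: each workflow registered in this `.dockstore.yml` — including the four entries added below — repeats the same `name` / `primaryDescriptorPath` / `filters` shape, so drift between the YAML and the `wdl/` directory is easy to catch mechanically. A hypothetical check, assuming PyYAML and execution from the repo root:)

```python
import os
import yaml  # PyYAML; assumed available

with open(".github/.dockstore.yml") as f:
    config = yaml.safe_load(f)

# Every registered workflow should point at a WDL that actually exists.
for wf in config["workflows"]:
    path = wf["primaryDescriptorPath"].lstrip("/")  # paths are repo-rooted
    print(f"{wf['name']}: {path} [{'ok' if os.path.exists(path) else 'MISSING'}]")
```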
primaryDescriptorPath: /wdl/GenotypeComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -139,7 +139,43 @@ workflows: name: CleanVcf primaryDescriptorPath: /wdl/CleanVcf.wdl filters: - branches: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: RefineComplexVariants + primaryDescriptorPath: /wdl/RefineComplexVariants.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: JoinRawCalls + primaryDescriptorPath: /wdl/JoinRawCalls.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: SVConcordance + primaryDescriptorPath: /wdl/SVConcordance.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: FilterGenotypes + primaryDescriptorPath: /wdl/FilterGenotypes.wdl + filters: + branches: - main tags: - /.*/ @@ -148,7 +184,7 @@ workflows: name: MainVcfQc primaryDescriptorPath: /wdl/MainVcfQc.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -157,7 +193,7 @@ workflows: name: AnnotateVcf primaryDescriptorPath: /wdl/AnnotateVcf.wdl filters: - branches: + branches: - main tags: - /.*/ From 12ba977273141423f520c373ea4a1f41b3d8fe8b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 15:24:20 -0400 Subject: [PATCH 15/28] add sanitizeheader to filtergenotypes & fix jsons --- .../FilterGenotypes.fixed_cutoffs.json.tmpl | 8 +- ...FilterGenotypes.optimize_cutoffs.json.tmpl | 8 +- wdl/FilterGenotypes.wdl | 75 ++++++++++++++++++- 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl index 5035f0d0b..1025504cf 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl @@ -5,7 +5,13 @@ "FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, "FilterGenotypes.sl_filter_args": "--small-del-threshold 93 --medium-del-threshold 150 --small-dup-threshold -51 --medium-dup-threshold -4 --ins-threshold -13 --inv-threshold -19", - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + "FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl index b11a12812..53e50c1ba 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl @@ -6,7 +6,13 @@ "FilterGenotypes.primary_contigs_fai": {{ reference_resources.primary_contigs_fai | tojson }}, "FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + 
"FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 0603b5750..58b98d71d 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -25,6 +25,9 @@ workflow FilterGenotypes { Int optimize_vcf_records_per_shard = 50000 Int filter_vcf_records_per_shard = 20000 + # For SanitizeHeader - bcftools annotate -x ~{header_drop_fields} + String header_drop_fields = "FILTER/LOW_QUALITY,FORMAT/TRUTH_CN_EQUAL,FORMAT/GT_FILTER,FORMAT/CONC_ST,INFO/STATUS,INFO/TRUTH_AC,INFO/TRUTH_AN,INFO/TRUTH_AF,INFO/TRUTH_VID,INFO/CNV_CONCORDANCE,INFO/GENOTYPE_CONCORDANCE,INFO/HET_PPV,INFO/HET_SENSITIVITY,INFO/HOMVAR_PPV,INFO/HOMVAR_SENSITIVITY,INFO/MINSL,INFO/NON_REF_GENOTYPE_CONCORDANCE,INFO/SL_MAX,INFO/SL_MEAN,INFO/VAR_PPV,INFO/VAR_SENSITIVITY,INFO/VAR_SPECIFICITY" + # For MainVcfQc File primary_contigs_fai File? ped_file @@ -120,10 +123,19 @@ workflow FilterGenotypes { sv_base_mini_docker=sv_base_mini_docker } + call SanitizeHeader { + input: + vcf = ConcatVcfs.concat_vcf, + vcf_index = ConcatVcfs.concat_vcf_idx, + drop_fields = header_drop_fields, + prefix = "~{output_prefix_}.filter_genotypes.sanitized", + sv_pipeline_docker = sv_pipeline_docker + } + if (run_qc) { call qc.MainVcfQc { input: - vcfs=[ConcatVcfs.concat_vcf], + vcfs=[SanitizeHeader.out], prefix="~{output_prefix_}.filter_genotypes", ped_file=ped_file, bcftools_preprocessing_options=qc_bcftools_preprocessing_options, @@ -142,8 +154,8 @@ workflow FilterGenotypes { } output { - File filtered_vcf = ConcatVcfs.concat_vcf - File filtered_vcf_index = ConcatVcfs.concat_vcf_idx + File filtered_vcf = SanitizeHeader.out + File filtered_vcf_index = SanitizeHeader.out_index File? main_vcf_qc_tarball = MainVcfQc.sv_vcf_qc_output # For optional analysis @@ -295,3 +307,60 @@ task FilterVcf { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } + +task SanitizeHeader { + input { + File vcf + File vcf_index + String prefix + String drop_fields + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + bcftools view --no-version -h ~{vcf} > header.vcf + + grep -v -e ^"##bcftools" header.vcf \ + -e ^"##source=depth" \ + -e ^"##source=cleanvcf" \ + -e ^"##ALT=/##ALT=/g' > newheader.vcf + + bcftools reheader -h newheader.vcf ~{vcf} \ + | bcftools annotate -x ~{drop_fields} \ + --no-version \ + -O z \ + -o ~{prefix}.vcf.gz + + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} From c473dcb603e4031d64a503d7d4d4067bd8802642 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 15:51:12 -0400 Subject: [PATCH 16/28] update sanitize task --- wdl/FilterGenotypes.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 58b98d71d..a703862f1 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -346,7 +346,6 @@ task SanitizeHeader { -e ^"##source=depth" \ -e ^"##source=cleanvcf" \ -e ^"##ALT=/##ALT=/g' > newheader.vcf From b8c36178ff6e02370dc7f5a229ac0f854761e3aa Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 17:07:03 -0400 Subject: [PATCH 17/28] lint --- .../reformat_CPX_bed_and_generate_script.py | 865 +++++++++--------- 1 file changed, 445 insertions(+), 420 deletions(-) diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py index 06cac1a83..8a90e16f3 100644 --- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py +++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py @@ -1,35 +1,41 @@ #!python -#this script takes in the CPX SVs in complex format, reformat them in svelter format meanwhile generate the script to collect PE metrics +# this script takes in the CPX SVs, reformats in svelter format, and generates the script to collect PE metrics +import os +import argparse + def header_pos_readin(input_bed): - fin=os.popen(r'''zcat %s'''%(input_bed)) - pin=fin.readline().strip().split() - out = {} - for i in range(len(pin)): - out[pin[i]] = i - fin.close() - return out + fin = os.popen(r'''zcat %s''' % input_bed) + pin = fin.readline().strip().split() + out = {} + for i in range(len(pin)): + out[pin[i]] = i + fin.close() + return out + def sample_pe_readin(sample_pe_file): - #sample_pe_file is a 2 column file with sample ID and PE metrics on both columns - out = {} - fin=open(sample_pe_file) - for line in fin: - pin=line.strip().split() - if not pin[0] in out.keys(): - out[pin[0]] = pin[1] - fin.close() - return out - -def SVID_sample_readin(input_bed,header_pos): - out={} - fin=os.popen(r'''zcat 
%s'''%(input_bed)) - for line in fin: - pin=line.strip().split('\t') - if not pin[0][0]=='#': - out[pin[header_pos['name']]] = pin[header_pos['samples']] - fin.close() - return out + # sample_pe_file is a 2 column file with sample ID and PE metrics on both columns + out = {} + fin = open(sample_pe_file) + for line in fin: + pin = line.strip().split() + if not pin[0] in out.keys(): + out[pin[0]] = pin[1] + fin.close() + return out + + +def svid_sample_readin(input_bed, header_pos): + out = {} + fin = os.popen(r'''zcat %s''' % input_bed) + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == '#': + out[pin[header_pos['name']]] = pin[header_pos['samples']] + fin.close() + return out + def parse_segment(segment): svtype, interval = segment.split('_') @@ -37,438 +43,457 @@ def parse_segment(segment): coord1, coord2 = coords.split('-') return svtype, chrom, int(coord1), int(coord2) + # segments: CPX_INTERVALS split by ',' def extract_bp_list_for_inv_cnv_events(segments, cpx_type): s0_svtype, s0_chrom, s0_c1, s0_c2 = parse_segment(segments[0]) - breakpoints = [s0_chrom]+[s0_c1, s0_c2] - if not 'INVdup' in cpx_type: - for seg in segments[1:]: - seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg) - breakpoints.append(seg_c2) + breakpoints = [s0_chrom] + [s0_c1, s0_c2] + if 'INVdup' not in cpx_type: + for seg in segments[1:]: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg) + breakpoints.append(seg_c2) else: - if cpx_type=='INVdup': - s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1]) - if s1_c1 < breakpoints[2]: - breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]] - elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup': - s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2]) - breakpoints += [s2_c1, s2_c2] + if cpx_type == 'INVdup': + s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1]) + if s1_c1 < breakpoints[2]: + breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]] + elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup': + s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2]) + breakpoints += [s2_c1, s2_c2] return breakpoints + def extract_bp_list_for_ddup_events(coordinates, segments, small_sv_size_threshold): - #example of coordinates: ["chr1",1154129,1154133] - #example of segments: DUP_chr1:229987763-230011157 + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] # CW: this looks like it only gets the coordinates from the first DUP segment? 
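(Aside: a worked example, not part of the patch, of the `CPX_INTERVALS` segment strings this function consumes — `parse_segment` is restated from the script above, and the interval values are invented. It also shows why the rewritten loop below matters: with several segments, each `DUP_`/`DEL_`/`INV_` entry updates the breakpoints, rather than only the first `DUP_` segment being read as in the commented-out version.)

```python
def parse_segment(segment):  # restated from reformat_CPX_bed_and_generate_script.py
    svtype, interval = segment.split('_')
    chrom, coords = interval.split(':')
    coord1, coord2 = coords.split('-')
    return svtype, chrom, int(coord1), int(coord2)

# Hypothetical CPX_INTERVALS value for a dDUP-like event with an inverted segment.
segments = "DUP_chr1:229987763-230011157,INV_chr1:230011157-230012000".split(',')
for seg in segments:
    print(parse_segment(seg))
# ('DUP', 'chr1', 229987763, 230011157)
# ('INV', 'chr1', 230011157, 230012000)
```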
- #dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') - #dup_bp = [int(i) for i in dup_seg[1].split('-')] - #INV_flag = len([i for i in segments if "INV_" in i]) + # dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') + # dup_bp = [int(i) for i in dup_seg[1].split('-')] + # inv_flag = len([i for i in segments if "INV_" in i]) # rewrite as: - INV_flag = 0 + inv_flag = 0 + dup_bp = [] for segment in segments: if 'DUP_' in segment: seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) dup_bp = [seg_c1, seg_c2] if 'INV_' in segment: - INV_flag = INV_flag + 1 + inv_flag = inv_flag + 1 if 'DEL_' in segment: seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) del_bp = [seg_chrom, seg_c1, seg_c2] - #INV_flag == 0 : no INV involved in the insertion - #INV_flag > 0: INV involved - - if INV_flag==0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','cbc'] - else: - structures = ['ab','bab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','aba'] - else: - structures = ['ab','aba'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','bb'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa'] - else: - return 'unresolved' - elif INV_flag>0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','c^bc'] - else: - structures = ['ab','b^ab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','aba^'] - else: - structures = ['ab','aba^'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','b^b'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa^'] - else: - return 'unresolved' - return [breakpints, structures] + # inv_flag == 0 : no INV involved in the insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + else: + return 'unresolved' + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + 
structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + else: + return 'unresolved' + return [breakpoints, structures] + def extract_bp_list_for_ins_idel(coordinates, segments, small_sv_size_threshold): - #example of coordinates: ["chr1",1154129,1154133] - #example of segments: DUP_chr1:229987763-230011157 + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] dup_seg = [i for i in segments if 'INS_' in i][0].split('_')[1].split(':') dup_bp = [int(i) for i in dup_seg[1].split('-')] - INV_flag = len([i for i in segments if "INV_" in i]) - #INV_flag == 0 : no INV involved in the insertion - #INV_flag > 0: INV involved - if INV_flag==0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','cbc'] - else: - structures = ['ab','bab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba'] - else: - structures = ['ab','aba'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','bb'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa'] - elif INV_flag>0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','c^bc'] - else: - structures = ['ab','b^ab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba^'] - else: - structures = ['ab','aba^'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','b^b'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa^'] - return [breakpints, structures] - -def extract_bp_list_V4(coordinates, segments, small_sv_size_threshold): + inv_flag = len([i for i in segments if "INV_" in i]) + # inv_flag == 0 : no INV involved in the insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < 
dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + return [breakpoints, structures] + + +def extract_bp_list_v4(coordinates, segments, small_sv_size_threshold): del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] dup_seg = [i for i in segments if 'INV_' in i][0].split('_')[1].split(':') dup_bp = [int(i) for i in dup_seg[1].split('-')] - INV_flag = 1 + structures = [] + breakpoints = [] if del_bp[2] < dup_bp[0]: breakpoints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','c^bc'] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] else: - structures = ['ab','b^ab'] + structures = ['ab', 'b^ab'] elif del_bp[1] > dup_bp[1]: - breakpoints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba^'] + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] else: - structures = ['ab','aba^'] + structures = ['ab', 'aba^'] return [breakpoints, structures] + def is_interchromosomal(pin, header_pos): - chrom = pin[0] - if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') - if seg2[0] != chrom: - return True - elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') - if seg2[0] != chrom: - return True - elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: - return True - elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": - seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') - seg2 = [seg2[0]]+seg2[1].split('-') - if seg2[0] != chrom: - return True - return False - -def cpx_SV_readin(input_bed, header_pos, unresolved): - unresolved_svids = [] - fin=os.popen(r'''zcat %s'''%(input_bed)) - out = [] - small_sv_size_threshold = 250 - for line in fin: - pin=line.strip().split('\t') - # label pidups as unresolved since manual inspection indicated low quality - if not pin[0][0]=="#": - if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: - unresolved_svids.append(pin[header_pos['name']]) - elif not is_interchromosomal(pin, header_pos): - if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV','INVdup','delINVdel', 'delINVdup','dupINVdel','dupINVdup']: - segments = pin[header_pos['CPX_INTERVALS']].split(',') - breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) - if pin[header_pos['CPX_TYPE']] == 'delINV': - ref_alt = ['ab','b^'] - if pin[header_pos['CPX_TYPE']] == 'INVdel': - ref_alt = ['ab','a^'] - if pin[header_pos['CPX_TYPE']] == 'dupINV': - ref_alt = ['ab','aba^'] - if pin[header_pos['CPX_TYPE']] == 'INVdup': - ref_alt = ['ab','b^ab'] - if pin[header_pos['CPX_TYPE']] == 'delINVdel': - ref_alt = ['abc','b^'] - if pin[header_pos['CPX_TYPE']] == 'delINVdup': - ref_alt = ['abc','c^bc'] - if pin[header_pos['CPX_TYPE']] == 'dupINVdel': - ref_alt = ['abc','aba^'] - if pin[header_pos['CPX_TYPE']] == 'dupINVdup': - ref_alt = ['abc','ac^b^a^c'] - elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - segments = 
pin[header_pos['CPX_INTERVALS']].split(',') - cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) - if cpx_info == 'unresolved': - unresolved_svids.append(pin[header_pos['name']]) + chrom = pin[0] + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + return True + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if seg2[0] != chrom: + return True + return False + + +def cpx_sv_readin(input_bed, header_pos, unresolved): + unresolved_svids = [] + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + small_sv_size_threshold = 250 + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + # label pidups as unresolved since manual inspection indicated low quality + if not pin[0][0] == "#": + if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: + unresolved_svids.append(pin[header_pos['name']]) + elif not is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV', 'INVdup', 'delINVdel', 'delINVdup', + 'dupINVdel', 'dupINVdup']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) + if pin[header_pos['CPX_TYPE']] == 'delINV': + ref_alt = ['ab', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'INVdel': + ref_alt = ['ab', 'a^'] + if pin[header_pos['CPX_TYPE']] == 'dupINV': + ref_alt = ['ab', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'INVdup': + ref_alt = ['ab', 'b^ab'] + if pin[header_pos['CPX_TYPE']] == 'delINVdel': + ref_alt = ['abc', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'delINVdup': + ref_alt = ['abc', 'c^bc'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdel': + ref_alt = ['abc', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdup': + ref_alt = ['abc', 'ac^b^a^c'] + elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) + if cpx_info == 'unresolved': + unresolved_svids.append(pin[header_pos['name']]) + continue + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + else: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_v4(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + out.append([breakpoints, ref_alt, pin[3]]) + else: continue - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] - elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - segments = pin[header_pos['SOURCE']].split(',') - cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] + fin.close() + 
with open(unresolved, 'w') as unres: + for svid in unresolved_svids: + unres.write(svid + "\n") + return out + + +def cpx_inter_chromo_sv_readin(input_bed, header_pos): + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + chr_list = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY'] + bp = [] + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == "#": + if is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + seg1 = pin[:3] + seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]] + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + ref_alt = [pin[header_pos['CPX_TYPE']]] + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'c^b_c'] + else: + ref_alt = ['a_b', 'b^a_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba^'] + else: + ref_alt = ['a_b', 'a_ba^'] + out.append([bp, ref_alt, pin[3]]) + fin.close() + return out + + +def cpx_sample_batch_readin(cpx_sv, svid_sample, sample_pe, pe_evidence, out_file): + out = [] + flank_back = 1000 + flank_front = 100 + fo = open(out_file, 'w') + for info in cpx_sv: + breakpoints = info[0] + if info[1][0] == 'ab' and info[1][1] == 'b^': # delINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + 
':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'b^': # delINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'c^bc': # delINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba^': # dupINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'ac^b^a^c': # dupINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 
= ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba^': # dupINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'a^': # INVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'b^ab': # INVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba': # dupINS + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", 
'>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba': # dupINSdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aa': # dupdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bab': # INSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'cbc': # delINSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bb': # deldup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), 
'&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'ba_b'] or info[1] == ['ab_c', 'cb_c']: # ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba'] or info[1] == ['a_bc', 'a_ba']: # ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: # inverted insertion, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc', 'a_ba^']: # inverted insertion, insertion from the larger chromosome to the smaller 
chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['CTX_PQ/QP']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['CTX_PP/QQ']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] else: - segments = pin[header_pos['SOURCE']].split(',') - cpx_info = extract_bp_list_V4(pin[:3], segments, small_sv_size_threshold) - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] - out.append([breakpoints, ref_alt,pin[3]]) - else: - continue - fin.close() - with open(unresolved, 'w') as unres: - for svid in unresolved_svids: - unres.write(svid + "\n") - return out - -def cpx_inter_chromo_SV_readin(input_bed, header_pos): - fin=os.popen(r'''zcat %s'''%(input_bed)) - out = [] - chr_list = ['chr'+str(i) for i in range(1,23)]+['chrX','chrY'] - for line in fin: - pin=line.strip().split('\t') - if not pin[0][0]=="#": - if is_interchromosomal(pin, header_pos): - if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - seg1 = pin[:3] - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in 
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'cb_c']
-                        else:
-                            ref_alt = ['a_b', 'ba_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba']
-                        else:
-                            ref_alt = ['a_b', 'a_ba']
-                elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']:
-                    seg1 = pin[:3]
-                    seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':')
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'cb_c']
-                        else:
-                            ref_alt = ['a_b', 'ba_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba']
-                        else:
-                            ref_alt = ['a_b', 'a_ba']
-                elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']:
-                    seg1 = pin[:3]
-                    seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]]
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                    ref_alt = [pin[header_pos['CPX_TYPE']]]
-                elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS":
-                    seg1 = pin[:3]
-                    seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':')
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'c^b_c']
-                        else:
-                            ref_alt = ['a_b', 'b^a_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba^']
-                        else:
-                            ref_alt = ['a_b', 'a_ba^']
-                out.append([bp, ref_alt, pin[3]])
-    fin.close()
-    return out
-
-def cpx_sample_batch_readin(cpx_SV, SVID_sample,sample_pe, PE_evidence, out_file):
-    out = []
-    flank_back = 1000
-    flank_front = 100
-    fo=open(out_file,'w')
-    for info in cpx_SV:
-        breakpints = info[0]
-        if info[1][0] == 'ab' and info[1][1]=='b^': #delINV
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='b^': #delINVdel
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='c^bc': #delINVdup
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'ab' and info[1][1]=='aba^': #dupINV
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='ac^b^a^c': #dupINVdup
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='aba^': #dupINVdel
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='a^': #INVdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='b^ab': #INVdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='aba': #dupINS - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'abc' and info[1][1]=='aba': #dupINSdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='aa': #dupdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 
's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='bab': #INSdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'abc' and info[1][1]=='cbc': #delINSdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='bb': #deldup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1] == ['a_b','ba_b'] or info[1] == ['ab_c','cb_c']: #ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome - common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', 
-        elif info[1] == ['a_b','a_ba'] or info[1] == ['a_bc','a_ba']: #ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: #inverted insertion, insertion from the smaller chromosome to the larger chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc','a_ba^']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['CTX_PQ/QP']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['CTX_PP/QQ']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        else:
-            print(info)
-        samples = SVID_sample[info[2]]
-        if not samples =='' and not samples=='NA':
-            sample_list = samples.split(',')
-            pe_metrics_list = [sample_pe[samp] for samp in sample_list]
-            for num in range(len(sample_list)):
-                common_1[1] = pe_metrics_list[num]
-                common_2[1] = pe_metrics_list[num]
-                common_1[4] = sample_list[num]
-                common_2[4] = sample_list[num]
-                write_1 = ' '.join(common_1)
-                write_2 = ' '.join(common_2)
-                print(write_1, file=fo)
-                print(write_2, file=fo)
-
-    fo.close()
-    return out
+            print(info)
+            common_1 = None
+            common_2 = None
+        samples = svid_sample[info[2]]
+        if not samples == '' and not samples == 'NA':
+            sample_list = samples.split(',')
+            pe_metrics_list = [sample_pe[samp] for samp in sample_list]
+            for num in range(len(sample_list)):
+                common_1[1] = pe_metrics_list[num]
+                common_2[1] = pe_metrics_list[num]
+                common_1[4] = sample_list[num]
+                common_2[4] = sample_list[num]
+                write_1 = ' '.join(common_1)
+                write_2 = ' '.join(common_2)
+                print(write_1, file=fo)
+                print(write_2, file=fo)
+
+    fo.close()
+    return out
+
 def write_cpx_command(cpx_command, out_file):
-    fo=open(out_file,'w')
-    for i in cpx_command:
-        print(i, file=fo)
-    fo.close()
-
-def write_cpx_SVs(cpx_SV, out_file):
-    fo=open(out_file,'w')
-    for i in cpx_SV:
-        out = [i[2],':'.join([str(j) for j in i[0]])]+i[1]
-        print('\t'.join(out), file=fo)
-    fo.close()
+    fo = open(out_file, 'w')
+    for i in cpx_command:
+        print(i, file=fo)
+    fo.close()
+
+
+def write_cpx_svs(cpx_sv, out_file):
+    fo = open(out_file, 'w')
+    for i in cpx_sv:
+        out = [i[2], ':'.join([str(j) for j in i[0]])] + i[1]
+        print('\t'.join(out), file=fo)
+    fo.close()
+
 def main():
-    """
-    Command-line main block
-    """
-
-    # Parse command line arguments and options
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('-i','--input', required=True, help='input file of CPX events in bed format')
-    parser.add_argument('-s','--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
-    parser.add_argument('-p','--pe_evidence', required=True, help='name of file to store collected PE metrics')
-    parser.add_argument('-c','--command_script', required=True, help='name of file that has scripts to collect PE evidences')
-    parser.add_argument('-r','--reformat_SV', required=True, help='reformatted SV in svelter format')
-    parser.add_argument('-u','--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround') # TODO
-    args = parser.parse_args()
-
-    input_bed = args.input
-    sample_pe_file = args.sample_pe
-    PE_evidence = args.pe_evidence
-    command_script = args.command_script
-    reformatted_SV = args.reformat_SV
-
-    header_pos = header_pos_readin(input_bed)
-    sample_pe = sample_pe_readin(sample_pe_file)
-    SVID_sample = SVID_sample_readin(input_bed, header_pos)
-
-    cpx_SV = cpx_SV_readin(input_bed, header_pos, args.unresolved)
-    cpx_inter_chromo_SV = cpx_inter_chromo_SV_readin(input_bed, header_pos)
-    write_cpx_SVs(cpx_SV+cpx_inter_chromo_SV, reformatted_SV)
-
-    cpx_sample_batch_readin(cpx_SV+cpx_inter_chromo_SV, SVID_sample, sample_pe, PE_evidence, command_script)
+    """
+    Command-line main block
+    """
-import os
-import sys
-import argparse
-if __name__ == '__main__':
-    main()
+    # Parse command line arguments and options
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
+    parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
+    parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')
+    parser.add_argument('-c', '--command_script', required=True, help='name of file that has scripts to collect PE evidences')
+    parser.add_argument('-r', '--reformat_SV', required=True, help='reformatted SV in svelter format')
+    parser.add_argument('-u', '--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround') # TODO
+    args = parser.parse_args()
+    input_bed = args.input
+    sample_pe_file = args.sample_pe
+    pe_evidence = args.pe_evidence
+    command_script = args.command_script
+    reformatted_sv = args.reformat_SV
+
+    header_pos = header_pos_readin(input_bed)
+    sample_pe = sample_pe_readin(sample_pe_file)
+    svid_sample = svid_sample_readin(input_bed, header_pos)
+
+    cpx_sv = cpx_sv_readin(input_bed, header_pos, args.unresolved)
+    cpx_inter_chromo_sv = cpx_inter_chromo_sv_readin(input_bed, header_pos)
+    write_cpx_svs(cpx_sv + cpx_inter_chromo_sv, reformatted_sv)
+
+    cpx_sample_batch_readin(cpx_sv + cpx_inter_chromo_sv, svid_sample, sample_pe, pe_evidence, command_script)
+
+
+if __name__ == '__main__':
+    main()
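For orientation: the refactored `cpx_sample_batch_readin` above assembles one `tabix | grep | awk | sed` one-liner per breakpoint per carrier sample, selecting discordant read pairs with the strand orientation expected for each rearrangement structure. A minimal sketch of the shape of a generated command — sample ID, file names, and coordinates here are made up, and the real script substitutes each sample's PE-metrics file and the variant's SVID:

```python
# Illustrative only: one PE-evidence command for a delINV-like (+/+) junction.
flank_back, flank_front = 1000, 100          # same flank sizes as in the script
chrom, bp1, bp2 = "chr1", 1154129, 1161000   # hypothetical breakpoints
svid, sample = "cpx_example_1", "HG00096"
pe_file, pe_evidence = "HG00096.pe.txt.gz", "pe_evidence.txt"

cmd = (
    f"tabix {pe_file} {chrom}:{bp1 - flank_back}-{bp1 + flank_front} "
    f"| grep {sample} "
    # intra-chromosomal pairs ($1==$4) in +/+ orientation whose mates fall
    # within the flanked window around the second breakpoint
    f"| awk '{{if ($1==$4 && $3==\"+\" && $6==\"+\" && "
    f"$5>{bp2 - flank_back} && $5<{bp2 + flank_front}) print}}' "
    f"| sed -e 's/$/\\t{svid}/' >> {pe_evidence}"
)
print(cmd)
```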
From 01c44b245a8bf884a5ea45723f7c867fd39d59cc Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 21 Oct 2024 17:09:38 -0400
Subject: [PATCH 18/28] lint

---
 .../scripts/reformat_CPX_bed_and_generate_script.py | 4 ++--
 wdl/RefineComplexVariants.wdl                       | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
index 8a90e16f3..d37ac7e04 100644
--- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -468,8 +468,8 @@ def main():
 
     # Parse command line arguments and options
     parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
+            description=__doc__,
+            formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
     parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
     parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
index 2867d902c..da75ef7c3 100644
--- a/wdl/RefineComplexVariants.wdl
+++ b/wdl/RefineComplexVariants.wdl
@@ -45,7 +45,6 @@ workflow RefineComplexVariants {
         RuntimeAttr? runtime_attr_generate_cpx_review_script
         RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
         RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
-        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
         RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
         RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
         RuntimeAttr? runtime_attr_concat_bed_Step1

From 16f436ba67ba57dfe312e2effa9a7d5b1318d798 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 21 Oct 2024 17:20:20 -0400
Subject: [PATCH 19/28] lint

---
 .../scripts/reformat_CPX_bed_and_generate_script.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
index d37ac7e04..8b4b8cef8 100644
--- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -468,8 +468,8 @@ def main():
 
     # Parse command line arguments and options
     parser = argparse.ArgumentParser(
-            description=__doc__,
-            formatter_class=argparse.RawDescriptionHelpFormatter)
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
     parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
     parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')

From 70a6bfbd2181eedfaa5288922e6a208b225f0418 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 11:35:28 -0400
Subject: [PATCH 20/28] update pipeline diagram, remove UNRESOLVED_TYPE for ME DELs

---
 terra_pipeline_diagram.jpg | 4 ++--
 wdl/CleanVcfChromosome.wdl | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/terra_pipeline_diagram.jpg b/terra_pipeline_diagram.jpg
index e3af1c2f7..226f13c2c 100644
--- a/terra_pipeline_diagram.jpg
+++ b/terra_pipeline_diagram.jpg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7312ab3c1e136b98f3721606ecec017a1a2f6787144aaaf12fe16ab00a9a6a53
-size 295450
+oid sha256:e7b763bd8386fb082264ae43a5ed9ca922b4dd9763f58e079d2028375fb8e8af
+size 307785
diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index 41df87262..f360f66fb 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -690,6 +690,7 @@ for record in fin:
     record.stop = record.info['END2']
     record.info.pop("CHR2")
     record.info.pop("END2")
+    record.info.pop("UNRESOLVED_TYPE")
     if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
         record.alts = ('<DEL:ME:LINE1>',)
     if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
         record.alts = ('<DEL:ME:HERVK>',)
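The CleanVcfChromosome hunk above drops `UNRESOLVED_TYPE` alongside `CHR2`/`END2` when a breakend-style record is rewritten as a mobile-element deletion. A hedged pysam sketch of the same INFO clean-up pattern — paths are placeholders, and keys are popped defensively only when present:

```python
import pysam

vcf = pysam.VariantFile("input.vcf.gz")
out = pysam.VariantFile("output.vcf.gz", "w", header=vcf.header)
for record in vcf:
    # drop fields that no longer apply once the record becomes an ME DEL
    for key in ("CHR2", "END2", "UNRESOLVED_TYPE"):
        if key in record.info:
            record.info.pop(key)
    out.write(record)
out.close()
```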
From 97b97cbf4b89e6b7863a144339f4f7fbe7729900 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 13:34:28 -0400
Subject: [PATCH 21/28] 1 cpu for rescue me dels

---
 wdl/CleanVcfChromosome.wdl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index f360f66fb..a14ffa8c4 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -620,14 +620,10 @@ task RescueMobileElementDeletions {
   }

   Float input_size = size(vcf, "GiB")
-  # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor:
-  # in tests on large VCFs, memory usage is ~1.0 * input VCF size
-  # the biggest disk usage is at the end of the task, with input + output VCF on disk
-  Int cpu_cores = 2 # speed up compression / decompression of VCFs
   RuntimeAttr runtime_default = object {
     mem_gb: 3.75 + input_size * 1.5,
     disk_gb: ceil(100.0 + input_size * 3.0),
-    cpu_cores: cpu_cores,
+    cpu_cores: 1,
     preemptible_tries: 3,
     max_retries: 1,
     boot_disk_gb: 10

From 256e24d1b3ad118a39d5e342102fd6b6aa010d8d Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 14:46:10 -0400
Subject: [PATCH 22/28] include existing no-call GTs in NCR calculation

---
 src/sv-pipeline/scripts/apply_sl_filter.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py
index 74187eb01..e0fa85c66 100644
--- a/src/sv-pipeline/scripts/apply_sl_filter.py
+++ b/src/sv-pipeline/scripts/apply_sl_filter.py
@@ -114,6 +114,8 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         n_samples += 1
         sl = gt.get('SL', None)
         if sl is None:
+            if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']):
+                n_no_call += 1
             continue
         gt_is_ref = _is_hom_ref(gt['GT'])
         # SL metrics are over non-ref / no-call samples
@@ -128,6 +130,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter)
         if fails_filter:
             gt['GT'] = _filter_gt(gt['GT'], allele)
+            if _is_no_call('GT'): n_no_call += 1

     # Annotate metrics
     record.info['NCN'] = n_no_call
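PATCH 22/28 extends the no-call number (NCN) to cover genotypes that arrive without an SL annotation, not just genotypes nulled by the SL filter. Note that the second hunk's `_is_no_call('GT')` passes the literal string rather than the genotype tuple; PATCH 26/28 below corrects this. A minimal sketch of the intended bookkeeping — the helper name and GT representation are assumptions:

```python
def is_no_call(gt_alleles):
    """True if every allele in the GT tuple is missing (None)."""
    return all(a is None for a in gt_alleles)

# two of four genotypes are no-call, whether pre-existing or filter-nulled
genotypes = [(0, 1), (None, None), (0, 0), (None, None)]
ncn = sum(1 for gt in genotypes if is_no_call(gt))
print(ncn, ncn / len(genotypes))  # 2 0.5  (NCR = NCN / N)
```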
From 15282bc7247fe79b10127a64dd6a5e9c3a730d81 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 15:47:59 -0400
Subject: [PATCH 23/28] svconcordance should use cpx refined vcf as input now

---
 .../cohort_mode/workflow_configurations/SVConcordance.json.tmpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
index ff48489ac..1c9dfed26 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
@@ -2,7 +2,7 @@
   "SVConcordance.gatk_docker": "${workspace.gatk_docker}",
   "SVConcordance.sv_base_mini_docker": "${workspace.sv_base_mini_docker}",

-  "SVConcordance.eval_vcf" : "${this.cleaned_vcf}",
+  "SVConcordance.eval_vcf" : "${this.cpx_refined_vcf}",
   "SVConcordance.truth_vcf" : "${this.joined_raw_calls_vcf}",
   "SVConcordance.output_prefix": "${this.sample_set_set_id}",

From 028fc6852b180dea0774f1d227d78641c578f69a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 16:56:21 -0400
Subject: [PATCH 24/28] reduce resources for a few tasks based on monitoring

---
 wdl/CollectLargeCNVSupportForCPX.wdl |  4 ++--
 wdl/CollectPEMetricsForCPX.wdl       |  2 +-
 wdl/CollectPEMetricsPerBatchCPX.wdl  |  4 ++--
 wdl/RefineComplexVariants.wdl        | 24 ++++++++++++------------
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl
index 9675bac4d..6c943238a 100644
--- a/wdl/CollectLargeCNVSupportForCPX.wdl
+++ b/wdl/CollectLargeCNVSupportForCPX.wdl
@@ -189,9 +189,9 @@ task GenerateCnvSegmentFromCpx {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
diff --git a/wdl/CollectPEMetricsForCPX.wdl b/wdl/CollectPEMetricsForCPX.wdl
index 746f6a907..9a6fbd0c1 100644
--- a/wdl/CollectPEMetricsForCPX.wdl
+++ b/wdl/CollectPEMetricsForCPX.wdl
@@ -70,7 +70,7 @@ task CalcuPEStat {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
         boot_disk_gb: 10,
         preemptible_tries: 3,
diff --git a/wdl/CollectPEMetricsPerBatchCPX.wdl b/wdl/CollectPEMetricsPerBatchCPX.wdl
index 655959748..3cc65dbeb 100644
--- a/wdl/CollectPEMetricsPerBatchCPX.wdl
+++ b/wdl/CollectPEMetricsPerBatchCPX.wdl
@@ -66,7 +66,7 @@ task CollectPEMetrics {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: ceil(30.0 + size(PE_metric, "GiB") * 3),
         boot_disk_gb: 10,
         preemptible_tries: 3,
@@ -123,7 +123,7 @@ task ConcatEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
         boot_disk_gb: 10,
         preemptible_tries: 3,
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
index da75ef7c3..4340ef253 100644
--- a/wdl/RefineComplexVariants.wdl
+++ b/wdl/RefineComplexVariants.wdl
@@ -218,9 +218,9 @@ task GetSampleBatchPEMap {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10 + 2*ceil(size(flatten([batch_sample_lists]), "GB")),
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -280,9 +280,9 @@ task ReviseVcf {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 7.5,
+        mem_gb: 3.75,
         disk_gb: ceil(5.0 + size(vcf_file, "GB")*3),
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -425,9 +425,9 @@ task SplitCpxCtx {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 7.5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -481,9 +481,9 @@ task GenerateCpxReviewScript {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -537,9 +537,9 @@ task CalculateCpxEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -691,9 +691,9 @@ task CalculateCtxEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
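The resource reductions above pin most tasks to 1 CPU and 3.75 GB of memory while letting disk scale with input size. A hedged illustration of that sizing rule — the function name is made up, and the constants mirror the diffs:

```python
import math

def default_runtime_attrs(input_gib):
    """Sketch: fixed small memory, disk proportional to input size."""
    return {
        "cpu_cores": 1,
        "mem_gb": 3.75,
        "disk_gb": math.ceil(5.0 + input_gib * 3.0),  # input + output + slack
        "boot_disk_gb": 10,
    }

print(default_runtime_attrs(12.5))  # disk_gb -> 43
```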
From 1034a5c87a3e4403320f6066b9d632a4fb6a2c1a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Thu, 24 Oct 2024 16:27:21 -0400
Subject: [PATCH 25/28] select PASS for MainVcfQc

---
 wdl/FilterGenotypes.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl
index a703862f1..74375aec5 100644
--- a/wdl/FilterGenotypes.wdl
+++ b/wdl/FilterGenotypes.wdl
@@ -35,7 +35,7 @@ workflow FilterGenotypes {
     Array[Array[String]]? sample_level_comparison_datasets  # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs]
     File? sample_renaming_tsv  # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort
     Boolean run_qc = true
-    String qc_bcftools_preprocessing_options = "-e 'FILTER~\"UNRESOLVED\" || FILTER~\"HIGH_NCR\"'"
+    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\"'"
     Int qc_sv_per_shard = 2500
     Int qc_samples_per_shard = 600
     RuntimeAttr? runtime_override_plot_qc_per_family

From 6e1c22ce0eec46bd1af222d24e83782d790567bd Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Fri, 25 Oct 2024 15:27:48 -0400
Subject: [PATCH 26/28] fix ncr

---
 src/sv-pipeline/scripts/apply_sl_filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py
index e0fa85c66..c8640b861 100644
--- a/src/sv-pipeline/scripts/apply_sl_filter.py
+++ b/src/sv-pipeline/scripts/apply_sl_filter.py
@@ -130,7 +130,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter)
         if fails_filter:
             gt['GT'] = _filter_gt(gt['GT'], allele)
-            if _is_no_call('GT'): n_no_call += 1
+            if _is_no_call(gt['GT']): n_no_call += 1

     # Annotate metrics
     record.info['NCN'] = n_no_call
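Why the one-character fix in PATCH 26/28 matters: the buggy call tests the literal string `'GT'`, whose characters are never `None`, so no filtered genotype was ever counted as a no-call. A quick demonstration with an assumed helper matching the diff:

```python
def _is_no_call(gt_alleles):
    # assumed implementation: all alleles missing
    return all(a is None for a in gt_alleles)

gt = {"GT": (None, None)}
print(_is_no_call("GT"))      # False: iterates the characters 'G' and 'T'
print(_is_no_call(gt["GT"]))  # True: inspects the genotype tuple
```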
"gs://gatk-sv-ref-panel-1kg/outputs/mw-make-cohort-vcf-templates/CombineBatches/cluster-background-fail-lists/ref_panel_1kg.chr1.sr_background_fail.updated2.txt", @@ -1329,6 +1329,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrX.regenotyped.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrY.regenotyped.vcf.gz.tbi" ], + "complex_refined_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", + "complex_refined_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", "complex_resolve_bothside_pass_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_bothside_pass.updated3.txt", "complex_resolve_background_fail_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_background_fail.updated3.txt", "complex_resolve_vcfs": [ @@ -1383,8 +1385,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrX.resharded.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrY.resharded.vcf.gz.tbi" ], - "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz", - "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz.tbi", + "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz", + "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz.tbi", "contig_ploidy_model_tar": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/gcnv/ref_panel_1kg_v2-contig-ploidy-model.tar.gz", "counts": [ "gs://gatk-sv-ref-panel-1kg/outputs/tws-no-cram-conversion/GatherSampleEvidenceBatch/HG00096.counts.tsv.gz", @@ -1852,8 +1854,8 @@ "genotype_pesr_pesr_sepcutoff": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart1/GenotypePESRPart1/40ec6d76-dd1c-432d-bfab-bc4426d0b1ec/call-TrainRDGenotyping/TrainRDGenotyping/e5540a96-9072-4719-bcfb-afccdfec15c6/call-UpdateCutoff/cacheCopy/ref_panel_1kg.pesr.pesr_sepcutoff.txt", "genotyped_depth_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypeDepthPart2/GenotypeDepthPart2/0aafd752-e606-4196-86ac-41c1c3ce1eb2/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.depth.vcf.gz", "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", - "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz", - "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz.tbi", + "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz", + "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi", "genotype_filtered_qc_tarball": 
"gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index c8640b861..917b605fb 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -114,8 +114,6 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol n_samples += 1 sl = gt.get('SL', None) if sl is None: - if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']): - n_no_call += 1 continue gt_is_ref = _is_hom_ref(gt['GT']) # SL metrics are over non-ref / no-call samples @@ -130,7 +128,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter) if fails_filter: gt['GT'] = _filter_gt(gt['GT'], allele) - if _is_no_call(gt['GT']): + if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']): n_no_call += 1 # Annotate metrics record.info['NCN'] = n_no_call From 50f78bc7ff124fc0fb458a967b7316c504879bc7 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 28 Oct 2024 18:32:02 -0400 Subject: [PATCH 28/28] include multiallelics in default qc; update qc tarball for ref panel --- inputs/values/ref_panel_1kg.json | 2 +- wdl/FilterGenotypes.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json index e50439b34..f317967a5 100644 --- a/inputs/values/ref_panel_1kg.json +++ b/inputs/values/ref_panel_1kg.json @@ -1856,7 +1856,7 @@ "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz", "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi", - "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", + "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", "manta_vcfs": [ diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 74375aec5..8926bc31e 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -35,7 +35,7 @@ workflow FilterGenotypes { Array[Array[String]]? sample_level_comparison_datasets # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs] File? 
From 50f78bc7ff124fc0fb458a967b7316c504879bc7 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 28 Oct 2024 18:32:02 -0400
Subject: [PATCH 28/28] include multiallelics in default qc; update qc tarball
 for ref panel

---
 inputs/values/ref_panel_1kg.json | 2 +-
 wdl/FilterGenotypes.wdl          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json
index e50439b34..f317967a5 100644
--- a/inputs/values/ref_panel_1kg.json
+++ b/inputs/values/ref_panel_1kg.json
@@ -1856,7 +1856,7 @@
   "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz",
   "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz",
   "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi",
-  "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz",
+  "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz",
   "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz",
   "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi",
   "manta_vcfs": [
diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl
index 74375aec5..8926bc31e 100644
--- a/wdl/FilterGenotypes.wdl
+++ b/wdl/FilterGenotypes.wdl
@@ -35,7 +35,7 @@ workflow FilterGenotypes {
     Array[Array[String]]? sample_level_comparison_datasets  # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs]
     File? sample_renaming_tsv  # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort
     Boolean run_qc = true
-    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\"'"
+    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\" || FILTER=\"MULTIALLELIC\"'"
     Int qc_sv_per_shard = 2500
     Int qc_samples_per_shard = 600
     RuntimeAttr? runtime_override_plot_qc_per_family
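The final default QC filter keeps both PASS and MULTIALLELIC records. A rough pysam rendering, for intuition only (the path is a placeholder), of the bcftools include expression `-i 'FILTER="PASS" || FILTER="MULTIALLELIC"'`:

```python
import pysam

vcf = pysam.VariantFile("filter_genotypes.vcf.gz")
for rec in vcf:
    filters = set(rec.filter.keys())
    if "PASS" in filters or "MULTIALLELIC" in filters:
        pass  # record would be included in MainVcfQc
```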