From 80d9a8db47732fb5fc5ecb1807d8898c3b837b22 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Thu, 17 Oct 2024 16:49:17 -0400
Subject: [PATCH 01/28] port RefineComplexVariants from ManualReview/FilterComplex

---
 .../reformat_CPX_bed_and_generate_script.py | 474 ++++++++++
 wdl/CollectLargeCNVSupportForCPX.wdl        | 287 ++++++
 wdl/CollectPEMetricsForCPX.wdl              | 102 ++
 wdl/CollectPEMetricsPerBatchCPX.wdl         | 205 +++++
 wdl/RefineComplexVariants.wdl               | 870 ++++++++++++++++++
 5 files changed, 1938 insertions(+)
 create mode 100644 src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
 create mode 100644 wdl/CollectLargeCNVSupportForCPX.wdl
 create mode 100644 wdl/CollectPEMetricsForCPX.wdl
 create mode 100644 wdl/CollectPEMetricsPerBatchCPX.wdl
 create mode 100644 wdl/RefineComplexVariants.wdl

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
new file mode 100644
index 000000000..f05e52d9b
--- /dev/null
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -0,0 +1,474 @@
+#!/usr/bin/env python
+# This script takes in CPX SVs in bed format, reformats them into svelter format,
+# and generates the script used to collect PE metrics.
+
+def header_pos_readin(input_bed):
+    fin=os.popen(r'''zcat %s'''%(input_bed))
+    pin=fin.readline().strip().split()
+    out = {}
+    for i in range(len(pin)):
+        out[pin[i]] = i
+    fin.close()
+    return out
+
+def sample_pe_readin(sample_pe_file):
+    # sample_pe_file is a 2-column file: sample ID, then the path to that sample's PE metrics file
+    out = {}
+    fin=open(sample_pe_file)
+    for line in fin:
+        pin=line.strip().split()
+        if not pin[0] in out.keys():
+            out[pin[0]] = pin[1]
+    fin.close()
+    return out
+
+def SVID_sample_readin(input_bed,header_pos):
+    out={}
+    fin=os.popen(r'''zcat %s'''%(input_bed))
+    for line in fin:
+        pin=line.strip().split('\t')
+        if not pin[0][0]=='#':
+            out[pin[header_pos['name']]] = pin[header_pos['samples']]
+    fin.close()
+    return out
+
+def parse_segment(segment):
+    svtype, interval = segment.split('_')
+    chrom, coords = interval.split(':')
+    coord1, coord2 = coords.split('-')
+    return svtype, chrom, int(coord1), int(coord2)
+
+# segments: CPX_INTERVALS split by ','
+def extract_bp_list_for_inv_cnv_events(segments, cpx_type):
+    s0_svtype, s0_chrom, s0_c1, s0_c2 = parse_segment(segments[0])
+    breakpoints = [s0_chrom]+[s0_c1, s0_c2]
+    if not 'INVdup' in cpx_type:
+        for seg in segments[1:]:
+            seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg)
+            breakpoints.append(seg_c2)
+    else:
+        if cpx_type=='INVdup':
+            s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1])
+            if s1_c1 < breakpoints[2]:
+                breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]]
+        elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup':
+            s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2])
+            breakpoints += [s2_c1, s2_c2]
+    return breakpoints
+
+def extract_bp_list_for_ddup_events(coordinates, segments, small_sv_size_threshold):
+    #example of coordinates: ["chr1",1154129,1154133]
+    #example of segments: DUP_chr1:229987763-230011157
+    del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])]
+    # CW: this looks like it only gets the coordinates from the first DUP segment?
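+    # A worked trace (hypothetical values, reusing the examples in the comments
+    # above): with coordinates = ["chr1",1154129,1154133] and
+    # segments = ['DUP_chr1:229987763-230011157'], del_bp starts as
+    # ['chr1', 1154129, 1154133] and the loop below sets dup_bp to
+    # [229987763, 230011157] with INV_flag == 0. Since del_bp[2] < dup_bp[0]
+    # and the del_bp span (4 bp) is under small_sv_size_threshold, the event
+    # is encoded as ref 'ab' / alt 'bab': segment b (the duplicated interval)
+    # is also inserted at the del_bp site.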
+ #dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') + #dup_bp = [int(i) for i in dup_seg[1].split('-')] + #INV_flag = len([i for i in segments if "INV_" in i]) + # rewrite as: + INV_flag = 0 + for segment in segments: + if 'DUP_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + dup_bp = [seg_c1, seg_c2] + if 'INV_' in segment: + INV_flag = INV_flag + 1 + if 'DEL_' in segment: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) + del_bp = [seg_chrom, seg_c1, seg_c2] + #INV_flag == 0 : no INV involved in the insertion + #INV_flag > 0: INV involved + + if INV_flag==0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','cbc'] + else: + structures = ['ab','bab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','aba'] + else: + structures = ['ab','aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa'] + else: + return 'unresolved' + elif INV_flag>0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]>small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa^'] + else: + return 'unresolved' + return [breakpints, structures] + +def extract_bp_list_for_ins_idel(coordinates, segments, small_sv_size_threshold): + #example of coordinates: ["chr1",1154129,1154133] + #example of segments: DUP_chr1:229987763-230011157 + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INS_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + INV_flag = len([i for i in segments if "INV_" in i]) + #INV_flag == 0 : no INV involved in the insertion + #INV_flag > 0: INV involved + if INV_flag==0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','cbc'] + else: + structures = ['ab','bab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba'] + else: + structures = ['ab','aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa'] + elif INV_flag>0: + if del_bp[2] < dup_bp[0]: + breakpints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + 
elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpints = del_bp[:2] + dup_bp + structures = ['ab','b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] + structures = ['ab','aa^'] + return [breakpints, structures] + +def extract_bp_list_V4(coordinates, segments, small_sv_size_threshold): + del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] + dup_seg = [i for i in segments if 'INV_' in i][0].split('_')[1].split(':') + dup_bp = [int(i) for i in dup_seg[1].split('-')] + INV_flag = 1 + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','c^bc'] + else: + structures = ['ab','b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]]+dup_bp+del_bp[1:] + if del_bp[2]-del_bp[1]> small_sv_size_threshold: + structures = ['abc','aba^'] + else: + structures = ['ab','aba^'] + return [breakpoints, structures] + +def is_interchromosomal(pin, header_pos): + chrom = pin[0] + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + return True + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if seg2[0] != chrom: + return True + return False + +def cpx_SV_readin(input_bed, header_pos, unresolved): + unresolved_svids = [] + fin=os.popen(r'''zcat %s'''%(input_bed)) + out = [] + small_sv_size_threshold = 250 + for line in fin: + pin=line.strip().split('\t') + # label pidups as unresolved since manual inspection indicated low quality + if not pin[0][0]=="#": + if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: + unresolved_svids.append(pin[header_pos['name']]) + elif not is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV','INVdup','delINVdel', 'delINVdup','dupINVdel','dupINVdup']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) + if pin[header_pos['CPX_TYPE']] == 'delINV': + ref_alt = ['ab','b^'] + if pin[header_pos['CPX_TYPE']] == 'INVdel': + ref_alt = ['ab','a^'] + if pin[header_pos['CPX_TYPE']] == 'dupINV': + ref_alt = ['ab','aba^'] + if pin[header_pos['CPX_TYPE']] == 'INVdup': + ref_alt = ['ab','b^ab'] + if pin[header_pos['CPX_TYPE']] == 'delINVdel': + ref_alt = ['abc','b^'] + if pin[header_pos['CPX_TYPE']] == 'delINVdup': + ref_alt = ['abc','c^bc'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdel': + ref_alt = ['abc','aba^'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdup': + ref_alt = ['abc','ac^b^a^c'] + elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) + if cpx_info == 'unresolved': + unresolved_svids.append(pin[header_pos['name']]) + continue + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + elif 
pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + else: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_V4(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + out.append([breakpoints, ref_alt,pin[3]]) + else: + continue + fin.close() + with open(unresolved, 'w') as unres: + for svid in unresolved_svids: + unres.write(svid + "\n") + return out + +def cpx_inter_chromo_SV_readin(input_bed, header_pos): + fin=os.popen(r'''zcat %s'''%(input_bed)) + out = [] + chr_list = ['chr'+str(i) for i in range(1,23)]+['chrX','chrY'] + for line in fin: + pin=line.strip().split('\t') + if not pin[0][0]=="#": + if is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + seg1 = pin[:3] + seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]] + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + ref_alt = [pin[header_pos['CPX_TYPE']]] + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]]+seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['ab_c', 'c^b_c'] + else: + ref_alt = ['a_b', 'b^a_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]] + if int(seg1[2])-int(seg1[1])>250: + ref_alt = ['a_bc','a_ba^'] + else: + ref_alt = 
['a_b', 'a_ba^'] + out.append([bp, ref_alt, pin[3]]) + fin.close() + return out + +def cpx_sample_batch_readin(cpx_SV, SVID_sample,sample_pe, PE_evidence, out_file): + out = [] + flank_back = 1000 + flank_front = 100 + fo=open(out_file,'w') + for info in cpx_SV: + breakpints = info[0] + if info[1][0] == 'ab' and info[1][1]=='b^': #delINV + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='b^': #delINVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='c^bc': #delINVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aba^': #dupINV + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='ac^b^a^c': #dupINVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', 
'$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='aba^': #dupINVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='a^': #INVdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='b^ab': #INVdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aba': #dupINS + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='aba': #dupINSdel 
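+            # Note on the pattern used throughout this chain: each branch builds
+            # two shell commands (common_1/common_2, one per junction) that tabix
+            # a window around each breakpoint in the sample's PE metrics file,
+            # keep read pairs whose orientation and mate position match the
+            # expected junction (+/- then -/+ for the duplication-type junctions
+            # of this dupINSdel case), and append the hits, tagged with the SVID,
+            # to the PE evidence file. The 'PE_metrics' and 'sample' placeholders
+            # are substituted per carrier sample before the commands are written.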
+ common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='aa': #dupdel + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='bab': #INSdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'abc' and info[1][1]=='cbc': #delINSdup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1][0] == 'ab' and info[1][1]=='bb': #deldup + common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', 
'$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b','ba_b'] or info[1] == ['ab_c','cb_c']: #ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b','a_ba'] or info[1] == ['a_bc','a_ba']: #ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: #inverted insertion, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc','a_ba^']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', 
breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['CTX_PQ/QP']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + elif info[1] == ['CTX_PP/QQ']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] + else: + print(info) + samples = SVID_sample[info[2]] + if not samples =='' and not samples=='NA': + sample_list = samples.split(',') + pe_metrics_list = [sample_pe[samp] for samp in sample_list] + for num in range(len(sample_list)): + common_1[1] = pe_metrics_list[num] + common_2[1] = pe_metrics_list[num] + common_1[4] = sample_list[num] + common_2[4] = sample_list[num] + write_1 = ' '.join(common_1) + write_2 = ' '.join(common_2) + print(write_1, file=fo) + print(write_2, file=fo) + + fo.close() + return out + +def write_cpx_command(cpx_command, out_file): + fo=open(out_file,'w') + for i in cpx_command: + print(i, file=fo) + fo.close() + +def write_cpx_SVs(cpx_SV, out_file): + fo=open(out_file,'w') + for i in cpx_SV: + out = [i[2],':'.join([str(j) for j in i[0]])]+i[1] + print('\t'.join(out), file=fo) + fo.close() + +def main(): + """ + Command-line main block + """ + + # Parse command line arguments and options + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('-i','--input', required=True, help='input file of CPX events in bed format') + parser.add_argument('-s','--sample_pe', required=True, help='2 column file with sample ID and PE metrics information') + parser.add_argument('-p','--pe_evidence', required=True, help='name of file to store collected PE metrics') + parser.add_argument('-c','--command_script', 
required=True, help='name of file for the script to collect PE evidence')
+    parser.add_argument('-r','--reformat_SV', required=True, help='reformatted SV in svelter format')
+    parser.add_argument('-u','--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround)')  # TODO
+    args = parser.parse_args()
+
+    input_bed = args.input
+    sample_pe_file = args.sample_pe
+    PE_evidence = args.pe_evidence
+    command_script = args.command_script
+    reformatted_SV = args.reformat_SV
+
+    header_pos = header_pos_readin(input_bed)
+    sample_pe = sample_pe_readin(sample_pe_file)
+    SVID_sample = SVID_sample_readin(input_bed, header_pos)
+
+    cpx_SV = cpx_SV_readin(input_bed, header_pos, args.unresolved)
+    cpx_inter_chromo_SV = cpx_inter_chromo_SV_readin(input_bed, header_pos)
+    write_cpx_SVs(cpx_SV+cpx_inter_chromo_SV, reformatted_SV)
+
+    cpx_sample_batch_readin(cpx_SV+cpx_inter_chromo_SV, SVID_sample, sample_pe, PE_evidence, command_script)
+
+import os
+import sys
+import argparse
+if __name__ == '__main__':
+    main()
+
diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl
new file mode 100644
index 000000000..ba925c9b1
--- /dev/null
+++ b/wdl/CollectLargeCNVSupportForCPX.wdl
@@ -0,0 +1,287 @@
+version 1.0
+
+import "Structs.wdl"
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+
+workflow CollectLargeCNVSupportForCPX {
+    input {
+        File cpx_ctx_bed
+        String prefix
+
+        File sample_depth_calls
+        Array[String] batch_name_list
+        Array[File] Depth_DEL_beds
+        Array[File] Depth_DUP_beds
+
+        String sv_base_mini_docker
+        String sv_pipeline_docker
+
+        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
+        RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
+        RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
+        RuntimeAttr? runtime_attr_concat_bed_Step1
+        RuntimeAttr? 
runtime_attr_concat_bed_Step2 + + } + + call GenerateCnvSegmentFromCpx { + input: + bed = cpx_ctx_bed, #input file including cpx calls in bed format; + sample_depth_calls = sample_depth_calls, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_generate_cnv_segments_from_cpx + } + + scatter (i in range(length(batch_name_list))){ + call ExtractCpxLgCnvByBatch { + input: + bed_gz = GenerateCnvSegmentFromCpx.cpx_lg_cnv, + batch_name = batch_name_list[i], + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_extract_cpx_lg_cnv_by_batch + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_del { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_del, + raw_depth_bed = Depth_DEL_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call SeekDepthSuppForCpx as seek_depth_supp_for_cpx_dup { + input: + cpx_lg_cnv = ExtractCpxLgCnvByBatch.lg_cnv_dup, + raw_depth_bed = Depth_DUP_beds[i], + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_seek_depth_supp_for_cpx + } + + call MiniTasks.ConcatBeds as concat_beds_svtype { + input: + shard_bed_files = [seek_depth_supp_for_cpx_del.cpx_cnv_depth_supp, seek_depth_supp_for_cpx_dup.cpx_cnv_depth_supp], + prefix = "~{batch_name_list[i]}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step1 + } + } + + + call MiniTasks.ConcatBeds as concat_beds_across_batches { + input: + shard_bed_files = concat_beds_svtype.merged_bed_file, + prefix = "~{prefix}.lg_cnv.depth_supp", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_bed_Step2 + } + + output { + File lg_cnv_depth_supp = concat_beds_across_batches.merged_bed_file + } +} + +task SeekDepthSuppForCpx { + input { + File cpx_lg_cnv + File raw_depth_bed + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    String prefix = basename(cpx_lg_cnv, '.bed.gz')
+
+    output {
+        File cpx_cnv_depth_supp = "~{prefix}.depth_supp.bed.gz"
+    }
+    command <<<
+
+        set -eo pipefail
+
+        zcat ~{cpx_lg_cnv} | awk '{print $6}' | sort | uniq > sample_list.tsv
+
+        echo -e '#chr\tpos\tend\tSVTYPE\tSVID\tsample\tbatch\tdepth_cov' > ~{prefix}.depth_supp.bed
+
+        while read sample_name; do
+            zcat ~{cpx_lg_cnv} | awk -v sample="$sample_name" '{if ($6==sample) print}' > query.bed
+            zcat ~{raw_depth_bed} | awk -v sample="$sample_name" '{if ($5==sample) print}' > ref.bed
+            bedtools coverage -a query.bed -b ref.bed \
+                | awk '{print $1,$2,$3,$4,$5,$6,$7,$NF}' \
+                | sed -e 's/ /\t/g' \
+                >> ~{prefix}.depth_supp.bed
+        done < sample_list.tsv
+
+        bgzip ~{prefix}.depth_supp.bed
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+
+task ExtractCpxLgCnvByBatch {
+    input {
+        File bed_gz
+        String batch_name
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        File lg_cnv_del = "~{batch_name}.lg_cnv.DEL.bed.gz"
+        File lg_cnv_dup = "~{batch_name}.lg_cnv.DUP.bed.gz"
+    }
+    command <<<
+
+        set -eo pipefail
+
+        zcat ~{bed_gz} | awk '{if ($4=="DEL" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DEL.bed
+        zcat ~{bed_gz} | awk '{if ($4=="DUP" && $7=="~{batch_name}") print}' > ~{batch_name}.lg_cnv.DUP.bed
+        bgzip ~{batch_name}.lg_cnv.DEL.bed
+        bgzip ~{batch_name}.lg_cnv.DUP.bed
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task GenerateCnvSegmentFromCpx {
+    input {
+        File bed
+        # sample depth calls is a map of sample name to batch ID
+        File sample_depth_calls
+        String sv_pipeline_docker
+        Int min_depth_size = 5000
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: 10,
+        boot_disk_gb: 30,
+        preemptible_tries: 1,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+
+    String prefix = basename(bed, ".bed.gz")
+
+    command <<<
+        set -euo pipefail
+
+        python <<CODE
+import os
+
+def sample_depth_calls_readin(sample_depth):
+    # sample_depth is a 2-column file mapping sample ID to batch name; values
+    # are kept as lists so callers can take element [0]
+    # (body is an assumed implementation reconstructed from its call sites)
+    out = {}
+    fin = open(sample_depth)
+    for line in fin:
+        pin = line.strip().split()
+        if not pin[0] in out.keys():
+            out[pin[0]] = pin[1:]
+    fin.close()
+    return out
+
+def readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size):
+    # Collect the DEL/DUP segments of each CPX call as
+    # [chrom, start, end, svtype, SVID, sample, batch]; the segment parsing is
+    # an assumed implementation reconstructed to match the loop below
+    CPX_CNV = []
+    f_bed = os.popen(r'''zcat %s'''%(bed_input))
+    for line in f_bed:
+        pin = line.strip().split('\t')
+        if pin[0][0]=='#':
+            pos_name = pin.index('name')
+            pos_SAMPLES = pin.index('samples')
+            pos_CPX_INTERVALS = pin.index('CPX_INTERVALS')
+        else:
+            interval = []
+            if not pin[pos_CPX_INTERVALS] in ['NA', '']:
+                for seg in pin[pos_CPX_INTERVALS].split(','):
+                    svtype = seg.split('_')[0]
+                    if svtype in ['DEL', 'DUP']:
+                        chrom = seg.split('_')[1].split(':')[0]
+                        coords = seg.split('_')[1].split(':')[1].split('-')
+                        interval.append([chrom, int(coords[0]), int(coords[1]), svtype, pin[pos_name]])
+            if len(interval)>0:
+                for i in interval:
+                    if i[2]-i[1]>=min_depth_size:
+                        sample_names = pin[pos_SAMPLES].split(',')
+                        if i[3]=="DEL":
+                            for j in sample_names:
+                                CPX_CNV.append(i+[j, sample_depth_hash[j][0]])
+                        if i[3]=="DUP":
+                            for j in sample_names:
+                                CPX_CNV.append(i+[j, sample_depth_hash[j][0]])
+    f_bed.close()
+    return CPX_CNV
+
+def write_cpx_cnv(CPX_CNV, fileout):
+    fo=open(fileout, 'w')
+    for info in CPX_CNV:
+        print('\t'.join([str(i) for i in info]), file=fo)
+    fo.close()
+
+bed_input = "~{bed}"
+fileout = "~{prefix}.lg_CNV.bed"
+sample_depth = "~{sample_depth_calls}"
+min_depth_size = ~{min_depth_size}
+sample_depth_hash = sample_depth_calls_readin(sample_depth)
+CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size)
+write_cpx_cnv(CPX_CNV, fileout)
+CODE
+
+        bgzip "~{prefix}.lg_CNV.bed"
+    >>>
+
+    output {
+        File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
diff --git a/wdl/CollectPEMetricsForCPX.wdl b/wdl/CollectPEMetricsForCPX.wdl
new file mode 100644
index 000000000..746f6a907
--- /dev/null
+++ b/wdl/CollectPEMetricsForCPX.wdl
@@ -0,0 +1,102 @@
+version 1.0
+
+import "Structs.wdl"
+import "CollectPEMetricsPerBatchCPX.wdl" as per_batch
+
+workflow CollectPEMetricsForCPX {
+    input {
+        Array[String] batch_name_list
+        Array[File] PE_metrics
+        Array[File] PE_metrics_idxes
+        File PE_collect_script
+        String prefix
+        Int n_per_split
+        String sv_pipeline_docker
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_collect_pe
+        RuntimeAttr? runtime_attr_split_script
+        RuntimeAttr? runtime_attr_calcu_pe_stat
+        RuntimeAttr? 
runtime_attr_concat_evidence + } + + scatter (i in range(length(batch_name_list))) { + call per_batch.CollectPEMetricsPerBatchCPX { + input: + n_per_split = n_per_split, + prefix = "~{prefix}.~{batch_name_list[i]}", + batch_name = batch_name_list[i], + PE_metric = PE_metrics[i], + PE_metrics_idx = PE_metrics_idxes[i], + PE_collect_script = PE_collect_script, + sv_pipeline_docker = sv_pipeline_docker, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_collect_pe = runtime_attr_collect_pe, + runtime_attr_split_script = runtime_attr_split_script, + runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat, + runtime_attr_concat_evidence = runtime_attr_concat_evidence + } + } + + call per_batch.ConcatEvidences { + input: + evidences = CollectPEMetricsPerBatchCPX.evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_concat_evidence + } + + call CalcuPEStat { + input: + evidence = ConcatEvidences.concat_evidence, + prefix = prefix, + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_attr_calcu_pe_stat + } + + output { + File evidence = ConcatEvidences.concat_evidence + File evi_stat = CalcuPEStat.evi_stat + } +} + + +task CalcuPEStat { + input { + File evidence + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + zcat ~{evidence} | cut -f3,6- | uniq -c > ~{prefix}.evi_stat + bgzip ~{prefix}.evi_stat + >>> + + output { + File evi_stat = "~{prefix}.evi_stat.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} diff --git a/wdl/CollectPEMetricsPerBatchCPX.wdl b/wdl/CollectPEMetricsPerBatchCPX.wdl new file mode 100644 index 000000000..655959748 --- /dev/null +++ b/wdl/CollectPEMetricsPerBatchCPX.wdl @@ -0,0 +1,205 @@ +version 1.0 + +import "Structs.wdl" +workflow CollectPEMetricsPerBatchCPX { + input { + Int n_per_split + String batch_name + File PE_metric + File PE_metrics_idx + File PE_collect_script + String prefix + String sv_base_mini_docker + String sv_pipeline_docker + RuntimeAttr? runtime_attr_collect_pe + RuntimeAttr? runtime_attr_concat_evidence + RuntimeAttr? runtime_attr_calcu_pe_stat + RuntimeAttr? 
runtime_attr_split_script
+    }
+
+    call SplitScripts {
+        input:
+            script = PE_collect_script,
+            n_per_split = n_per_split,
+            batch_name = batch_name,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_override = runtime_attr_split_script
+    }
+
+    scatter (script in SplitScripts.script_splits) {
+        call CollectPEMetrics {
+            input:
+                batch_name = batch_name,
+                PE_metric = PE_metric,
+                PE_metrics_idx = PE_metrics_idx,
+                PE_collect_script = script,
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_collect_pe
+        }
+    }
+
+    call ConcatEvidences {
+        input:
+            evidences = CollectPEMetrics.evidence,
+            prefix = prefix,
+            sv_base_mini_docker = sv_base_mini_docker,
+            runtime_attr_override = runtime_attr_concat_evidence
+    }
+
+    output {
+        File evidence = ConcatEvidences.concat_evidence
+    }
+}
+
+
+
+# collect PE metrics
+task CollectPEMetrics {
+    input {
+        String batch_name
+        File PE_metric
+        File PE_metrics_idx
+        File PE_collect_script
+        String sv_pipeline_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: ceil(30.0 + size(PE_metric, "GiB") * 3),
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    command <<<
+        set -euo pipefail
+        mkdir PE_metrics/
+        gsutil cp ~{PE_metric} ./
+        gsutil cp ~{PE_metrics_idx} ./
+        echo "Metrics successfully downloaded."
+        echo "Starting the evidence collection ..."
+
+        if [ $(wc -c < "~{PE_collect_script}") -gt 0 ]; then
+            bash ~{PE_collect_script}
+        fi
+
+        touch ~{batch_name}.evidence
+        touch ~{batch_name}.0.PE_evidences
+        for peEvFile in *.PE_evidences
+        do
+            cat ${peEvFile} >> ~{batch_name}.evidence
+        done
+
+        bgzip ~{batch_name}.evidence
+
+    >>>
+
+    output {
+        File evidence = "~{batch_name}.evidence.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_pipeline_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task ConcatEvidences {
+    input {
+        Array[File] evidences
+        String prefix
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 5,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    command <<<
+        set -euo pipefail
+
+        while read SPLIT; do
+            zcat $SPLIT
+        done < ~{write_lines(evidences)} \
+            | bgzip -c \
+            > ~{prefix}.evidence.gz
+
+    >>>
+
+    output {
+        File concat_evidence = "~{prefix}.evidence.gz"
+    }
+
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
+
+task SplitScripts {
+    input {
+        File script
+        String batch_name
+        Int n_per_split
+        String sv_base_mini_docker
+        RuntimeAttr? runtime_attr_override
+    }
+
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 3.75,
+        disk_gb: 10,
+        boot_disk_gb: 10,
+        preemptible_tries: 3,
+        max_retries: 1
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+
+    output {
+        Array[File] script_splits = glob("collect_PE_evidences.*")
+    }
+    command <<<
+
+        set -eo pipefail
+
+        awk '{if ($2~"~{batch_name}") print}' ~{script} > tmp_metrics.sh
+
+        if [ $(wc -c < "tmp_metrics.sh") -gt 0 ]; then
+            split --additional-suffix ".sh" -l ~{n_per_split} -a 6 tmp_metrics.sh collect_PE_evidences.
+        else
+            touch collect_PE_evidences.aaaaaa.sh
+        fi
+
+    >>>
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        docker: sv_base_mini_docker
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+    }
+}
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
new file mode 100644
index 000000000..7d773430a
--- /dev/null
+++ b/wdl/RefineComplexVariants.wdl
@@ -0,0 +1,870 @@
+version 1.0
+
+import "Structs.wdl"
+import "HailMerge.wdl" as hail
+import "TasksMakeCohortVcf.wdl" as MiniTasks
+import "CollectPEMetricsForCPX.wdl" as collect_pe_metrics_for_cpx
+import "CollectLargeCNVSupportForCPX.wdl" as collect_lg_cnv_supp_for_cpx
+import "Utils.wdl" as util
+
+workflow RefineComplexVariants {
+    input {
+        File vcf
+        File vcf_idx
+
+        Array[String] batch_name_list
+        Array[File] PE_metrics
+        Array[File] PE_metrics_idxes
+        Array[File] Depth_DEL_beds
+        Array[File] Depth_DUP_beds
+
+        String prefix
+        Int n_per_split
+        File sample_PE_metrics
+        File sample_depth_calls
+        File? script_generate_cpx_review_script
+
+        Boolean use_hail = false
+        String? gcs_project # required if use_hail = true
+
+        String sv_base_mini_docker
+        String sv_pipeline_docker
+        String linux_docker
+
+        RuntimeAttr? runtime_attr_vcf2bed
+        RuntimeAttr? runtime_attr_collect_pe
+        RuntimeAttr? runtime_attr_scatter_vcf
+        RuntimeAttr? runtime_attr_split_script
+        RuntimeAttr? runtime_attr_calcu_pe_stat
+        RuntimeAttr? runtime_attr_split_cpx_ctx
+        RuntimeAttr? runtime_attr_concat_evidence
+        RuntimeAttr? runtime_attr_concat
+        RuntimeAttr? runtime_attr_preconcat
+        RuntimeAttr? runtime_attr_hail_merge
+        RuntimeAttr? runtime_attr_fix_header
+        RuntimeAttr? runtime_attr_generate_cpx_review_script
+        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
+        RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
+        RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
+        RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
+        RuntimeAttr? runtime_attr_concat_bed_Step1
+        RuntimeAttr? runtime_attr_concat_bed_Step2
+        RuntimeAttr? runtime_attr_calcu_cpx_evidences
+    }
+
+
+    call MiniTasks.ScatterVcf {
+        input:
+            vcf = vcf,
+            prefix = prefix,
+            records_per_shard = n_per_split,
+            sv_pipeline_docker = sv_pipeline_docker,
+            runtime_attr_override = runtime_attr_scatter_vcf
+    }
+
+    scatter (i in range(length(ScatterVcf.shards))) {
+
+        call util.VcfToBed {
+            input:
+                vcf_file = ScatterVcf.shards[i],
+                args = "-i ALL --include-filters",
+                variant_interpretation_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_vcf2bed
+        }
+
+        call SplitCpxCtx {
+            input:
+                bed = VcfToBed.bed_output,
+                sv_base_mini_docker = sv_base_mini_docker,
+                runtime_attr_override = runtime_attr_split_cpx_ctx
+        }
+
+        call collect_lg_cnv_supp_for_cpx.CollectLargeCNVSupportForCPX {
+            input:
+                cpx_ctx_bed = SplitCpxCtx.cpx_ctx_bed,
+                prefix = "~{prefix}.~{i}",
+
+                sample_depth_calls = sample_depth_calls,
+                batch_name_list = batch_name_list,
+                Depth_DEL_beds = Depth_DEL_beds,
+                Depth_DUP_beds = Depth_DUP_beds,
+
+                sv_base_mini_docker = sv_base_mini_docker,
+                sv_pipeline_docker = sv_pipeline_docker,
+
+                runtime_attr_concat_bed_Step1 = runtime_attr_concat_bed_Step1,
+                runtime_attr_concat_bed_Step2 = runtime_attr_concat_bed_Step2,
+                runtime_attr_seek_depth_supp_for_cpx = runtime_attr_seek_depth_supp_for_cpx,
+                runtime_attr_extract_cpx_lg_cnv_by_batch = runtime_attr_extract_cpx_lg_cnv_by_batch,
+                runtime_attr_generate_cnv_segments_from_cpx = runtime_attr_generate_cnv_segments_from_cpx
+        }
+
+        call GenerateCpxReviewScript {
+            input:
+                bed = SplitCpxCtx.cpx_ctx_bed,
+                sample_PE_metrics = sample_PE_metrics,
+                script = script_generate_cpx_review_script,
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_generate_cpx_review_script
+        }
+
+        call collect_pe_metrics_for_cpx.CollectPEMetricsForCPX {
+            input:
+                batch_name_list = batch_name_list,
+                PE_metrics = PE_metrics,
+                PE_metrics_idxes = PE_metrics_idxes,
+                PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script,
+                prefix = "~{prefix}.~{i}",
+                n_per_split = n_per_split,
+                sv_pipeline_docker = sv_pipeline_docker,
+                sv_base_mini_docker = sv_base_mini_docker,
+                runtime_attr_collect_pe = runtime_attr_collect_pe,
+                runtime_attr_split_script = runtime_attr_split_script,
+                runtime_attr_calcu_pe_stat = runtime_attr_calcu_pe_stat,
+                runtime_attr_concat_evidence = runtime_attr_concat_evidence
+        }
+
+        call CalculateCpxEvidences {
+            input:
+                PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script,
+                PE_supp = CollectPEMetricsForCPX.evi_stat,
+                depth_supp = CollectLargeCNVSupportForCPX.lg_cnv_depth_supp,
+                prefix = "~{prefix}.~{i}",
+                sv_pipeline_docker = sv_pipeline_docker,
+                runtime_attr_override = runtime_attr_calcu_cpx_evidences
+        }
+
+        call CalculateCtxEvidences {
+            input:
+                PE_collect_script = 
GenerateCpxReviewScript.pe_evidence_collection_script, + PE_supp = CollectPEMetricsForCPX.evi_stat, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_calcu_cpx_evidences + } + + call ReviseVcf { + input: + vcf_file = ScatterVcf.shards[i], + CPX_manual = CalculateCpxEvidences.manual_revise_CPX_results, + CTX_manual = CalculateCtxEvidences.manual_revise_CTX_results, + unresolved_svids = GenerateCpxReviewScript.unresolved_svids, + prefix = "~{prefix}.~{i}", + sv_pipeline_docker = sv_pipeline_docker + } + } + + if (use_hail) { + call hail.HailMerge { + input: + vcfs=ReviseVcf.revised_vcf, + prefix="~{prefix}.cpx_filtered", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_attr_preconcat, + runtime_override_hail_merge=runtime_attr_hail_merge, + runtime_override_fix_header=runtime_attr_fix_header + } + } + + if (!use_hail) { + call MiniTasks.ConcatVcfs { + input: + vcfs=ReviseVcf.revised_vcf, + vcfs_idx=ReviseVcf.revised_vcf_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.cpx_filtered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_concat + } + } + + call MiniTasks.ConcatHeaderedTextFiles { + input: + text_files = CalculateCpxEvidences.manual_revise_CPX_results, + output_filename = "~{prefix}.CPX_evidence.txt" , + linux_docker = linux_docker + + } + + File reviewed_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File reviewed_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) + + output { + File revised_output_vcf = reviewed_vcf + File revised_output_vcf_idx = reviewed_vcf_idx + File cpx_evidences = ConcatHeaderedTextFiles.out + } +} + + +task ReviseVcf { + input { + File vcf_file + File CPX_manual + File CTX_manual + File unresolved_svids + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7.5, + disk_gb: ceil(5.0 + size(vcf_file, "GB")*3), + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + set -euo pipefail + + python < 0: + return int(median(quals)) + +def CPX_manual_readin(CPX_manual): + out={} + fin=open(CPX_manual) + for line in fin: + pin=line.strip().split() + if not pin[0] in out.keys(): + out[pin[0]] = {} + out[pin[0]][pin[1]] = pin[2:] + fin.close() + return out + +def CTX_manual_readin(CTX_manual): + fin=open(CTX_manual) + out={} + for line in fin: + pin=line.strip().split('\t') + if not pin[0] in out.keys(): + out[pin[0]] = {} + if not pin[1] in out[pin[0]].keys(): + out[pin[0]][pin[1]] = pin[2:] + fin.close() + return out + + +def unresolved_readin(unresolved_svids): + svids = set() + with open(unresolved_svids, 'r') as inp: + for line in inp: + svids.add(line.strip()) + return svids + + +def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CTX_manual): + fin=pysam.VariantFile(vcf_input) + #revise vcf header + header = fin.header + fo=pysam.VariantFile(vcf_output, 'w', header = header) + for record in fin: + if record.id in unresolved_svids: + if 'PASS' in record.filter: + record.filter.clear() + record.filter.add('UNRESOLVED') + #label CPX with manual review results: + elif record.id in hash_CPX_manual.keys(): + unresolve_rec = 0 + for sample in hash_CPX_manual[record.id].keys(): + if sample in record.samples.keys(): + if hash_CPX_manual[record.id][sample][0] == 'no_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + elif hash_CPX_manual[record.id][sample][0] == 'low_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + elif hash_CPX_manual[record.id][sample][0] == 'partial_PE': + if hash_CPX_manual[record.id][sample][1] in ["NA","lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + unresolve_rec+=1 + elif hash_CPX_manual[record.id][sample][0] == 'high_PE': + if hash_CPX_manual[record.id][sample][1] in ["lack_depth","lack_depth,lack_depth", "lack_depth,depth","depth,lack_depth"]: + record.samples[sample]['GT'] = [None,None] + if not unresolve_rec/len(hash_CPX_manual[record.id].keys())<.5: + if 'PASS' in record.filter: + record.filter.clear() + record.filter.add('UNRESOLVED') + #label CTX with manual review results: + elif record.id in hash_CTX_manual.keys(): + if 'NA' in hash_CTX_manual[record.id].keys(): + record.filter.add('UNRESOLVED') + else: + for sample in hash_CTX_manual[record.id].keys(): + if sample in record.samples.keys(): + if hash_CTX_manual[record.id][sample][1] == 'Not_Enough_PE_Pairs': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'PARTIAL_PE': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'unbalanced_paternal_transmission_of_tloc,2nd_junction_is_missing': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'NON_SPECIFIC': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'Not_Enough_Coordinate_Span': + 
record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'Interrupted_PE_Patterns_when_sorted_on_2nd_breakpoint': + record.samples[sample]['GT'] = [None,None] + elif hash_CTX_manual[record.id][sample][1] == 'CTX_with_DEL': + record.stop = int(hash_CTX_manual[record.id][sample][0].split(':')[1].split('-')[1]) + # revisions to insertion-type complex events + if record.info['SVTYPE']=="CPX" and record.info['CPX_TYPE'] in ['dDUP','dDUP_iDEL','INS_iDEL']: + if record.info['CPX_TYPE']=='INS_iDEL': + record.info['CPX_INTERVALS'] = ','.join([x for x in record.info['CPX_INTERVALS']] + [record.info['SOURCE']]) + del record.info['SOURCE'] + elif record.info['SVTYPE']=="INS" and 'SOURCE' in record.info.keys() and "INV" in record.info['SOURCE']: + record.info['SVTYPE']="CPX" + record.alts = ('',) + del_section = record.stop - record.pos + if del_section<50: + record.info['CPX_TYPE']="dDUP" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE'] + else: + record.info['CPX_TYPE']="dDUP_iDEL" + record.info['CPX_INTERVALS'] = record.info['SOURCE'].replace('INV','DUP')+','+record.info['SOURCE']+','+"DEL_"+record.chrom+":"+str(record.pos)+'-'+str(record.stop) + del record.info['SOURCE'] + fo.write(record) # write out every record that was in the input - NCR will remove ones with no carriers left + fin.close() + fo.close() + +import os +import sys +from numpy import median +import pysam +import argparse + +#Define global variables +NULL_GTs = [(None, None), (None, )] +REF_GTs = [(0, 0), (0, ), (None, 2)] +NULL_and_REF_GTs = NULL_GTs + REF_GTs +HET_GTs = [(0, 1), (None, 1), (None, 3)] + +unresolved_svids = unresolved_readin("~{unresolved_svids}") +hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") +hash_CTX_manual = CTX_manual_readin("~{CTX_manual}") +print(len(hash_CPX_manual.keys())) +revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids) + +CODE + + tabix ~{prefix}.Manual_Revised.vcf.gz + >>> + + output { + File revised_vcf = "~{prefix}.Manual_Revised.vcf.gz" + File revised_vcf_idx = "~{prefix}.Manual_Revised.vcf.gz.tbi" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task SplitCpxCtx { + input { + File bed + String sv_base_mini_docker + RuntimeAttr? 
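
The per-sample branches in revise_vcf above collapse to a small decision table: the weaker PE tiers (no_PE, low_PE, partial_PE) lose the genotype whenever depth support is absent or unknown, while high_PE survives an unknown (NA) depth label. A distilled sketch of that rule, with the label strings copied from the code above (the partial_PE contribution to the record-level UNRESOLVED fraction is omitted here):

    # Strings mirroring ReviseVcf: depth labels that fail to corroborate a call.
    LACKS_DEPTH = {"NA", "lack_depth", "lack_depth,lack_depth",
                   "lack_depth,depth", "depth,lack_depth"}

    def should_null_gt(pe_supp: str, depth_supp: str) -> bool:
        """Return True if the sample's GT should be reset to ./. ."""
        if pe_supp in ("no_PE", "low_PE", "partial_PE"):
            return depth_supp in LACKS_DEPTH
        if pe_supp == "high_PE":
            # strong PE support survives an NA depth label, unlike the weaker tiers
            return depth_supp in (LACKS_DEPTH - {"NA"})
        return False

    assert should_null_gt("no_PE", "NA")
    assert not should_null_gt("high_PE", "NA")
    assert should_null_gt("high_PE", "lack_depth,depth")
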
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7.5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -eu + + zcat ~{bed} | head -1 > ~{prefix}.cpx_ctx.bed + + filterColumn=$(zcat ~{bed} | head -1 | tr "\t" "\n" | awk '$1=="FILTER" {print NR}') + + set -o pipefail + + zcat ~{bed} | awk 'NR > 1' | { grep CPX || true; } | awk -v filter_column=${filterColumn} '$filter_column !~ /UNRESOLVED/' >> ~{prefix}.cpx_ctx.bed + + zcat ~{bed} | awk 'NR > 1' | { grep CTX || true; } >> ~{prefix}.cpx_ctx.bed + + # INS with INV in SOURCE - will be converted to CPX later so need to evaluate evidence + zcat ~{bed} | awk 'NR > 1' | { grep INS || true; } | { grep INV || true; } >> ~{prefix}.cpx_ctx.bed + + bgzip ~{prefix}.cpx_ctx.bed + >>> + + output { + File cpx_ctx_bed = "~{prefix}.cpx_ctx.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_base_mini_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCpxReviewScript { + input { + File bed + File sample_PE_metrics + File? script + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + python ~{default="/opt/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py" script} \ + -i ~{bed} \ + -s ~{sample_PE_metrics} \ + -p CPX_CTX_disINS.PASS.PE_evidences \ + -c collect_PE_evidences.CPX_CTX_disINS.PASS.sh \ + -r ~{prefix}.svelter \ + -u ~{prefix}.unresolved_svids.txt + + >>> + + output { + File pe_evidence_collection_script = "collect_PE_evidences.CPX_CTX_disINS.PASS.sh" + File svelter = "~{prefix}.svelter" + File unresolved_svids = "~{prefix}.unresolved_svids.txt" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task GenerateCnvSegmentFromCpx { + input { + File bed + File sample_depth_calls + String sv_pipeline_docker + RuntimeAttr? 
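
SplitCpxCtx above keeps three classes of records for PE review: CPX calls not already filtered as UNRESOLVED, all CTX calls, and INS records whose line mentions INV (the candidates ReviseVcf converts to CPX). A rough Python equivalent of the awk/grep chain; note the original makes three passes over the bed, so this single-pass sketch assumes no row matches more than one class:

    import gzip

    def select_cpx_ctx(bed_gz):
        # yield the header, then rows needing PE review, per the shell logic above
        with gzip.open(bed_gz, "rt") as fin:
            header = fin.readline().rstrip("\n").split("\t")
            yield header
            filter_idx = header.index("FILTER")
            for line in fin:
                fields = line.rstrip("\n").split("\t")
                if "CPX" in line and "UNRESOLVED" not in fields[filter_idx]:
                    yield fields  # resolved complex events
                elif "CTX" in line:
                    yield fields  # translocations
                elif "INS" in line and "INV" in line:
                    yield fields  # INS with an inverted SOURCE, recast as CPX later
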
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + + String prefix = basename(bed, ".bed.gz") + + command <<< + set -euo pipefail + + python <0: + for i in interval: + if i[2]-i[1]>4999: + sample_names = pin[5].split(',') + if i[3]=="DEL": + for j in sample_names: + CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + if i[3]=="DUP": + for j in sample_names: + CPX_CNV.append(i+[j, sample_depth_hash[j][1]]) + f_bed.close() + return CPX_CNV + + def write_cpx_cnv(CPX_CNV, fileout): + fo=open(fileout, 'w') + for info in CPX_CNV: + print('\t'.join(info), file=fo) + fo.close() + + bed_input = "~{bed}" + fileout = "~{prefix}.lg_CNV.bed" + + sample_depth_hash = sample_depth_calls_readin(sample_depth) + CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash) + write_cpx_cnv(CPX_CNV, fileout) + CODE + + bgzip "~{prefix}.lg_CNV.bed" + >>> + + output { + File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCpxEvidences { + input { + File PE_collect_script + File PE_supp + File depth_supp + String prefix + String sv_pipeline_docker + RuntimeAttr? 
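
Parts of the GenerateCnvSegmentFromCpx heredoc above were lost in extraction, but the surviving loop pins down the rule: rearranged DEL/DUP intervals longer than 4999 bp fan out into one row per carrier sample, keyed to that sample's batch-specific depth calls. A compact sketch of the transform, with the field layout and helper name assumed:

    def cpx_intervals_to_cnv_rows(cpx_intervals, samples, min_size=5000):
        # cpx_intervals like "DUP_chr1:100-200,DEL_chr2:1000-90000" (CPX_INTERVALS)
        rows = []
        for seg in cpx_intervals.split(","):
            svtype, interval = seg.split("_", 1)
            chrom, coords = interval.split(":")
            start, end = (int(x) for x in coords.split("-"))
            if svtype in ("DEL", "DUP") and end - start >= min_size:
                rows.extend([chrom, start, end, svtype, s] for s in samples)
        return rows

    rows = cpx_intervals_to_cnv_rows("DUP_chr1:100-200,DEL_chr2:1000-90000", ["s1", "s2"])
    assert len(rows) == 2 and all(r[3] == "DEL" for r in rows)  # only the 89 kb DEL survives
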
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -eu + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep -v "_CTX_"| uniq > sample_SVID.tsv + + set -euo pipefail + + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>1 and PE_supp[svid][sample][1][0]>1: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def add_depth_supp(sample_svid_pe, Depth_supp): + for svid in sample_svid_pe.keys(): + if svid in Depth_supp.keys(): + for sample in sample_svid_pe[svid].keys(): + if sample in Depth_supp[svid].keys(): + depth_supp = 'lack_depth' + if Depth_supp[svid][sample][0]>.5: + depth_supp = 'depth' + if len(Depth_supp[svid][sample])>1: + if not Depth_supp[svid][sample][1]>.5: + depth_supp+=',lack_depth' + else: + if len(Depth_supp[svid][sample])>1: + if Depth_supp[svid][sample][1]>.5: + depth_supp+=',depth' + sample_svid_pe[svid][sample].append(depth_supp) + else: + sample_svid_pe[svid][sample].append('NA') + else: + for sample in sample_svid_pe[svid].keys(): + sample_svid_pe[svid][sample].append('NA') + return sample_svid_pe + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts','depth_supp']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("~{PE_supp}") + Depth_supp = Depth_supp_readin("~{depth_supp}") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + sample_svid_pe_depth = add_depth_supp(sample_svid_pe, Depth_supp) + write_pe_depth_supp(sample_svid_pe_depth, "~{prefix}.manual_revise.CPX_results") + + CODE + >>> + + output { + File manual_revise_CPX_results = "~{prefix}.manual_revise.CPX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + +task CalculateCtxEvidences { + input { + File PE_collect_script + File PE_supp + String prefix + String sv_pipeline_docker + RuntimeAttr? 
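
CalculateCpxEvidences reduces the PE counts collected at an event's two junctions to a four-level label, then appends a comma-joined depth verdict per CNV interval, where a copy-state fraction above 0.5 counts as support. The CTX task that follows applies the same scheme with a stricter high_PE cutoff in this version (> 2 rather than > 1). A sketch of both labelers:

    def pe_label(first_junction, second_junction, high_cutoff=1):
        # high_cutoff is 1 for CPX and 2 for CTX in this patch
        if first_junction > high_cutoff and second_junction > high_cutoff:
            return "high_PE"
        if first_junction > 0 and second_junction > 0:
            return "low_PE"
        if first_junction > 0 or second_junction > 0:
            return "partial_PE"
        return "no_PE"

    def depth_label(fractions):
        # one copy-state fraction per rearranged CNV interval
        return ",".join("depth" if f > 0.5 else "lack_depth" for f in fractions)

    assert pe_label(3, 0) == "partial_PE"
    assert depth_label([0.9, 0.2]) == "depth,lack_depth"
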
runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10, + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command<<< + + set -eu + + awk '{print $6, $(NF-3)}' ~{PE_collect_script} | grep "_CTX_"| uniq > sample_SVID.tsv + + set -eu + + zcat ~{PE_supp} | grep "_CTX_"| uniq > PE_supp.tsv + + set -euo pipefail + + python <0 or PE_supp[svid][sample][1][0]>0: + pe_supp = 'partial_PE' + if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: + pe_supp = 'low_PE' + if PE_supp[svid][sample][0][0]>2 and PE_supp[svid][sample][1][0]>2: + pe_supp = 'high_PE' + out[svid][sample] = [pe_supp] + else: + out[svid][sample] = ['no_PE'] + else: + for sample in sample_svid[svid]: + out[svid][sample] = ['no_PE'] + return out + + def write_pe_depth_supp(sample_svid_pe_depth, fileout): + fo=open(fileout,'w') + print('\t'.join(['VID','sample','supportive_PE_counts']), file=fo) + for svid in sample_svid_pe_depth.keys(): + for sample in sample_svid_pe_depth[svid].keys(): + print('\t'.join([svid,sample] + sample_svid_pe_depth[svid][sample]), file=fo) + fo.close() + + import os + sample_svid = sample_svid_readin("sample_SVID.tsv") + PE_supp = PE_supp_readin("PE_supp.tsv") + sample_svid_pe = add_PE_supp(sample_svid, PE_supp) + write_pe_depth_supp(sample_svid_pe, "~{prefix}.manual_revise.CTX_results") + + CODE + >>> + + output { + File manual_revise_CTX_results = "~{prefix}.manual_revise.CTX_results" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + From a550faa05c013d9cdd986e38ef359384a8028c0a Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 17:35:16 -0400 Subject: [PATCH 02/28] remove dependency on manually created files --- wdl/CollectLargeCNVSupportForCPX.wdl | 33 +++-- wdl/RefineComplexVariants.wdl | 186 +++++++++++---------------- 2 files changed, 92 insertions(+), 127 deletions(-) diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl index ba925c9b1..9675bac4d 100644 --- a/wdl/CollectLargeCNVSupportForCPX.wdl +++ b/wdl/CollectLargeCNVSupportForCPX.wdl @@ -8,7 +8,7 @@ workflow CollectLargeCNVSupportForCPX { File cpx_ctx_bed String prefix - File sample_depth_calls + File sample_batch_pe_map Array[String] batch_name_list Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds @@ -27,7 +27,7 @@ workflow CollectLargeCNVSupportForCPX { call GenerateCnvSegmentFromCpx { input: bed = cpx_ctx_bed, #input file including cpx calls in bed format; - sample_depth_calls = sample_depth_calls, + sample_batch_pe_map = sample_batch_pe_map, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cnv_segments_from_cpx } @@ -181,8 +181,7 @@ task ExtractCpxLgCnvByBatch { task GenerateCnvSegmentFromCpx { input { File bed - # sample depth calls is map of sample name to batch ID - File sample_depth_calls + File sample_batch_pe_map String sv_pipeline_docker Int min_depth_size = 5000 
RuntimeAttr? runtime_attr_override @@ -214,16 +213,15 @@ task GenerateCnvSegmentFromCpx { out = [info.split('_')[0], info.split('_')[1].split(':')[0]] + [int(i) for i in info.split('_')[1].split(':')[1].split('-')] return out[1:4]+[out[0]] - def sample_depth_calls_readin(sample_depth): - out = {} - fin=open(sample_depth) - for line in fin: - pin=line.strip().split() - out[pin[0]] = pin[1:] - fin.close() - return out + def get_sample_batch_map(sample_batch_pe_map): + sample_to_batch = {} + with open(sample_batch_pe_map, 'r') as inp: + for line in inp: + sample, batch, pe_file = line.strip().split("\t") + sample_to_batch[sample] = batch + return sample_to_batch - def readin_cpx_cnv(bed_input,sample_depth_hash,min_depth_size): + def readin_cpx_cnv(bed_input,sample_to_batch,min_depth_size): CPX_CNV = [] f_bed = os.popen(r'''zcat %s'''%(bed_input)) for line in f_bed: @@ -245,10 +243,10 @@ task GenerateCnvSegmentFromCpx { sample_names = pin[pos_SAMPLES].split(',') if i[3]=="DEL": for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + CPX_CNV.append(i+[j, sample_to_batch[j]]) if i[3]=="DUP": for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) + CPX_CNV.append(i+[j, sample_to_batch[j]]) f_bed.close() return CPX_CNV @@ -260,10 +258,9 @@ task GenerateCnvSegmentFromCpx { bed_input = "~{bed}" fileout = "~{prefix}.lg_CNV.bed" - sample_depth = "~{sample_depth_calls}" min_depth_size = ~{min_depth_size} - sample_depth_hash = sample_depth_calls_readin(sample_depth) - CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash, min_depth_size) + sample_to_batch = get_sample_batch_map("~{sample_batch_pe_map}") + CPX_CNV = readin_cpx_cnv(bed_input, sample_to_batch, min_depth_size) write_cpx_cnv(CPX_CNV, fileout) CODE diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 7d773430a..33e57ee26 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -17,11 +17,10 @@ workflow RefineComplexVariants { Array[File] PE_metrics_idxes Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds + Array[File] batch_sample_lists String prefix Int n_per_split - File sample_PE_metrics - File sample_depth_calls File? script_generate_cpx_review_script Boolean use_hail = false @@ -31,6 +30,7 @@ workflow RefineComplexVariants { String sv_pipeline_docker String linux_docker + RuntimeAttr? runtime_attr_sample_batch RuntimeAttr? runtime_attr_vcf2bed RuntimeAttr? runtime_attr_collect_pe RuntimeAttr? runtime_attr_scatter_vcf @@ -53,6 +53,14 @@ workflow RefineComplexVariants { RuntimeAttr? 
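
The generated map is a three-column TSV (sample, batch, PE metrics path): get_sample_batch_map above keeps only the batch column, and GenerateCpxReviewScript later in this patch cuts columns 1 and 3 back out for the PE script. A sketch reading the same file into both lookups at once (an illustrative consolidation, not pipeline code):

    def read_sample_batch_pe_map(path):
        sample_to_batch, sample_to_pe = {}, {}
        with open(path) as fin:
            for line in fin:
                sample, batch, pe_file = line.rstrip("\n").split("\t")
                sample_to_batch[sample] = batch
                sample_to_pe[sample] = pe_file
        return sample_to_batch, sample_to_pe
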
runtime_attr_calcu_cpx_evidences } + call GetSampleBatchPEMap { + input: + batch_name_list = batch_name_list, + batch_sample_lists = batch_sample_lists, + batch_pe_files = write_lines(PE_metrics), + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_attr_sample_batch + } call MiniTasks.ScatterVcf { input: @@ -85,7 +93,7 @@ workflow RefineComplexVariants { cpx_ctx_bed = SplitCpxCtx.cpx_ctx_bed, prefix = "~{prefix}.~{i}", - sample_depth_calls = sample_depth_calls, + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, batch_name_list = batch_name_list, Depth_DEL_beds = Depth_DEL_beds, Depth_DUP_beds = Depth_DUP_beds, @@ -103,7 +111,7 @@ workflow RefineComplexVariants { call GenerateCpxReviewScript { input: bed = SplitCpxCtx.cpx_ctx_bed, - sample_PE_metrics = sample_PE_metrics, + sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, script = script_generate_cpx_review_script, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cpx_review_script @@ -200,6 +208,67 @@ workflow RefineComplexVariants { } +task GetSampleBatchPEMap { + input { + Array[File] batch_sample_lists + Array[String] batch_name_list + File batch_pe_files + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 5, + disk_gb: 10 + 2*ceil(size(flatten([batch_sample_lists]), "GB")), + boot_disk_gb: 30, + preemptible_tries: 1, + max_retries: 1 + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + + python <>> + + output { + File sample_batch_pe_map = "~{prefix}.sample_batch_pe_map.tsv" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + task ReviseVcf { input { File vcf_file @@ -444,7 +513,7 @@ task SplitCpxCtx { task GenerateCpxReviewScript { input { File bed - File sample_PE_metrics + File sample_batch_pe_map File? script String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -466,9 +535,11 @@ task GenerateCpxReviewScript { command <<< set -euo pipefail + cut -f1,3 ~{sample_batch_pe_map} > sample_PE_metrics.tsv + python ~{default="/opt/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py" script} \ -i ~{bed} \ - -s ~{sample_PE_metrics} \ + -s sample_PE_metrics.tsv \ -p CPX_CTX_disINS.PASS.PE_evidences \ -c collect_PE_evidences.CPX_CTX_disINS.PASS.sh \ -r ~{prefix}.svelter \ @@ -493,109 +564,6 @@ task GenerateCpxReviewScript { } } -task GenerateCnvSegmentFromCpx { - input { - File bed - File sample_depth_calls - String sv_pipeline_docker - RuntimeAttr? 
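
The GetSampleBatchPEMap heredoc did not survive extraction above, but its inputs and consumers fix the contract: pair batch_name_list, batch_sample_lists, and the write_lines(PE_metrics) file positionally, then emit one sample/batch/PE-path row per sample. A hypothetical reconstruction under that assumption (the positional pairing of the three arrays is implied by the call site, not verified):

    # Hypothetical sketch of GetSampleBatchPEMap's embedded script (original elided).
    # Assumes batch_names, batch_sample_lists, and the PE paths share one ordering.
    def write_sample_batch_pe_map(batch_names, batch_sample_lists, pe_paths_file, out_path):
        with open(pe_paths_file) as fin:
            pe_paths = [line.strip() for line in fin]
        with open(out_path, "w") as out:
            for batch, sample_list, pe_path in zip(batch_names, batch_sample_lists, pe_paths):
                with open(sample_list) as samples:
                    for sample in samples:
                        print(f"{sample.strip()}\t{batch}\t{pe_path}", file=out)
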
runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 5, - disk_gb: 10, - boot_disk_gb: 30, - preemptible_tries: 1, - max_retries: 1 - } - - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - - String prefix = basename(bed, ".bed.gz") - - command <<< - set -euo pipefail - - python <0: - for i in interval: - if i[2]-i[1]>4999: - sample_names = pin[5].split(',') - if i[3]=="DEL": - for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][0]]) - if i[3]=="DUP": - for j in sample_names: - CPX_CNV.append(i+[j, sample_depth_hash[j][1]]) - f_bed.close() - return CPX_CNV - - def write_cpx_cnv(CPX_CNV, fileout): - fo=open(fileout, 'w') - for info in CPX_CNV: - print('\t'.join(info), file=fo) - fo.close() - - bed_input = "~{bed}" - fileout = "~{prefix}.lg_CNV.bed" - - sample_depth_hash = sample_depth_calls_readin(sample_depth) - CPX_CNV = readin_cpx_cnv(bed_input, sample_depth_hash) - write_cpx_cnv(CPX_CNV, fileout) - CODE - - bgzip "~{prefix}.lg_CNV.bed" - >>> - - output { - File cpx_lg_cnv = "~{prefix}.lg_CNV.bed.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - task CalculateCpxEvidences { input { File PE_collect_script From b1496861b30941fdbea35e6ffd0fd6a6112b8651 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 18:20:00 -0400 Subject: [PATCH 03/28] add json templates --- .../RefineComplexVariants.json.tmpl | 17 +++++++++++++++++ .../RefineComplexVariants.json.tmpl | 17 +++++++++++++++++ wdl/RefineComplexVariants.wdl | 10 +++++----- 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl create mode 100644 inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..fb95e1ce4 --- /dev/null +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": "${this.cleaned_vcf}", + "RefineComplexVariants.prefix": "${this.sample_set_set_id}", + + "RefineComplexVariants.batch_name_list": "${this.sample_sets.sample_set_id}", + "RefineComplexVariants.batch_sample_lists": "${this.sample_sets.filtered_batch_samples_file}", + "RefineComplexVariants.PE_metrics": "${this.sample_sets.merged_PE}", + "RefineComplexVariants.PE_metrics_indexes": "${this.sample_sets.merged_PE_index}", + "RefineComplexVariants.Depth_DEL_beds": "${this.sample_sets.merged_dels}", + "RefineComplexVariants.Depth_DUP_beds": "${this.sample_sets.merged_dups}", + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", + "RefineComplexVariants.sv_pipeline_docker": 
"${workspace.sv_pipeline_docker}", + "RefineComplexVariants.linux_docker": "${workspace.linux_docker}" +} diff --git a/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl new file mode 100644 index 000000000..c9feb45a2 --- /dev/null +++ b/inputs/templates/test/RefineComplexVariants/RefineComplexVariants.json.tmpl @@ -0,0 +1,17 @@ +{ + "RefineComplexVariants.vcf": {{ test_batch.clean_vcf | tojson }}, + "RefineComplexVariants.prefix": {{ test_batch.name | tojson }}, + + "RefineComplexVariants.batch_name_list": [{{ test_batch.name | tojson }}], + "RefineComplexVariants.batch_sample_lists": [ {{ test_batch.final_sample_list | tojson }} ], + "RefineComplexVariants.PE_metrics": [ {{ test_batch.merged_disc_file | tojson }} ], + "RefineComplexVariants.PE_metrics_indexes": [ {{ test_batch.merged_disc_file_index | tojson }} ], + "RefineComplexVariants.Depth_DEL_beds": [ {{ test_batch.del_bed | tojson }} ], + "RefineComplexVariants.Depth_DUP_beds": [ {{ test_batch.dup_bed | tojson }} ], + + "RefineComplexVariants.n_per_split": "15000", + + "RefineComplexVariants.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, + "RefineComplexVariants.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "RefineComplexVariants.linux_docker": {{ dockers.linux_docker | tojson }} +} diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 33e57ee26..1e189eb9b 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -10,16 +10,15 @@ import "Utils.wdl" as util workflow RefineComplexVariants { input { File vcf - File vcf_idx + String prefix Array[String] batch_name_list + Array[File] batch_sample_lists Array[File] PE_metrics - Array[File] PE_metrics_idxes + Array[File] PE_metrics_indexes Array[File] Depth_DEL_beds Array[File] Depth_DUP_beds - Array[File] batch_sample_lists - String prefix Int n_per_split File? 
script_generate_cpx_review_script @@ -58,6 +57,7 @@ workflow RefineComplexVariants { batch_name_list = batch_name_list, batch_sample_lists = batch_sample_lists, batch_pe_files = write_lines(PE_metrics), + prefix = prefix, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_sample_batch } @@ -121,7 +121,7 @@ workflow RefineComplexVariants { input: batch_name_list = batch_name_list, PE_metrics = PE_metrics, - PE_metrics_idxes = PE_metrics_idxes, + PE_metrics_idxes = PE_metrics_indexes, PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, prefix = "~{prefix}.~{i}", n_per_split = n_per_split, From b3f31b532a674576cc85d3cc96a124f3438c26fe Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:00:32 -0400 Subject: [PATCH 04/28] rescue ME dels in CleanVcf --- .../CleanVcf.json.tmpl | 2 + .../cohort_mode/workspace.tsv.tmpl | 2 + .../test/MakeCohortVcf/CleanVcf.json.tmpl | 2 + inputs/values/resources_hg38.json | 2 + wdl/CleanVcf.wdl | 7 ++ wdl/CleanVcfChromosome.wdl | 111 +++++++++++++++++- wdl/RefineComplexVariants.wdl | 7 +- 7 files changed, 127 insertions(+), 6 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl index e7c1bc7a0..3d895e35b 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", "CleanVcf.allosome_fai": "${workspace.allosome_file}", + "CleanVcf.HERVK_reference": "${workspace.HERVK_reference", + "CleanVcf.LINE1_reference": "${workspace.LINE1_reference}", "CleanVcf.chr_x": "${workspace.chr_x}", "CleanVcf.chr_y": "${workspace.chr_y}", diff --git a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index b9d5f26d5..6ee7a177d 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -29,6 +29,8 @@ exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exclude_inter external_af_ref_bed {{ reference_resources.external_af_ref_bed }} external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} genome_file {{ reference_resources.genome_file }} +HERVK_reference {{ reference_resources.hervk_reference }} +LINE1_reference {{ reference_resources.line1_reference }} manta_region_bed {{ reference_resources.manta_region_bed }} manta_region_bed_index {{ reference_resources.manta_region_bed_index }} mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl index 8606a4a50..196505724 100644 --- a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl @@ -1,6 +1,8 @@ { "CleanVcf.contig_list": {{ reference_resources.primary_contigs_fai | tojson }}, "CleanVcf.allosome_fai": {{ reference_resources.allosome_file | tojson }}, + "CleanVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "CleanVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "CleanVcf.chr_x": {{ reference_resources.chr_x | tojson }}, "CleanVcf.chr_y": {{ reference_resources.chr_y | tojson }}, 
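
Patch 4 threads the two new reference files through several JSON templates, where a malformed placeholder only surfaces once a workspace is configured. An illustrative check (not part of the repo) that every ${...} placeholder closes before the value's quote; it flags the CleanVcf HERVK line above, which patch 6 in this series corrects:

    import re

    # Illustrative validator: "${..." placeholders never closed before the quote.
    _UNCLOSED = re.compile(r'\$\{[^}"]*"')

    def unclosed_placeholders(template_text):
        return [m.group(0) for m in _UNCLOSED.finditer(template_text)]

    line = '"CleanVcf.HERVK_reference": "${workspace.HERVK_reference",'
    assert unclosed_placeholders(line) == ['${workspace.HERVK_reference"']
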
diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json index 73484d5b3..545d4b2b0 100644 --- a/inputs/values/resources_hg38.json +++ b/inputs/values/resources_hg38.json @@ -21,6 +21,8 @@ "external_af_ref_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/gnomad_AF/gnomad_v2.1_sv.sites.GRCh38.bed.gz", "external_af_ref_bed_prefix" : "gnomad_v2.1_sv", "genome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/hg38.genome", + "hervk_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/HERVK.sorted.bed.gz", + "line1_reference": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/LINE1.sorted.bed.gz", "manta_region_bed" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz", "manta_region_bed_index" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs_plus_mito.bed.gz.tbi", "mei_bed" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38.repeatmasker.mei.with_SVA.pad_50_merged.bed.gz", diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index c86e88500..ab228ccf7 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -24,6 +24,9 @@ workflow CleanVcf { Int clean_vcf1b_records_per_shard Int clean_vcf5_records_per_shard + File HERVK_reference + File LINE1_reference + String chr_x String chr_y @@ -67,6 +70,7 @@ workflow CleanVcf { RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b @@ -133,6 +137,8 @@ workflow CleanVcf { clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, linux_docker=linux_docker, @@ -170,6 +176,7 @@ workflow CleanVcf { runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, runtime_attr_format=runtime_attr_format, + runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } } diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 29b3bcc8e..45258f304 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -25,6 +25,9 @@ workflow CleanVcfChromosome { File? outlier_samples_list Int? max_samples_per_shard_step3 + File HERVK_reference + File LINE1_reference + File ploidy_table String chr_x String chr_y @@ -49,6 +52,7 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels # Clean vcf 1b RuntimeAttr? 
runtime_attr_override_subset_large_cnvs_1b @@ -244,9 +248,19 @@ workflow CleanVcfChromosome { runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } + call RescueMobileElementDeletions { + input: + vcf = CleanVcf5.polished, + prefix = "~{prefix}.~{contig}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + call DropRedundantCnvs { input: - vcf=CleanVcf5.polished, + vcf=RescueMobileElementDeletions.out, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -595,6 +609,101 @@ task CleanVcf4 { } } +task RescueMobileElementDeletions { + input { + File vcf + String prefix + File LINE1 + File HERVK + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 3.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + + python <',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) +fin.close() +fo.close() +CODE + >>> + + output { + File out = "~{prefix}.vcf.gz" + } +} + # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 1e189eb9b..f694d4f21 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -197,12 +197,9 @@ workflow RefineComplexVariants { } - File reviewed_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) - File reviewed_vcf_idx = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) - output { - File revised_output_vcf = reviewed_vcf - File revised_output_vcf_idx = reviewed_vcf_idx + File cpx_refined_vcf = select_first([ConcatVcfs.concat_vcf, HailMerge.merged_vcf]) + File cpx_refined_vcf_index = select_first([ConcatVcfs.concat_vcf_idx, HailMerge.merged_vcf_index]) File cpx_evidences = ConcatHeaderedTextFiles.out } } From b556ba2f7b50f178f140d300ad43fc6d73f003c1 Mon 
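
Both heredocs in RescueMobileElementDeletions were elided in extraction, but the surviving shell lines fix the shape of the logic: BND records flagged as deletions are written to a bed, bedtools coverage keeps those with more than half their span covered by the LINE1 or HERVK reference intervals, and the second script relabels the matching records. A hedged sketch of the overlap test and the relabeling; the symbolic ALT strings are an assumption inferred from the overlap_LINE1/overlap_HERVK tags, since anything bracketed by angle brackets was lost:

    # Sketch only: fraction of [start, end) covered by reference ME intervals,
    # mirroring the `bedtools coverage ... | awk '{if ($NF>.5) print}'` filter above.
    def covered_fraction(start, end, me_intervals):
        covered, cursor = 0, start
        for s, e in sorted(me_intervals):
            s, e = max(s, cursor), min(e, end)
            if s < e:
                covered += e - s
                cursor = e
        return covered / (end - start)

    def relabel(source):
        # source tag from the tsv above: "overlap_LINE1" or "overlap_HERVK"
        alt = "<DEL:ME:LINE1>" if source == "overlap_LINE1" else "<DEL:ME:HERVK>"
        return "DEL", alt  # SVTYPE reassignment assumed; the original lines were elided

    assert covered_fraction(100, 200, [(90, 160), (150, 180)]) == 0.8
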
Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:16:01 -0400 Subject: [PATCH 05/28] recal qual after gt filtering --- src/sv-pipeline/scripts/apply_sl_filter.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index ce0151e92..4e8ecc678 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -78,6 +78,24 @@ def _fails_filter(sl, gt_is_ref, cutoff): return sl < cutoff +def recal_qual_score(record): + """ + Recalibrate quality score for a single variant + """ + quals = [] + for s in [s for s in record.samples]: + gt = record.samples[s]['GT'] + if _is_hom_ref(gt) or _is_no_call(gt): + continue + elif _is_hom_var(gt): + quals.append(99) + else: + quals.append(record.samples[s]['GQ']) + + if len(quals) > 0: + return int(median(quals)) + + def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshold, keep_gq, gq_scale_factor, upper_sl_cap, lower_sl_cap, sl_shift, max_gq): record.info['MINSL'] = sl_threshold @@ -118,6 +136,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol n_sl = len(sl_list) record.info['SL_MEAN'] = sum(sl_list) / n_sl if n_sl > 0 else None record.info['SL_MAX'] = max(sl_list) if n_sl > 0 else None + record.qual = recal_qual_score(record) # Clean out AF metrics since they're no longer valid for field in ['AC', 'AF']: if field in record.info: From 7ddf9e9e66c0a0bf123145866b93af9fef84bbb2 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Thu, 17 Oct 2024 19:44:27 -0400 Subject: [PATCH 06/28] use source for ins_idel not cpx_intervals --- .../cohort_mode/workflow_configurations/CleanVcf.json.tmpl | 2 +- .../scripts/reformat_CPX_bed_and_generate_script.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl index 3d895e35b..93c172d8c 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl @@ -1,7 +1,7 @@ { "CleanVcf.contig_list": "${workspace.primary_contigs_fai}", "CleanVcf.allosome_fai": "${workspace.allosome_file}", - "CleanVcf.HERVK_reference": "${workspace.HERVK_reference", + "CleanVcf.HERVK_reference": "${workspace.HERVK_reference}", "CleanVcf.LINE1_reference": "${workspace.LINE1_reference}", "CleanVcf.chr_x": "${workspace.chr_x}", "CleanVcf.chr_y": "${workspace.chr_y}", diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py index f05e52d9b..06cac1a83 100644 --- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py +++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py @@ -196,7 +196,7 @@ def is_interchromosomal(pin, header_pos): if seg2[0] != chrom: return True elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') if seg2[0] != chrom: return True elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: @@ -248,7 +248,7 @@ def 
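
Patch 5's recal_qual_score sets QUAL to the median GQ over non-reference carriers, pegging hom-alt genotypes at 99; when every genotype is reference or no-call it falls through and returns None, which pysam renders as a missing QUAL. A standalone sketch of the same computation with that fall-through made explicit (pure Python, no pysam record required):

    from statistics import median

    def recal_qual(genotype_gq_pairs):
        """genotype_gq_pairs: iterable of (gt_tuple, gq). Returns int QUAL or None."""
        quals = []
        for gt, gq in genotype_gq_pairs:
            alleles = [a for a in gt if a is not None]
            if not alleles or all(a == 0 for a in alleles):
                continue                      # no-call or hom-ref: not a carrier
            quals.append(99 if all(a != 0 for a in alleles) else gq)  # hom-var pegged at 99
        return int(median(quals)) if quals else None  # None -> missing QUAL ('.')

    assert recal_qual([((0, 1), 31), ((1, 1), 12), ((0, 0), 99)]) == 65
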
cpx_SV_readin(input_bed, header_pos, unresolved): ref_alt = cpx_info[1] breakpoints = cpx_info[0] elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - segments = pin[header_pos['CPX_INTERVALS']].split(',') + segments = pin[header_pos['SOURCE']].split(',') cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) ref_alt = cpx_info[1] breakpoints = cpx_info[0] @@ -292,7 +292,7 @@ def cpx_inter_chromo_SV_readin(input_bed, header_pos): ref_alt = ['a_b', 'a_ba'] elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: seg1 = pin[:3] - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') seg2 = [seg2[0]]+seg2[1].split('-') if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]] From 3e61822625fe6f557399415c4b3ff2de3fa5f438 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 14:02:02 -0400 Subject: [PATCH 07/28] pass ctx revisions dict to revise --- wdl/RefineComplexVariants.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index f694d4f21..44ed0ecac 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -430,7 +430,7 @@ unresolved_svids = unresolved_readin("~{unresolved_svids}") hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") hash_CTX_manual = CTX_manual_readin("~{CTX_manual}") print(len(hash_CPX_manual.keys())) -revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids) +revise_vcf("~{vcf_file}", "~{prefix}.Manual_Revised.vcf.gz", hash_CPX_manual, unresolved_svids, hash_CTX_manual) CODE From 85817c0ef7a8eaef39fd83369416483c3d78e27b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 14:44:13 -0400 Subject: [PATCH 08/28] set filter to pass if empty --- src/sv-pipeline/scripts/apply_sl_filter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index 4e8ecc678..664648031 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -133,6 +133,8 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol record.info['NCR'] = n_no_call / n_samples if n_samples > 0 else None if ncr_threshold is not None and record.info['NCR'] is not None and record.info['NCR'] >= ncr_threshold: record.filter.add(_HIGH_NCR_FILTER) + if len(record.filter) == 0: + record.filter.add('PASS') n_sl = len(sl_list) record.info['SL_MEAN'] = sum(sl_list) / n_sl if n_sl > 0 else None record.info['SL_MAX'] = max(sl_list) if n_sl > 0 else None From 755c364de63efdf572d6f68353e294c9b098411b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 16:28:53 -0400 Subject: [PATCH 09/28] import median, move me del rescue to end of cleanvcf --- src/sv-pipeline/scripts/apply_sl_filter.py | 1 + wdl/CleanVcfChromosome.wdl | 24 +++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index 664648031..74187eb01 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -4,6 +4,7 @@ import sys import pysam import math +from numpy import median from typing import 
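
Patch 8's two-line change rests on a pysam detail: a record whose filter set is empty serializes with a missing ('.') FILTER, so a record that clears every check after the NCR test needs PASS added explicitly. A minimal sketch of the intended post-condition (filter_name must already be declared in the VCF header):

    def finalize_filters(record, high_ncr, filter_name="HIGH_NCR"):
        # record: a pysam VariantRecord partway through apply_sl_filter
        if high_ncr:
            record.filter.add(filter_name)
        if len(record.filter) == 0:
            record.filter.add("PASS")  # evaluated and passing, never unevaluated ('.')
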
List, Text, Dict, Optional _gt_no_call_map = dict() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 45258f304..957d2fdc0 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -248,19 +248,9 @@ workflow CleanVcfChromosome { runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } - call RescueMobileElementDeletions { - input: - vcf = CleanVcf5.polished, - prefix = "~{prefix}.~{contig}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } - call DropRedundantCnvs { input: - vcf=RescueMobileElementDeletions.out, + vcf=CleanVcf5.polished, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -299,9 +289,19 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_stitch_fragmented_cnvs } + call RescueMobileElementDeletions { + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.~{contig}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + call FinalCleanup { input: - vcf=StitchFragmentedCnvs.stitched_vcf_shard, + vcf=RescueMobileElementDeletions.out, contig=contig, prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, From d32de60afb2f34dd1dec56ac5710e541e108ad82 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Fri, 18 Oct 2024 16:56:33 -0400 Subject: [PATCH 10/28] align ctx revise with input file; cleanup revise --- wdl/RefineComplexVariants.wdl | 46 +++-------------------------------- 1 file changed, 4 insertions(+), 42 deletions(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 44ed0ecac..98eba802d 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -167,7 +167,7 @@ workflow RefineComplexVariants { call hail.HailMerge { input: vcfs=ReviseVcf.revised_vcf, - prefix="~{prefix}.cpx_filtered", + prefix="~{prefix}.cpx_refined", gcs_project=gcs_project, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, @@ -183,7 +183,7 @@ workflow RefineComplexVariants { vcfs=ReviseVcf.revised_vcf, vcfs_idx=ReviseVcf.revised_vcf_idx, allow_overlaps=true, - outfile_prefix="~{prefix}.cpx_filtered", + outfile_prefix="~{prefix}.cpx_refined", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_attr_concat } @@ -292,23 +292,6 @@ task ReviseVcf { python < 0: - return int(median(quals)) - def CPX_manual_readin(CPX_manual): out={} fin=open(CPX_manual) @@ -320,6 +303,7 @@ def CPX_manual_readin(CPX_manual): fin.close() return out + def CTX_manual_readin(CTX_manual): fin=open(CTX_manual) out={} @@ -380,20 +364,8 @@ def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CT else: for sample in hash_CTX_manual[record.id].keys(): if sample in record.samples.keys(): - if hash_CTX_manual[record.id][sample][1] == 'Not_Enough_PE_Pairs': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'PARTIAL_PE': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'unbalanced_paternal_transmission_of_tloc,2nd_junction_is_missing': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'NON_SPECIFIC': + if hash_CTX_manual[record.id][sample][0] in ['no_PE', 'low_PE', 'partial_PE']: 
record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'Not_Enough_Coordinate_Span': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'Interrupted_PE_Patterns_when_sorted_on_2nd_breakpoint': - record.samples[sample]['GT'] = [None,None] - elif hash_CTX_manual[record.id][sample][1] == 'CTX_with_DEL': - record.stop = int(hash_CTX_manual[record.id][sample][0].split(':')[1].split('-')[1]) # revisions to insertion-type complex events if record.info['SVTYPE']=="CPX" and record.info['CPX_TYPE'] in ['dDUP','dDUP_iDEL','INS_iDEL']: if record.info['CPX_TYPE']=='INS_iDEL': @@ -414,17 +386,7 @@ def revise_vcf(vcf_input, vcf_output, hash_CPX_manual, unresolved_svids, hash_CT fin.close() fo.close() -import os -import sys -from numpy import median import pysam -import argparse - -#Define global variables -NULL_GTs = [(None, None), (None, )] -REF_GTs = [(0, 0), (0, ), (None, 2)] -NULL_and_REF_GTs = NULL_GTs + REF_GTs -HET_GTs = [(0, 1), (None, 1), (None, 3)] unresolved_svids = unresolved_readin("~{unresolved_svids}") hash_CPX_manual = CPX_manual_readin("~{CPX_manual}") From 64cce6e6e51851ce0dd6fe779fd3b7ff735966a9 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 11:43:20 -0400 Subject: [PATCH 11/28] cleanvcf me del rescue fixes --- wdl/CleanVcfChromosome.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 957d2fdc0..41df87262 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -292,7 +292,7 @@ workflow CleanVcfChromosome { call RescueMobileElementDeletions { input: vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.~{contig}.rescue_me_dels", + prefix = "~{prefix}.rescue_me_dels", LINE1 = LINE1_reference, HERVK = HERVK_reference, sv_pipeline_docker = sv_pipeline_docker, @@ -694,6 +694,7 @@ for record in fin: record.alts = ('',) if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': record.alts = ('',) + fo.write(record) fin.close() fo.close() CODE From 608908c60a3ac958a69ab6ff852c89976aa514d4 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 12:30:06 -0400 Subject: [PATCH 12/28] parametrize min pe; remove top-level script override input --- wdl/RefineComplexVariants.wdl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl index 98eba802d..2867d902c 100644 --- a/wdl/RefineComplexVariants.wdl +++ b/wdl/RefineComplexVariants.wdl @@ -20,7 +20,8 @@ workflow RefineComplexVariants { Array[File] Depth_DUP_beds Int n_per_split - File? script_generate_cpx_review_script + Int min_pe_cpx = 3 + Int min_pe_ctx = 3 Boolean use_hail = false String? 
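
Patch 12's new knobs do more than expose the cutoffs: in the hunks below, high_PE moves from strict greater-than comparisons (> 1 for CPX, > 2 for CTX) to >= min_pe with both defaults at 3, so CPX now needs at least three supporting pairs at each junction instead of two, and the CPX and CTX tiers become uniform. The labeler sketched earlier, reparametrized:

    def pe_label(first_junction, second_junction, min_pe=3):
        # >= min_pe at both junctions, per patch 12 (was > 1 for CPX, > 2 for CTX)
        if first_junction >= min_pe and second_junction >= min_pe:
            return "high_PE"
        if first_junction > 0 and second_junction > 0:
            return "low_PE"
        if first_junction > 0 or second_junction > 0:
            return "partial_PE"
        return "no_PE"

    assert pe_label(2, 2) == "low_PE"      # would have been high_PE under the old CPX cutoff
    assert pe_label(3, 3) == "high_PE"
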
gcs_project # required if use_hail = true @@ -112,7 +113,6 @@ workflow RefineComplexVariants { input: bed = SplitCpxCtx.cpx_ctx_bed, sample_batch_pe_map = GetSampleBatchPEMap.sample_batch_pe_map, - script = script_generate_cpx_review_script, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_generate_cpx_review_script } @@ -138,6 +138,7 @@ workflow RefineComplexVariants { PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, PE_supp = CollectPEMetricsForCPX.evi_stat, depth_supp = CollectLargeCNVSupportForCPX.lg_cnv_depth_supp, + min_pe_cpx = min_pe_cpx, prefix = "~{prefix}.~{i}", sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_calcu_cpx_evidences @@ -147,6 +148,7 @@ workflow RefineComplexVariants { input: PE_collect_script = GenerateCpxReviewScript.pe_evidence_collection_script, PE_supp = CollectPEMetricsForCPX.evi_stat, + min_pe_ctx = min_pe_ctx, prefix = "~{prefix}.~{i}", sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_calcu_cpx_evidences @@ -528,6 +530,7 @@ task CalculateCpxEvidences { File PE_collect_script File PE_supp File depth_supp + Int min_pe_cpx String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -610,7 +613,7 @@ task CalculateCpxEvidences { pe_supp = 'partial_PE' if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: pe_supp = 'low_PE' - if PE_supp[svid][sample][0][0]>1 and PE_supp[svid][sample][1][0]>1: + if PE_supp[svid][sample][0][0]>=~{min_pe_cpx} and PE_supp[svid][sample][1][0]>=~{min_pe_cpx}: pe_supp = 'high_PE' out[svid][sample] = [pe_supp] else: @@ -681,6 +684,7 @@ task CalculateCtxEvidences { input { File PE_collect_script File PE_supp + Int min_pe_ctx String prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -753,7 +757,7 @@ task CalculateCtxEvidences { pe_supp = 'partial_PE' if PE_supp[svid][sample][0][0]>0 and PE_supp[svid][sample][1][0]>0: pe_supp = 'low_PE' - if PE_supp[svid][sample][0][0]>2 and PE_supp[svid][sample][1][0]>2: + if PE_supp[svid][sample][0][0]>=~{min_pe_ctx} and PE_supp[svid][sample][1][0]>=~{min_pe_ctx}: pe_supp = 'high_PE' out[svid][sample] = [pe_supp] else: From 0d0069533e98619c4cefdc72b49309f6ecb1f465 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 14:33:50 -0400 Subject: [PATCH 13/28] add hervk/line1 reference inputs; add RefineComplexVariants to docs --- README.md | 2 +- .../cohort_mode_workspace_dashboard.md.tmpl | 13 +++++++------ .../workflow_configurations/MakeCohortVcf.json.tmpl | 2 ++ .../GATKSVPipelineSingleSample.json.tmpl | 3 +++ .../single_sample/workspace.tsv.tmpl | 2 ++ ...GATKSVPipelineBatch.FromSampleEvidence.json.tmpl | 2 ++ .../GATKSVPipelineBatch.json.tmpl | 2 ++ .../GATKSVPipelineSingleSample.json.tmpl | 3 +++ .../GATKSVPipelineSingleSample.melt.json.tmpl | 4 ++++ .../test/MakeCohortVcf/MakeCohortVcf.json.tmpl | 2 ++ scripts/test/terra_validation.py | 2 +- wdl/MakeCohortVcf.wdl | 4 ++++ 12 files changed, 33 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index cb8808221..09146ef12 100644 --- a/README.md +++ b/README.md @@ -27,4 +27,4 @@ continuous integration (CI) and continuous delivery (CD). GATK-SV CI/CD is developed as a set of Github Actions workflows that are available under the `.github/workflows` directory. Please refer to the [workflow's README](.github/workflows/README.md) -for their current coverage and setup. +for their current coverage and setup. 
\ No newline at end of file diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 224f83af9..c22618380 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -61,10 +61,11 @@ The following workflows are included in this workspace, to be executed in this o 13. `13-ResolveComplexVariants`: Complex variant resolution 14. `14-GenotypeComplexVariants`: Complex variant re-genotyping 15. `15-CleanVcf`: VCF cleanup -16. `16-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches -17. `17-SVConcordance`: Annotates variants with genotype concordance against raw calls -18. `18-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots -19. `19-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets +16. `16-RefineComplexVariants`: Complex variant and translocation refinement +17. `17-JoinRawCalls`: Combines unfiltered calls (from step 5) across batches +18. `18-SVConcordance`: Annotates variants with genotype concordance against raw calls +19. `19-FilterGenotypes`: Performs genotype filtering to improve precision and generates QC plots +20. `20-AnnotateVcf`: Cohort VCF annotations, including functional annotation, allele frequency (AF) annotation, and AF annotation with external population callsets Additional downstream modules, such as those for visualization, are under development. They are not included in this workspace at this time, but the source code can be found in the [GATK-SV GitHub repository](https://github.com/broadinstitute/gatk-sv). See **Downstream steps** towards the bottom of this page for more information. @@ -205,9 +206,9 @@ Read the full MergeBatchSites documentation [here](https://github.com/broadinsti Read the full GenotypeBatch documentation [here](https://github.com/broadinstitute/gatk-sv#genotype-batch). * Use the same `sample_set` definitions you used for `03-TrainGCNV` through `08-FilterBatchSamples`. -#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-JoinRawCalls, 17-SVConcordance, 18-FilterGenotypes, and 19-AnnotateVcf +#### 11-RegenotypeCNVs, 12-CombineBatches, 13-ResolveComplexVariants, 14-GenotypeComplexVariants, 15-CleanVcf, 16-RefineComplexVariants, 17-JoinRawCalls, 18-SVConcordance, 19-FilterGenotypes, and 20-AnnotateVcf -Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. 
+Read the full documentation for [RegenotypeCNVs](https://github.com/broadinstitute/gatk-sv#regenotype-cnvs), [MakeCohortVcf](https://github.com/broadinstitute/gatk-sv#make-cohort-vcf) (which includes `CombineBatches`, `ResolveComplexVariants`, `GenotypeComplexVariants`, `CleanVcf`), [`RefineComplexVariants`](https://github.com/broadinstitute/gatk-sv#refine-complex), [`JoinRawCalls`](https://github.com/broadinstitute/gatk-sv#join-raw-calls), [`SVConcordance`](https://github.com/broadinstitute/gatk-sv#svconcordance), [`FilterGenotypes`](https://github.com/broadinstitute/gatk-sv#filter-genotypes), and [AnnotateVcf](https://github.com/broadinstitute/gatk-sv#annotate-vcf) on the README. * Use the same cohort `sample_set_set` you created and used for `09-MergeBatchSites`. #### Downstream steps diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl index ccab739da..f3fbfb018 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": "${workspace.depth_exclude_list}", "MakeCohortVcf.empty_file" : "${workspace.empty_file}", "MakeCohortVcf.ref_dict": "${workspace.reference_dict}", + "MakeCohortVcf.HERVK_reference": "${workspace.HERVK_reference}", + "MakeCohortVcf.LINE1_reference": "${workspace.LINE1_reference}", "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl index e5a920a0c..88c735b2d 100644 --- a/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.json.tmpl @@ -97,6 +97,9 @@ "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": "${workspace.reference_HERVK}", + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": "${workspace.reference_LINE1}", + "GATKSVPipelineSingleSample.protein_coding_gtf" : "${workspace.reference_protein_coding_gtf}", "GATKSVPipelineSingleSample.noncoding_bed" : "${workspace.reference_noncoding_bed}", "GATKSVPipelineSingleSample.external_af_ref_bed" : "${workspace.reference_external_af_ref_bed}", diff --git a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl index 25e13568f..2bcef0ad2 100644 --- a/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/inputs/templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -55,6 +55,8 @@ reference_exclude_intervals_for_gcnv_filter_intervals {{ reference_resources.exc reference_external_af_ref_bed {{ reference_resources.external_af_ref_bed }} reference_external_af_ref_bed_prefix {{ reference_resources.external_af_ref_bed_prefix }} reference_genome_file {{ reference_resources.genome_file }} +reference_HERVK {{ reference_resources.hervk_reference }} +reference_LINE1 {{ reference_resources.line1_reference }} reference_manta_region_bed {{ reference_resources.manta_region_bed }} 
reference_manta_region_bed_index {{ reference_resources.manta_region_bed_index }} reference_mei_bed {{ reference_resources.mei_bed }} diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl index 2f5ba41c4..2bfbc4f1b 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.FromSampleEvidence.json.tmpl @@ -134,6 +134,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl index 5412fd79e..9ba04d229 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl @@ -127,6 +127,8 @@ "GATKSVPipelineBatch.MakeCohortVcf.mei_bed": {{ reference_resources.mei_bed | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.pe_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, diff --git a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl index f08b38902..870b5b82f 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.json.tmpl @@ -101,6 +101,9 @@ "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.run_vcf_qc" : false, + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.protein_coding_gtf" : {{ reference_resources.protein_coding_gtf | tojson }}, "GATKSVPipelineSingleSample.noncoding_bed" : {{ reference_resources.noncoding_bed | tojson }}, "GATKSVPipelineSingleSample.external_af_ref_bed" : {{ reference_resources.external_af_ref_bed | tojson }}, diff --git 
a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl index 4d35ca4ae..8352403df 100644 --- a/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineSingleSample/GATKSVPipelineSingleSample.melt.json.tmpl @@ -90,6 +90,10 @@ "GATKSVPipelineSingleSample.depth_clustering_algorithm": "SINGLE_LINKAGE", "GATKSVPipelineSingleSample.ref_copy_number_autosomal_contigs" : 2, + + "GATKSVPipelineSingleSample.MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "GATKSVPipelineSingleSample.MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, + "GATKSVPipelineSingleSample.clean_vcf_min_sr_background_fail_batches": 0.5, "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, diff --git a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl index 8c27a74cf..cad2a9e03 100644 --- a/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ b/inputs/templates/test/MakeCohortVcf/MakeCohortVcf.json.tmpl @@ -8,6 +8,8 @@ "MakeCohortVcf.depth_exclude_list": {{ reference_resources.depth_exclude_list | tojson }}, "MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "MakeCohortVcf.ref_dict": {{ reference_resources.reference_dict | tojson }}, + "MakeCohortVcf.HERVK_reference": {{ reference_resources.hervk_reference | tojson }}, + "MakeCohortVcf.LINE1_reference": {{ reference_resources.line1_reference | tojson }}, "MakeCohortVcf.site_level_comparison_datasets": [ {{ reference_resources.ccdg_abel_site_level_benchmarking_dataset | tojson }}, diff --git a/scripts/test/terra_validation.py b/scripts/test/terra_validation.py index 2e9a64756..aa9079d19 100644 --- a/scripts/test/terra_validation.py +++ b/scripts/test/terra_validation.py @@ -113,7 +113,7 @@ def main(): parser.add_argument("-j", "--womtool-jar", help="Path to womtool jar", required=True) parser.add_argument("-n", "--num-input-jsons", help="Number of Terra input JSONs expected", - required=False, default=24, type=int) + required=False, default=25, type=int) parser.add_argument("--log-level", help="Specify level of logging information, ie. 
info, warning, error (not case-sensitive)", required=False, default="INFO") diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index 5ea66ea50..f3891fc8d 100644 --- a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -44,6 +44,8 @@ workflow MakeCohortVcf { File pe_exclude_list File depth_exclude_list File ref_dict + File HERVK_reference + File LINE1_reference Int max_shard_size_resolve Int max_shards_per_chrom_clean_vcf_step1 Int min_records_per_shard_clean_vcf_step1 @@ -378,6 +380,8 @@ workflow MakeCohortVcf { allosome_fai=allosome_fai, chr_x=chr_x, chr_y=chr_y, + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, From 395ded1e37a7d513c452949999ed0bf77183bd93 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 14:39:51 -0400 Subject: [PATCH 14/28] add downstream steps to dockstore.yml --- .github/.dockstore.yml | 72 +++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 3bcad4198..974fdeee6 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -4,7 +4,7 @@ workflows: name: GatherSampleEvidence primaryDescriptorPath: /wdl/GatherSampleEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -13,7 +13,7 @@ workflows: name: EvidenceQC primaryDescriptorPath: /wdl/EvidenceQC.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -22,7 +22,7 @@ workflows: name: TrainGCNV primaryDescriptorPath: /wdl/TrainGCNV.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -31,7 +31,7 @@ workflows: name: GatherBatchEvidence primaryDescriptorPath: /wdl/GatherBatchEvidence.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -40,7 +40,7 @@ workflows: name: ClusterBatch primaryDescriptorPath: /wdl/ClusterBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -49,7 +49,7 @@ workflows: name: GenerateBatchMetrics primaryDescriptorPath: /wdl/GenerateBatchMetrics.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -58,7 +58,7 @@ workflows: name: FilterBatchSites primaryDescriptorPath: /wdl/FilterBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -67,7 +67,7 @@ workflows: name: PlotSVCountsPerSample primaryDescriptorPath: /wdl/PlotSVCountsPerSample.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -76,7 +76,7 @@ workflows: name: FilterBatchSamples primaryDescriptorPath: /wdl/FilterBatchSamples.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -85,7 +85,7 @@ workflows: name: MergeBatchSites primaryDescriptorPath: /wdl/MergeBatchSites.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -94,7 +94,7 @@ workflows: name: GenotypeBatch primaryDescriptorPath: /wdl/GenotypeBatch.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -103,7 +103,7 @@ workflows: name: RegenotypeCNVs primaryDescriptorPath: /wdl/RegenotypeCNVs.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -112,7 +112,7 @@ workflows: name: CombineBatches primaryDescriptorPath: /wdl/CombineBatches.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -121,7 +121,7 @@ workflows: name: ResolveComplexVariants primaryDescriptorPath: /wdl/ResolveComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -130,7 +130,7 @@ workflows: name: GenotypeComplexVariants 
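(Aside: each workflow registered in this `.dockstore.yml` — including the four entries added below — repeats the same `name` / `primaryDescriptorPath` / `filters` shape, so drift between the YAML and the `wdl/` directory is easy to catch mechanically. A hypothetical check, assuming PyYAML and execution from the repo root:)

```python
import os
import yaml  # PyYAML; assumed available

with open(".github/.dockstore.yml") as f:
    config = yaml.safe_load(f)

# Every registered workflow should point at a WDL that actually exists.
for wf in config["workflows"]:
    path = wf["primaryDescriptorPath"].lstrip("/")  # paths are repo-rooted
    print(f"{wf['name']}: {path} [{'ok' if os.path.exists(path) else 'MISSING'}]")
```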
primaryDescriptorPath: /wdl/GenotypeComplexVariants.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -139,7 +139,43 @@ workflows: name: CleanVcf primaryDescriptorPath: /wdl/CleanVcf.wdl filters: - branches: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: RefineComplexVariants + primaryDescriptorPath: /wdl/RefineComplexVariants.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: JoinRawCalls + primaryDescriptorPath: /wdl/JoinRawCalls.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: SVConcordance + primaryDescriptorPath: /wdl/SVConcordance.wdl + filters: + branches: + - main + tags: + - /.*/ + + - subclass: WDL + name: FilterGenotypes + primaryDescriptorPath: /wdl/FilterGenotypes.wdl + filters: + branches: - main tags: - /.*/ @@ -148,7 +184,7 @@ workflows: name: MainVcfQc primaryDescriptorPath: /wdl/MainVcfQc.wdl filters: - branches: + branches: - main tags: - /.*/ @@ -157,7 +193,7 @@ workflows: name: AnnotateVcf primaryDescriptorPath: /wdl/AnnotateVcf.wdl filters: - branches: + branches: - main tags: - /.*/ From 12ba977273141423f520c373ea4a1f41b3d8fe8b Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 15:24:20 -0400 Subject: [PATCH 15/28] add sanitizeheader to filtergenotypes & fix jsons --- .../FilterGenotypes.fixed_cutoffs.json.tmpl | 8 +- ...FilterGenotypes.optimize_cutoffs.json.tmpl | 8 +- wdl/FilterGenotypes.wdl | 75 ++++++++++++++++++- 3 files changed, 86 insertions(+), 5 deletions(-) diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl index 5035f0d0b..1025504cf 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.fixed_cutoffs.json.tmpl @@ -5,7 +5,13 @@ "FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, "FilterGenotypes.sl_filter_args": "--small-del-threshold 93 --medium-del-threshold 150 --small-dup-threshold -51 --medium-dup-threshold -4 --ins-threshold -13 --inv-threshold -19", - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + "FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl index b11a12812..53e50c1ba 100644 --- a/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl +++ b/inputs/templates/test/FilterGenotypes/FilterGenotypes.optimize_cutoffs.json.tmpl @@ -6,7 +6,13 @@ "FilterGenotypes.primary_contigs_fai": {{ reference_resources.primary_contigs_fai | tojson }}, "FilterGenotypes.gq_recalibrator_model_file": {{ reference_resources.aou_recalibrate_gq_model_file | tojson }}, - "FilterGenotypes.genome_tracks": {{ reference_resources.recalibrate_gq_genome_tracks | tojson }}, + 
"FilterGenotypes.genome_tracks": [ + {{ reference_resources.recalibrate_gq_genome_track_repeatmasker | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_segdup | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_simple_repeats | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap100 | tojson }}, + {{ reference_resources.recalibrate_gq_genome_track_umap24 | tojson }} + ], "FilterGenotypes.recalibrate_gq_args": [ "--keep-homvar false", "--keep-homref true", diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 0603b5750..58b98d71d 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -25,6 +25,9 @@ workflow FilterGenotypes { Int optimize_vcf_records_per_shard = 50000 Int filter_vcf_records_per_shard = 20000 + # For SanitizeHeader - bcftools annotate -x ~{header_drop_fields} + String header_drop_fields = "FILTER/LOW_QUALITY,FORMAT/TRUTH_CN_EQUAL,FORMAT/GT_FILTER,FORMAT/CONC_ST,INFO/STATUS,INFO/TRUTH_AC,INFO/TRUTH_AN,INFO/TRUTH_AF,INFO/TRUTH_VID,INFO/CNV_CONCORDANCE,INFO/GENOTYPE_CONCORDANCE,INFO/HET_PPV,INFO/HET_SENSITIVITY,INFO/HOMVAR_PPV,INFO/HOMVAR_SENSITIVITY,INFO/MINSL,INFO/NON_REF_GENOTYPE_CONCORDANCE,INFO/SL_MAX,INFO/SL_MEAN,INFO/VAR_PPV,INFO/VAR_SENSITIVITY,INFO/VAR_SPECIFICITY" + # For MainVcfQc File primary_contigs_fai File? ped_file @@ -120,10 +123,19 @@ workflow FilterGenotypes { sv_base_mini_docker=sv_base_mini_docker } + call SanitizeHeader { + input: + vcf = ConcatVcfs.concat_vcf, + vcf_index = ConcatVcfs.concat_vcf_idx, + drop_fields = header_drop_fields, + prefix = "~{output_prefix_}.filter_genotypes.sanitized", + sv_pipeline_docker = sv_pipeline_docker + } + if (run_qc) { call qc.MainVcfQc { input: - vcfs=[ConcatVcfs.concat_vcf], + vcfs=[SanitizeHeader.out], prefix="~{output_prefix_}.filter_genotypes", ped_file=ped_file, bcftools_preprocessing_options=qc_bcftools_preprocessing_options, @@ -142,8 +154,8 @@ workflow FilterGenotypes { } output { - File filtered_vcf = ConcatVcfs.concat_vcf - File filtered_vcf_index = ConcatVcfs.concat_vcf_idx + File filtered_vcf = SanitizeHeader.out + File filtered_vcf_index = SanitizeHeader.out_index File? main_vcf_qc_tarball = MainVcfQc.sv_vcf_qc_output # For optional analysis @@ -295,3 +307,60 @@ task FilterVcf { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } + +task SanitizeHeader { + input { + File vcf + File vcf_index + String prefix + String drop_fields + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + bcftools view --no-version -h ~{vcf} > header.vcf + + grep -v -e ^"##bcftools" header.vcf \ + -e ^"##source=depth" \ + -e ^"##source=cleanvcf" \ + -e ^"##ALT=/##ALT=/g' > newheader.vcf + + bcftools reheader -h newheader.vcf ~{vcf} \ + | bcftools annotate -x ~{drop_fields} \ + --no-version \ + -O z \ + -o ~{prefix}.vcf.gz + + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} From c473dcb603e4031d64a503d7d4d4067bd8802642 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 15:51:12 -0400 Subject: [PATCH 16/28] update sanitize task --- wdl/FilterGenotypes.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 58b98d71d..a703862f1 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -346,7 +346,6 @@ task SanitizeHeader { -e ^"##source=depth" \ -e ^"##source=cleanvcf" \ -e ^"##ALT=/##ALT=/g' > newheader.vcf From b8c36178ff6e02370dc7f5a229ac0f854761e3aa Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 21 Oct 2024 17:07:03 -0400 Subject: [PATCH 17/28] lint --- .../reformat_CPX_bed_and_generate_script.py | 865 +++++++++--------- 1 file changed, 445 insertions(+), 420 deletions(-) diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py index 06cac1a83..8a90e16f3 100644 --- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py +++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py @@ -1,35 +1,41 @@ #!python -#this script takes in the CPX SVs in complex format, reformat them in svelter format meanwhile generate the script to collect PE metrics +# this script takes in the CPX SVs, reformats in svelter format, and generates the script to collect PE metrics +import os +import argparse + def header_pos_readin(input_bed): - fin=os.popen(r'''zcat %s'''%(input_bed)) - pin=fin.readline().strip().split() - out = {} - for i in range(len(pin)): - out[pin[i]] = i - fin.close() - return out + fin = os.popen(r'''zcat %s''' % input_bed) + pin = fin.readline().strip().split() + out = {} + for i in range(len(pin)): + out[pin[i]] = i + fin.close() + return out + def sample_pe_readin(sample_pe_file): - #sample_pe_file is a 2 column file with sample ID and PE metrics on both columns - out = {} - fin=open(sample_pe_file) - for line in fin: - pin=line.strip().split() - if not pin[0] in out.keys(): - out[pin[0]] = pin[1] - fin.close() - return out - -def SVID_sample_readin(input_bed,header_pos): - out={} - fin=os.popen(r'''zcat 
%s'''%(input_bed)) - for line in fin: - pin=line.strip().split('\t') - if not pin[0][0]=='#': - out[pin[header_pos['name']]] = pin[header_pos['samples']] - fin.close() - return out + # sample_pe_file is a 2 column file with sample ID and PE metrics on both columns + out = {} + fin = open(sample_pe_file) + for line in fin: + pin = line.strip().split() + if not pin[0] in out.keys(): + out[pin[0]] = pin[1] + fin.close() + return out + + +def svid_sample_readin(input_bed, header_pos): + out = {} + fin = os.popen(r'''zcat %s''' % input_bed) + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == '#': + out[pin[header_pos['name']]] = pin[header_pos['samples']] + fin.close() + return out + def parse_segment(segment): svtype, interval = segment.split('_') @@ -37,438 +43,457 @@ def parse_segment(segment): coord1, coord2 = coords.split('-') return svtype, chrom, int(coord1), int(coord2) + # segments: CPX_INTERVALS split by ',' def extract_bp_list_for_inv_cnv_events(segments, cpx_type): s0_svtype, s0_chrom, s0_c1, s0_c2 = parse_segment(segments[0]) - breakpoints = [s0_chrom]+[s0_c1, s0_c2] - if not 'INVdup' in cpx_type: - for seg in segments[1:]: - seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg) - breakpoints.append(seg_c2) + breakpoints = [s0_chrom] + [s0_c1, s0_c2] + if 'INVdup' not in cpx_type: + for seg in segments[1:]: + seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(seg) + breakpoints.append(seg_c2) else: - if cpx_type=='INVdup': - s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1]) - if s1_c1 < breakpoints[2]: - breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]] - elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup': - s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2]) - breakpoints += [s2_c1, s2_c2] + if cpx_type == 'INVdup': + s1_svtype, s1_chrom, s1_c1, s1_c2 = parse_segment(segments[1]) + if s1_c1 < breakpoints[2]: + breakpoints = breakpoints[:2] + [s1_c1, breakpoints[2]] + elif cpx_type == 'dupINVdup' or cpx_type == 'delINVdup': + s2_svtype, s2_chrom, s2_c1, s2_c2 = parse_segment(segments[2]) + breakpoints += [s2_c1, s2_c2] return breakpoints + def extract_bp_list_for_ddup_events(coordinates, segments, small_sv_size_threshold): - #example of coordinates: ["chr1",1154129,1154133] - #example of segments: DUP_chr1:229987763-230011157 + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] # CW: this looks like it only gets the coordinates from the first DUP segment? 
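(Aside: a worked example, not part of the patch, of the `CPX_INTERVALS` segment strings this function consumes — `parse_segment` is restated from the script above, and the interval values are invented. It also shows why the rewritten loop below matters: with several segments, each `DUP_`/`DEL_`/`INV_` entry updates the breakpoints, rather than only the first `DUP_` segment being read as in the commented-out version.)

```python
def parse_segment(segment):  # restated from reformat_CPX_bed_and_generate_script.py
    svtype, interval = segment.split('_')
    chrom, coords = interval.split(':')
    coord1, coord2 = coords.split('-')
    return svtype, chrom, int(coord1), int(coord2)

# Hypothetical CPX_INTERVALS value for a dDUP-like event with an inverted segment.
segments = "DUP_chr1:229987763-230011157,INV_chr1:230011157-230012000".split(',')
for seg in segments:
    print(parse_segment(seg))
# ('DUP', 'chr1', 229987763, 230011157)
# ('INV', 'chr1', 230011157, 230012000)
```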
- #dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') - #dup_bp = [int(i) for i in dup_seg[1].split('-')] - #INV_flag = len([i for i in segments if "INV_" in i]) + # dup_seg = [i for i in segments if 'DUP_' in i][0].split('_')[1].split(':') + # dup_bp = [int(i) for i in dup_seg[1].split('-')] + # inv_flag = len([i for i in segments if "INV_" in i]) # rewrite as: - INV_flag = 0 + inv_flag = 0 + dup_bp = [] for segment in segments: if 'DUP_' in segment: seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) dup_bp = [seg_c1, seg_c2] if 'INV_' in segment: - INV_flag = INV_flag + 1 + inv_flag = inv_flag + 1 if 'DEL_' in segment: seg_svtype, seg_chrom, seg_c1, seg_c2 = parse_segment(segment) del_bp = [seg_chrom, seg_c1, seg_c2] - #INV_flag == 0 : no INV involved in the insertion - #INV_flag > 0: INV involved - - if INV_flag==0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','cbc'] - else: - structures = ['ab','bab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','aba'] - else: - structures = ['ab','aba'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','bb'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa'] - else: - return 'unresolved' - elif INV_flag>0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','c^bc'] - else: - structures = ['ab','b^ab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]>small_sv_size_threshold: - structures = ['abc','aba^'] - else: - structures = ['ab','aba^'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','b^b'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa^'] - else: - return 'unresolved' - return [breakpints, structures] + # inv_flag == 0 : no INV involved in the insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + else: + return 'unresolved' + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + 
structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + else: + return 'unresolved' + return [breakpoints, structures] + def extract_bp_list_for_ins_idel(coordinates, segments, small_sv_size_threshold): - #example of coordinates: ["chr1",1154129,1154133] - #example of segments: DUP_chr1:229987763-230011157 + # example of coordinates: ["chr1",1154129,1154133] + # example of segments: DUP_chr1:229987763-230011157 del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] dup_seg = [i for i in segments if 'INS_' in i][0].split('_')[1].split(':') dup_bp = [int(i) for i in dup_seg[1].split('-')] - INV_flag = len([i for i in segments if "INV_" in i]) - #INV_flag == 0 : no INV involved in the insertion - #INV_flag > 0: INV involved - if INV_flag==0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','cbc'] - else: - structures = ['ab','bab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba'] - else: - structures = ['ab','aba'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','bb'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa'] - elif INV_flag>0: - if del_bp[2] < dup_bp[0]: - breakpints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','c^bc'] - else: - structures = ['ab','b^ab'] - elif del_bp[1] > dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba^'] - else: - structures = ['ab','aba^'] - elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: - breakpints = del_bp[:2] + dup_bp - structures = ['ab','b^b'] - elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: - breakpints = [del_bp[0]]+dup_bp+[del_bp[2]] - structures = ['ab','aa^'] - return [breakpints, structures] - -def extract_bp_list_V4(coordinates, segments, small_sv_size_threshold): + inv_flag = len([i for i in segments if "INV_" in i]) + # inv_flag == 0 : no INV involved in the insertion + # inv_flag > 0: INV involved + structures = [] + breakpoints = [] + if inv_flag == 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'cbc'] + else: + structures = ['ab', 'bab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba'] + else: + structures = ['ab', 'aba'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'bb'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa'] + elif inv_flag > 0: + if del_bp[2] < dup_bp[0]: + breakpoints = del_bp + dup_bp + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] + else: + structures = ['ab', 'b^ab'] + elif del_bp[1] > dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] + else: + structures = ['ab', 'aba^'] + elif del_bp[1] < dup_bp[0] and not del_bp[2] < 
dup_bp[0]: + breakpoints = del_bp[:2] + dup_bp + structures = ['ab', 'b^b'] + elif del_bp[1] < dup_bp[1] and not del_bp[2] < dup_bp[1]: + breakpoints = [del_bp[0]] + dup_bp + [del_bp[2]] + structures = ['ab', 'aa^'] + return [breakpoints, structures] + + +def extract_bp_list_v4(coordinates, segments, small_sv_size_threshold): del_bp = [coordinates[0], int(coordinates[1]), int(coordinates[2])] dup_seg = [i for i in segments if 'INV_' in i][0].split('_')[1].split(':') dup_bp = [int(i) for i in dup_seg[1].split('-')] - INV_flag = 1 + structures = [] + breakpoints = [] if del_bp[2] < dup_bp[0]: breakpoints = del_bp + dup_bp - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','c^bc'] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'c^bc'] else: - structures = ['ab','b^ab'] + structures = ['ab', 'b^ab'] elif del_bp[1] > dup_bp[1]: - breakpoints = [del_bp[0]]+dup_bp+del_bp[1:] - if del_bp[2]-del_bp[1]> small_sv_size_threshold: - structures = ['abc','aba^'] + breakpoints = [del_bp[0]] + dup_bp + del_bp[1:] + if del_bp[2] - del_bp[1] > small_sv_size_threshold: + structures = ['abc', 'aba^'] else: - structures = ['ab','aba^'] + structures = ['ab', 'aba^'] return [breakpoints, structures] + def is_interchromosomal(pin, header_pos): - chrom = pin[0] - if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') - if seg2[0] != chrom: - return True - elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') - if seg2[0] != chrom: - return True - elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: - return True - elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS": - seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') - seg2 = [seg2[0]]+seg2[1].split('-') - if seg2[0] != chrom: - return True - return False - -def cpx_SV_readin(input_bed, header_pos, unresolved): - unresolved_svids = [] - fin=os.popen(r'''zcat %s'''%(input_bed)) - out = [] - small_sv_size_threshold = 250 - for line in fin: - pin=line.strip().split('\t') - # label pidups as unresolved since manual inspection indicated low quality - if not pin[0][0]=="#": - if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: - unresolved_svids.append(pin[header_pos['name']]) - elif not is_interchromosomal(pin, header_pos): - if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV','INVdup','delINVdel', 'delINVdup','dupINVdel','dupINVdup']: - segments = pin[header_pos['CPX_INTERVALS']].split(',') - breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) - if pin[header_pos['CPX_TYPE']] == 'delINV': - ref_alt = ['ab','b^'] - if pin[header_pos['CPX_TYPE']] == 'INVdel': - ref_alt = ['ab','a^'] - if pin[header_pos['CPX_TYPE']] == 'dupINV': - ref_alt = ['ab','aba^'] - if pin[header_pos['CPX_TYPE']] == 'INVdup': - ref_alt = ['ab','b^ab'] - if pin[header_pos['CPX_TYPE']] == 'delINVdel': - ref_alt = ['abc','b^'] - if pin[header_pos['CPX_TYPE']] == 'delINVdup': - ref_alt = ['abc','c^bc'] - if pin[header_pos['CPX_TYPE']] == 'dupINVdel': - ref_alt = ['abc','aba^'] - if pin[header_pos['CPX_TYPE']] == 'dupINVdup': - ref_alt = ['abc','ac^b^a^c'] - elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - segments = 
pin[header_pos['CPX_INTERVALS']].split(',') - cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) - if cpx_info == 'unresolved': - unresolved_svids.append(pin[header_pos['name']]) + chrom = pin[0] + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + if seg2[0] != chrom: + return True + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + return True + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if seg2[0] != chrom: + return True + return False + + +def cpx_sv_readin(input_bed, header_pos, unresolved): + unresolved_svids = [] + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + small_sv_size_threshold = 250 + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + # label pidups as unresolved since manual inspection indicated low quality + if not pin[0][0] == "#": + if pin[header_pos['CPX_TYPE']] in ['piDUP_RF', 'piDUP_FR']: + unresolved_svids.append(pin[header_pos['name']]) + elif not is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['delINV', 'INVdel', 'dupINV', 'INVdup', 'delINVdel', 'delINVdup', + 'dupINVdel', 'dupINVdup']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + breakpoints = extract_bp_list_for_inv_cnv_events(segments, pin[header_pos['CPX_TYPE']]) + if pin[header_pos['CPX_TYPE']] == 'delINV': + ref_alt = ['ab', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'INVdel': + ref_alt = ['ab', 'a^'] + if pin[header_pos['CPX_TYPE']] == 'dupINV': + ref_alt = ['ab', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'INVdup': + ref_alt = ['ab', 'b^ab'] + if pin[header_pos['CPX_TYPE']] == 'delINVdel': + ref_alt = ['abc', 'b^'] + if pin[header_pos['CPX_TYPE']] == 'delINVdup': + ref_alt = ['abc', 'c^bc'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdel': + ref_alt = ['abc', 'aba^'] + if pin[header_pos['CPX_TYPE']] == 'dupINVdup': + ref_alt = ['abc', 'ac^b^a^c'] + elif pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + segments = pin[header_pos['CPX_INTERVALS']].split(',') + cpx_info = extract_bp_list_for_ddup_events(pin[:3], segments, small_sv_size_threshold) + if cpx_info == 'unresolved': + unresolved_svids.append(pin[header_pos['name']]) + continue + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + else: + segments = pin[header_pos['SOURCE']].split(',') + cpx_info = extract_bp_list_v4(pin[:3], segments, small_sv_size_threshold) + ref_alt = cpx_info[1] + breakpoints = cpx_info[0] + out.append([breakpoints, ref_alt, pin[3]]) + else: continue - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] - elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: - segments = pin[header_pos['SOURCE']].split(',') - cpx_info = extract_bp_list_for_ins_idel(pin[:3], segments, small_sv_size_threshold) - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] + fin.close() + 
with open(unresolved, 'w') as unres: + for svid in unresolved_svids: + unres.write(svid + "\n") + return out + + +def cpx_inter_chromo_sv_readin(input_bed, header_pos): + fin = os.popen(r'''zcat %s''' % input_bed) + out = [] + chr_list = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY'] + bp = [] + ref_alt = [] + for line in fin: + pin = line.strip().split('\t') + if not pin[0][0] == "#": + if is_interchromosomal(pin, header_pos): + if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']: + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'cb_c'] + else: + ref_alt = ['a_b', 'ba_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba'] + else: + ref_alt = ['a_b', 'a_ba'] + elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']: + seg1 = pin[:3] + seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]] + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + ref_alt = [pin[header_pos['CPX_TYPE']]] + elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']] == "INS": + seg1 = pin[:3] + seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':') + seg2 = [seg2[0]] + seg2[1].split('-') + if chr_list.index(seg1[0]) < chr_list.index(seg2[0]): + bp = [[seg1[0]] + [int(i) for i in seg1[1:]], [seg2[0]] + [int(i) for i in seg2[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['ab_c', 'c^b_c'] + else: + ref_alt = ['a_b', 'b^a_b'] + elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]): + bp = [[seg2[0]] + [int(i) for i in seg2[1:]], [seg1[0]] + [int(i) for i in seg1[1:]]] + if int(seg1[2]) - int(seg1[1]) > 250: + ref_alt = ['a_bc', 'a_ba^'] + else: + ref_alt = ['a_b', 'a_ba^'] + out.append([bp, ref_alt, pin[3]]) + fin.close() + return out + + +def cpx_sample_batch_readin(cpx_sv, svid_sample, sample_pe, pe_evidence, out_file): + out = [] + flank_back = 1000 + flank_front = 100 + fo = open(out_file, 'w') + for info in cpx_sv: + breakpoints = info[0] + if info[1][0] == 'ab' and info[1][1] == 'b^': # delINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + 
':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'b^': # delINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'c^bc': # delINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba^': # dupINV + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'ac^b^a^c': # dupINVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 
= ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba^': # dupINVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'a^': # INVdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'b^ab': # INVdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aba': # dupINS + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", 
'>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'aba': # dupINSdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[4] - flank_front), '&&', '$5<' + str(breakpoints[4] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'aa': # dupdel + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_back) + '-' + str(breakpoints[2] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[2] - flank_back), '&&', '$5<' + str(breakpoints[2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bab': # INSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), '&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_front) + '-' + str(breakpoints[1] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'abc' and info[1][1] == 'cbc': # delINSdup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[3] - flank_front), '&&', '$5<' + str(breakpoints[3] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[4] - flank_back), '&&', '$5<' + str(breakpoints[4] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1][0] == 'ab' and info[1][1] == 'bb': # deldup + common_1 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[1] - flank_back) + '-' + str(breakpoints[1] + flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>' + str(breakpoints[2] - flank_front), 
'&&', '$5<' + str(breakpoints[2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0] + ':' + str(breakpoints[2] - flank_front) + '-' + str(breakpoints[2] + flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>' + str(breakpoints[3] - flank_back), '&&', '$5<' + str(breakpoints[3] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'ba_b'] or info[1] == ['ab_c', 'cb_c']: # ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba'] or info[1] == ['a_bc', 'a_ba']: # ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: # inverted insertion, insertion from the smaller chromosome to the larger chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc', 'a_ba^']: # inverted insertion, insertion from the larger chromosome to the smaller 
chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_front) + '-' + str(breakpoints[0][1] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_back) + '-' + str(breakpoints[0][2] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['CTX_PQ/QP']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_back), '&&', '$5<' + str(breakpoints[1][1] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_front), '&&', '$5<' + str(breakpoints[1][2] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + elif info[1] == ['CTX_PP/QQ']: # inverted insertion, insertion from the larger chromosome to the smaller chromosome + common_1 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][1] - flank_back) + '-' + str(breakpoints[0][1] + flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][1] - flank_front), '&&', '$5<' + str(breakpoints[1][1] + flank_back), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] + common_2 = ['tabix', 'PE_metrics', breakpoints[0][0] + ':' + str(breakpoints[0][2] - flank_front) + '-' + str(breakpoints[0][2] + flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpoints[1][0] + '" &&', '$5>' + str(breakpoints[1][2] - flank_back), '&&', '$5<' + str(breakpoints[1][2] + flank_front), ") print}' | sed -e 's/$/\\t", info[2], "/'", '>>', pe_evidence] else: - segments = pin[header_pos['SOURCE']].split(',') - cpx_info = extract_bp_list_V4(pin[:3], segments, small_sv_size_threshold) - ref_alt = cpx_info[1] - breakpoints = cpx_info[0] - out.append([breakpoints, ref_alt,pin[3]]) - else: - continue - fin.close() - with open(unresolved, 'w') as unres: - for svid in unresolved_svids: - unres.write(svid + "\n") - return out - -def cpx_inter_chromo_SV_readin(input_bed, header_pos): - fin=os.popen(r'''zcat %s'''%(input_bed)) - out = [] - chr_list = ['chr'+str(i) for i in range(1,23)]+['chrX','chrY'] - for line in fin: - pin=line.strip().split('\t') - if not pin[0][0]=="#": - if is_interchromosomal(pin, header_pos): - if pin[header_pos['CPX_TYPE']] in ['dDUP', 'dDUP_iDEL']: - seg1 = pin[:3] - seg2 = [i for i in pin[header_pos['CPX_INTERVALS']].split(',') if 'DUP_' in 
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'cb_c']
-                        else:
-                            ref_alt = ['a_b', 'ba_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba']
-                        else:
-                            ref_alt = ['a_b', 'a_ba']
-                elif pin[header_pos['CPX_TYPE']] in ['INS_iDEL']:
-                    seg1 = pin[:3]
-                    seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INS_' in i][0].split('_')[1].split(':')
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'cb_c']
-                        else:
-                            ref_alt = ['a_b', 'ba_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba']
-                        else:
-                            ref_alt = ['a_b', 'a_ba']
-                elif pin[header_pos['CPX_TYPE']] in ['CTX_PQ/QP', 'CTX_PP/QQ'] or pin[header_pos['SVTYPE']] in ['CTX']:
-                    seg1 = pin[:3]
-                    seg2 = [pin[header_pos['CHR2']], pin[header_pos['END2']], pin[header_pos['END2']]]
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                    ref_alt = [pin[header_pos['CPX_TYPE']]]
-                elif "INV_" in pin[header_pos['SOURCE']] and pin[header_pos['SVTYPE']]=="INS":
-                    seg1 = pin[:3]
-                    seg2 = [i for i in pin[header_pos['SOURCE']].split(',') if 'INV_' in i][0].split('_')[1].split(':')
-                    seg2 = [seg2[0]]+seg2[1].split('-')
-                    if chr_list.index(seg1[0]) < chr_list.index(seg2[0]):
-                        bp = [[seg1[0]]+[int(i) for i in seg1[1:]], [seg2[0]]+[int(i) for i in seg2[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['ab_c', 'c^b_c']
-                        else:
-                            ref_alt = ['a_b', 'b^a_b']
-                    elif chr_list.index(seg1[0]) > chr_list.index(seg2[0]):
-                        bp = [[seg2[0]]+[int(i) for i in seg2[1:]], [seg1[0]]+[int(i) for i in seg1[1:]]]
-                        if int(seg1[2])-int(seg1[1])>250:
-                            ref_alt = ['a_bc','a_ba^']
-                        else:
-                            ref_alt = ['a_b', 'a_ba^']
-                out.append([bp, ref_alt, pin[3]])
-    fin.close()
-    return out
-
-def cpx_sample_batch_readin(cpx_SV, SVID_sample,sample_pe, PE_evidence, out_file):
-    out = []
-    flank_back = 1000
-    flank_front = 100
-    fo=open(out_file,'w')
-    for info in cpx_SV:
-        breakpints = info[0]
-        if info[1][0] == 'ab' and info[1][1]=='b^': #delINV
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='b^': #delINVdel
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='c^bc': #delINVdup
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'ab' and info[1][1]=='aba^': #dupINV
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='ac^b^a^c': #dupINVdup
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1][0] == 'abc' and info[1][1]=='aba^': #dupINVdel
-            common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='a^': #INVdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='b^ab': #INVdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='aba': #dupINS - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'abc' and info[1][1]=='aba': #dupINSdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[4]-flank_front) , '&&', '$5<'+str(breakpints[4]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='aa': #dupdel - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_back)+'-'+str(breakpints[2]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 
's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[2]-flank_back) , '&&', '$5<'+str(breakpints[2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='bab': #INSdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_front)+'-'+str(breakpints[1]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'abc' and info[1][1]=='cbc': #delINSdup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[3]-flank_front) , '&&', '$5<'+str(breakpints[3]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[4]-flank_back) , '&&', '$5<'+str(breakpints[4]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1][0] == 'ab' and info[1][1]=='bb': #deldup - common_1 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[1]-flank_back)+'-'+str(breakpints[1]+flank_front), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="+" && $6=="-"', '&&', '$5>'+str(breakpints[2]-flank_front) , '&&', '$5<'+str(breakpints[2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0]+':'+str(breakpints[2]-flank_front)+'-'+str(breakpints[2]+flank_back), '| grep', 'sample', '| awk', "'{if ($1==$4", '&&', '$3=="-" && $6=="+"', '&&', '$5>'+str(breakpints[3]-flank_back) , '&&', '$5<'+str(breakpints[3]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - elif info[1] == ['a_b','ba_b'] or info[1] == ['ab_c','cb_c']: #ddup or ddup_iDEL, insertion from the smaller chromosome to the larger chromosome - common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence] - common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', 
-        elif info[1] == ['a_b','a_ba'] or info[1] == ['a_bc','a_ba']: #ddup or ddup_iDEL, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['a_b', 'b^a_b'] or info[1] == ['ab_c', 'c^b_c']: #inverted insertion, insertion from the smaller chromosome to the larger chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['a_b', 'a_ba^'] or info[1] == ['a_bc','a_ba^']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_front)+'-'+str(breakpints[0][1]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_back)+'-'+str(breakpints[0][2]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['CTX_PQ/QP']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_back) , '&&', '$5<'+str(breakpints[1][1]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_front) , '&&', '$5<'+str(breakpints[1][2]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        elif info[1] == ['CTX_PP/QQ']: #inverted insertion, insertion from the larger chromosome to the smaller chromosome
-            common_1 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][1]-flank_back)+'-'+str(breakpints[0][1]+flank_front), '| grep', 'sample', '| awk', "'{if (", '$3=="+" && $6=="-"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][1]-flank_front) , '&&', '$5<'+str(breakpints[1][1]+flank_back), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-            common_2 = ['tabix', 'PE_metrics', breakpints[0][0]+':'+str(breakpints[0][2]-flank_front)+'-'+str(breakpints[0][2]+flank_back), '| grep', 'sample', '| awk', "'{if (", '$3=="-" && $6=="+"', '&&', '$4=="' + breakpints[1][0] +'" &&', '$5>'+str(breakpints[1][2]-flank_back) , '&&', '$5<'+str(breakpints[1][2]+flank_front), ") print}' | sed -e 's/$/\\t", info[2],"/'", '>>', PE_evidence]
-        else:
-            print(info)
-        samples = SVID_sample[info[2]]
-        if not samples =='' and not samples=='NA':
-            sample_list = samples.split(',')
-            pe_metrics_list = [sample_pe[samp] for samp in sample_list]
-            for num in range(len(sample_list)):
-                common_1[1] = pe_metrics_list[num]
-                common_2[1] = pe_metrics_list[num]
-                common_1[4] = sample_list[num]
-                common_2[4] = sample_list[num]
-                write_1 = ' '.join(common_1)
-                write_2 = ' '.join(common_2)
-                print(write_1, file=fo)
-                print(write_2, file=fo)
-
-    fo.close()
-    return out
+            print(info)
+            common_1 = None
+            common_2 = None
+        samples = svid_sample[info[2]]
+        if not samples == '' and not samples == 'NA':
+            sample_list = samples.split(',')
+            pe_metrics_list = [sample_pe[samp] for samp in sample_list]
+            for num in range(len(sample_list)):
+                common_1[1] = pe_metrics_list[num]
+                common_2[1] = pe_metrics_list[num]
+                common_1[4] = sample_list[num]
+                common_2[4] = sample_list[num]
+                write_1 = ' '.join(common_1)
+                write_2 = ' '.join(common_2)
+                print(write_1, file=fo)
+                print(write_2, file=fo)
+
+    fo.close()
+    return out
+
 def write_cpx_command(cpx_command, out_file):
-    fo=open(out_file,'w')
-    for i in cpx_command:
-        print(i, file=fo)
-    fo.close()
-
-def write_cpx_SVs(cpx_SV, out_file):
-    fo=open(out_file,'w')
-    for i in cpx_SV:
-        out = [i[2],':'.join([str(j) for j in i[0]])]+i[1]
-        print('\t'.join(out), file=fo)
-    fo.close()
+    fo = open(out_file, 'w')
+    for i in cpx_command:
+        print(i, file=fo)
+    fo.close()
+
+
+def write_cpx_svs(cpx_sv, out_file):
+    fo = open(out_file, 'w')
+    for i in cpx_sv:
+        out = [i[2], ':'.join([str(j) for j in i[0]])] + i[1]
+        print('\t'.join(out), file=fo)
+    fo.close()
+
 def main():
-    """
-    Command-line main block
-    """
-
-    # Parse command line arguments and options
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('-i','--input', required=True, help='input file of CPX events in bed format')
-    parser.add_argument('-s','--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
-    parser.add_argument('-p','--pe_evidence', required=True, help='name of file to store collected PE metrics')
-    parser.add_argument('-c','--command_script', required=True, help='name of file that has scripts to collect PE evidences')
-    parser.add_argument('-r','--reformat_SV', required=True, help='reformatted SV in svelter format')
-    parser.add_argument('-u','--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround') # TODO
-    args = parser.parse_args()
-
-    input_bed = args.input
-    sample_pe_file = args.sample_pe
-    PE_evidence = args.pe_evidence
-    command_script = args.command_script
-    reformatted_SV = args.reformat_SV
-
-    header_pos = header_pos_readin(input_bed)
-    sample_pe = sample_pe_readin(sample_pe_file)
-    SVID_sample = SVID_sample_readin(input_bed, header_pos)
-
-    cpx_SV = cpx_SV_readin(input_bed, header_pos, args.unresolved)
-    cpx_inter_chromo_SV = cpx_inter_chromo_SV_readin(input_bed, header_pos)
-    write_cpx_SVs(cpx_SV+cpx_inter_chromo_SV, reformatted_SV)
-
-    cpx_sample_batch_readin(cpx_SV+cpx_inter_chromo_SV, SVID_sample, sample_pe, PE_evidence, command_script)
+    """
+    Command-line main block
+    """
-import os
-import sys
-import argparse
-if __name__ == '__main__':
-    main()
+    # Parse command line arguments and options
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
+    parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
+    parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')
+    parser.add_argument('-c', '--command_script', required=True, help='name of file that has scripts to collect PE evidences')
+    parser.add_argument('-r', '--reformat_SV', required=True, help='reformatted SV in svelter format')
+    parser.add_argument('-u', '--unresolved', required=True, help='list of SVIDs to mark unresolved without evaluating (temporary workaround') # TODO
+    args = parser.parse_args()
+    input_bed = args.input
+    sample_pe_file = args.sample_pe
+    pe_evidence = args.pe_evidence
+    command_script = args.command_script
+    reformatted_sv = args.reformat_SV
+
+    header_pos = header_pos_readin(input_bed)
+    sample_pe = sample_pe_readin(sample_pe_file)
+    svid_sample = svid_sample_readin(input_bed, header_pos)
+
+    cpx_sv = cpx_sv_readin(input_bed, header_pos, args.unresolved)
+    cpx_inter_chromo_sv = cpx_inter_chromo_sv_readin(input_bed, header_pos)
+    write_cpx_svs(cpx_sv + cpx_inter_chromo_sv, reformatted_sv)
+
+    cpx_sample_batch_readin(cpx_sv + cpx_inter_chromo_sv, svid_sample, sample_pe, pe_evidence, command_script)
+
+
+if __name__ == '__main__':
+    main()
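For orientation: the refactored `cpx_sample_batch_readin` above assembles one `tabix | grep | awk | sed` one-liner per breakpoint per carrier sample, selecting discordant read pairs with the strand orientation expected for each rearrangement structure. A minimal sketch of the shape of a generated command — sample ID, file names, and coordinates here are made up, and the real script substitutes each sample's PE-metrics file and the variant's SVID:

```python
# Illustrative only: one PE-evidence command for a delINV-like (+/+) junction.
flank_back, flank_front = 1000, 100          # same flank sizes as in the script
chrom, bp1, bp2 = "chr1", 1154129, 1161000   # hypothetical breakpoints
svid, sample = "cpx_example_1", "HG00096"
pe_file, pe_evidence = "HG00096.pe.txt.gz", "pe_evidence.txt"

cmd = (
    f"tabix {pe_file} {chrom}:{bp1 - flank_back}-{bp1 + flank_front} "
    f"| grep {sample} "
    # intra-chromosomal pairs ($1==$4) in +/+ orientation whose mates fall
    # within the flanked window around the second breakpoint
    f"| awk '{{if ($1==$4 && $3==\"+\" && $6==\"+\" && "
    f"$5>{bp2 - flank_back} && $5<{bp2 + flank_front}) print}}' "
    f"| sed -e 's/$/\\t{svid}/' >> {pe_evidence}"
)
print(cmd)
```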
From 01c44b245a8bf884a5ea45723f7c867fd39d59cc Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 21 Oct 2024 17:09:38 -0400
Subject: [PATCH 18/28] lint

---
 .../scripts/reformat_CPX_bed_and_generate_script.py | 4 ++--
 wdl/RefineComplexVariants.wdl                       | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
index 8a90e16f3..d37ac7e04 100644
--- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -468,8 +468,8 @@ def main():
 
     # Parse command line arguments and options
     parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
+            description=__doc__,
+            formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
     parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
     parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
index 2867d902c..da75ef7c3 100644
--- a/wdl/RefineComplexVariants.wdl
+++ b/wdl/RefineComplexVariants.wdl
@@ -45,7 +45,6 @@ workflow RefineComplexVariants {
         RuntimeAttr? runtime_attr_generate_cpx_review_script
         RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
         RuntimeAttr? runtime_attr_get_vcf_header_with_members_info_line
-        RuntimeAttr? runtime_attr_generate_cnv_segments_from_cpx
         RuntimeAttr? runtime_attr_extract_cpx_lg_cnv_by_batch
         RuntimeAttr? runtime_attr_seek_depth_supp_for_cpx
         RuntimeAttr? runtime_attr_concat_bed_Step1

From 16f436ba67ba57dfe312e2effa9a7d5b1318d798 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 21 Oct 2024 17:20:20 -0400
Subject: [PATCH 19/28] lint

---
 .../scripts/reformat_CPX_bed_and_generate_script.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
index d37ac7e04..8b4b8cef8 100644
--- a/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
+++ b/src/sv-pipeline/scripts/reformat_CPX_bed_and_generate_script.py
@@ -468,8 +468,8 @@ def main():
 
     # Parse command line arguments and options
     parser = argparse.ArgumentParser(
-            description=__doc__,
-            formatter_class=argparse.RawDescriptionHelpFormatter)
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter)
     parser.add_argument('-i', '--input', required=True, help='input file of CPX events in bed format')
     parser.add_argument('-s', '--sample_pe', required=True, help='2 column file with sample ID and PE metrics information')
     parser.add_argument('-p', '--pe_evidence', required=True, help='name of file to store collected PE metrics')

From 70a6bfbd2181eedfaa5288922e6a208b225f0418 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 11:35:28 -0400
Subject: [PATCH 20/28] update pipeline diagram, remove UNRESOLVED_TYPE for ME DELs

---
 terra_pipeline_diagram.jpg | 4 ++--
 wdl/CleanVcfChromosome.wdl | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/terra_pipeline_diagram.jpg b/terra_pipeline_diagram.jpg
index e3af1c2f7..226f13c2c 100644
--- a/terra_pipeline_diagram.jpg
+++ b/terra_pipeline_diagram.jpg
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:7312ab3c1e136b98f3721606ecec017a1a2f6787144aaaf12fe16ab00a9a6a53
-size 295450
+oid sha256:e7b763bd8386fb082264ae43a5ed9ca922b4dd9763f58e079d2028375fb8e8af
+size 307785
diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index 41df87262..f360f66fb 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -690,6 +690,7 @@ for record in fin:
     record.stop = record.info['END2']
     record.info.pop("CHR2")
     record.info.pop("END2")
+    record.info.pop("UNRESOLVED_TYPE")
     if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
         record.alts = ('<DEL:ME:LINE1>',)
     if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
         record.alts = ('<DEL:ME:HERVK>',)
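The CleanVcfChromosome hunk above drops `UNRESOLVED_TYPE` alongside `CHR2`/`END2` when a breakend-style record is rewritten as a mobile-element deletion. A hedged pysam sketch of the same INFO clean-up pattern — paths are placeholders, and keys are popped defensively only when present:

```python
import pysam

vcf = pysam.VariantFile("input.vcf.gz")
out = pysam.VariantFile("output.vcf.gz", "w", header=vcf.header)
for record in vcf:
    # drop fields that no longer apply once the record becomes an ME DEL
    for key in ("CHR2", "END2", "UNRESOLVED_TYPE"):
        if key in record.info:
            record.info.pop(key)
    out.write(record)
out.close()
```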
From 97b97cbf4b89e6b7863a144339f4f7fbe7729900 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 13:34:28 -0400
Subject: [PATCH 21/28] 1 cpu for rescue me dels

---
 wdl/CleanVcfChromosome.wdl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index f360f66fb..a14ffa8c4 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -620,14 +620,10 @@ task RescueMobileElementDeletions {
   }

   Float input_size = size(vcf, "GiB")
-  # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor:
-  # in tests on large VCFs, memory usage is ~1.0 * input VCF size
-  # the biggest disk usage is at the end of the task, with input + output VCF on disk
-  Int cpu_cores = 2 # speed up compression / decompression of VCFs
   RuntimeAttr runtime_default = object {
     mem_gb: 3.75 + input_size * 1.5,
     disk_gb: ceil(100.0 + input_size * 3.0),
-    cpu_cores: cpu_cores,
+    cpu_cores: 1,
     preemptible_tries: 3,
     max_retries: 1,
     boot_disk_gb: 10

From 256e24d1b3ad118a39d5e342102fd6b6aa010d8d Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 14:46:10 -0400
Subject: [PATCH 22/28] include existing no-call GTs in NCR calculation

---
 src/sv-pipeline/scripts/apply_sl_filter.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py
index 74187eb01..e0fa85c66 100644
--- a/src/sv-pipeline/scripts/apply_sl_filter.py
+++ b/src/sv-pipeline/scripts/apply_sl_filter.py
@@ -114,6 +114,8 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         n_samples += 1
         sl = gt.get('SL', None)
         if sl is None:
+            if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']):
+                n_no_call += 1
             continue
         gt_is_ref = _is_hom_ref(gt['GT'])
         # SL metrics are over non-ref / no-call samples
@@ -128,6 +130,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter)
         if fails_filter:
             gt['GT'] = _filter_gt(gt['GT'], allele)
+            if _is_no_call('GT'): n_no_call += 1

     # Annotate metrics
     record.info['NCN'] = n_no_call
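PATCH 22/28 extends the no-call number (NCN) to cover genotypes that arrive without an SL annotation, not just genotypes nulled by the SL filter. Note that the second hunk's `_is_no_call('GT')` passes the literal string rather than the genotype tuple; PATCH 26/28 below corrects this. A minimal sketch of the intended bookkeeping — the helper name and GT representation are assumptions:

```python
def is_no_call(gt_alleles):
    """True if every allele in the GT tuple is missing (None)."""
    return all(a is None for a in gt_alleles)

# two of four genotypes are no-call, whether pre-existing or filter-nulled
genotypes = [(0, 1), (None, None), (0, 0), (None, None)]
ncn = sum(1 for gt in genotypes if is_no_call(gt))
print(ncn, ncn / len(genotypes))  # 2 0.5  (NCR = NCN / N)
```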
From 15282bc7247fe79b10127a64dd6a5e9c3a730d81 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 15:47:59 -0400
Subject: [PATCH 23/28] svconcordance should use cpx refined vcf as input now

---
 .../cohort_mode/workflow_configurations/SVConcordance.json.tmpl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
index ff48489ac..1c9dfed26 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/SVConcordance.json.tmpl
@@ -2,7 +2,7 @@
   "SVConcordance.gatk_docker": "${workspace.gatk_docker}",
   "SVConcordance.sv_base_mini_docker": "${workspace.sv_base_mini_docker}",

-  "SVConcordance.eval_vcf" : "${this.cleaned_vcf}",
+  "SVConcordance.eval_vcf" : "${this.cpx_refined_vcf}",
   "SVConcordance.truth_vcf" : "${this.joined_raw_calls_vcf}",
   "SVConcordance.output_prefix": "${this.sample_set_set_id}",

From 028fc6852b180dea0774f1d227d78641c578f69a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Tue, 22 Oct 2024 16:56:21 -0400
Subject: [PATCH 24/28] reduce resources for a few tasks based on monitoring

---
 wdl/CollectLargeCNVSupportForCPX.wdl |  4 ++--
 wdl/CollectPEMetricsForCPX.wdl       |  2 +-
 wdl/CollectPEMetricsPerBatchCPX.wdl  |  4 ++--
 wdl/RefineComplexVariants.wdl        | 24 ++++++++++++------------
 4 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/wdl/CollectLargeCNVSupportForCPX.wdl b/wdl/CollectLargeCNVSupportForCPX.wdl
index 9675bac4d..6c943238a 100644
--- a/wdl/CollectLargeCNVSupportForCPX.wdl
+++ b/wdl/CollectLargeCNVSupportForCPX.wdl
@@ -189,9 +189,9 @@ task GenerateCnvSegmentFromCpx {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
diff --git a/wdl/CollectPEMetricsForCPX.wdl b/wdl/CollectPEMetricsForCPX.wdl
index 746f6a907..9a6fbd0c1 100644
--- a/wdl/CollectPEMetricsForCPX.wdl
+++ b/wdl/CollectPEMetricsForCPX.wdl
@@ -70,7 +70,7 @@ task CalcuPEStat {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
         boot_disk_gb: 10,
         preemptible_tries: 3,
diff --git a/wdl/CollectPEMetricsPerBatchCPX.wdl b/wdl/CollectPEMetricsPerBatchCPX.wdl
index 655959748..3cc65dbeb 100644
--- a/wdl/CollectPEMetricsPerBatchCPX.wdl
+++ b/wdl/CollectPEMetricsPerBatchCPX.wdl
@@ -66,7 +66,7 @@ task CollectPEMetrics {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: ceil(30.0 + size(PE_metric, "GiB") * 3),
         boot_disk_gb: 10,
         preemptible_tries: 3,
@@ -123,7 +123,7 @@ task ConcatEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
         boot_disk_gb: 10,
         preemptible_tries: 3,
diff --git a/wdl/RefineComplexVariants.wdl b/wdl/RefineComplexVariants.wdl
index da75ef7c3..4340ef253 100644
--- a/wdl/RefineComplexVariants.wdl
+++ b/wdl/RefineComplexVariants.wdl
@@ -218,9 +218,9 @@ task GetSampleBatchPEMap {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10 + 2*ceil(size(flatten([batch_sample_lists]), "GB")),
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -280,9 +280,9 @@ task ReviseVcf {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 7.5,
+        mem_gb: 3.75,
         disk_gb: ceil(5.0 + size(vcf_file, "GB")*3),
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -425,9 +425,9 @@ task SplitCpxCtx {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 7.5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -481,9 +481,9 @@ task GenerateCpxReviewScript {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -537,9 +537,9 @@ task CalculateCpxEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
@@ -691,9 +691,9 @@ task CalculateCtxEvidences {
     RuntimeAttr default_attr = object {
         cpu_cores: 1,
-        mem_gb: 5,
+        mem_gb: 3.75,
         disk_gb: 10,
-        boot_disk_gb: 30,
+        boot_disk_gb: 10,
         preemptible_tries: 1,
         max_retries: 1
     }
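The resource reductions above pin most tasks to 1 CPU and 3.75 GB of memory while letting disk scale with input size. A hedged illustration of that sizing rule — the function name is made up, and the constants mirror the diffs:

```python
import math

def default_runtime_attrs(input_gib):
    """Sketch: fixed small memory, disk proportional to input size."""
    return {
        "cpu_cores": 1,
        "mem_gb": 3.75,
        "disk_gb": math.ceil(5.0 + input_gib * 3.0),  # input + output + slack
        "boot_disk_gb": 10,
    }

print(default_runtime_attrs(12.5))  # disk_gb -> 43
```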
From 1034a5c87a3e4403320f6066b9d632a4fb6a2c1a Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Thu, 24 Oct 2024 16:27:21 -0400
Subject: [PATCH 25/28] select PASS for MainVcfQc

---
 wdl/FilterGenotypes.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl
index a703862f1..74375aec5 100644
--- a/wdl/FilterGenotypes.wdl
+++ b/wdl/FilterGenotypes.wdl
@@ -35,7 +35,7 @@ workflow FilterGenotypes {
     Array[Array[String]]? sample_level_comparison_datasets  # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs]
     File? sample_renaming_tsv  # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort
     Boolean run_qc = true
-    String qc_bcftools_preprocessing_options = "-e 'FILTER~\"UNRESOLVED\" || FILTER~\"HIGH_NCR\"'"
+    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\"'"
     Int qc_sv_per_shard = 2500
     Int qc_samples_per_shard = 600
     RuntimeAttr? runtime_override_plot_qc_per_family

From 6e1c22ce0eec46bd1af222d24e83782d790567bd Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Fri, 25 Oct 2024 15:27:48 -0400
Subject: [PATCH 26/28] fix ncr

---
 src/sv-pipeline/scripts/apply_sl_filter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py
index e0fa85c66..c8640b861 100644
--- a/src/sv-pipeline/scripts/apply_sl_filter.py
+++ b/src/sv-pipeline/scripts/apply_sl_filter.py
@@ -130,7 +130,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol
         gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter)
         if fails_filter:
             gt['GT'] = _filter_gt(gt['GT'], allele)
-            if _is_no_call('GT'): n_no_call += 1
+            if _is_no_call(gt['GT']): n_no_call += 1

     # Annotate metrics
     record.info['NCN'] = n_no_call
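Why the one-character fix in PATCH 26/28 matters: the buggy call tests the literal string `'GT'`, whose characters are never `None`, so no filtered genotype was ever counted as a no-call. A quick demonstration with an assumed helper matching the diff:

```python
def _is_no_call(gt_alleles):
    # assumed implementation: all alleles missing
    return all(a is None for a in gt_alleles)

gt = {"GT": (None, None)}
print(_is_no_call("GT"))      # False: iterates the characters 'G' and 'T'
print(_is_no_call(gt["GT"]))  # True: inspects the genotype tuple
```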
"gs://gatk-sv-ref-panel-1kg/outputs/mw-make-cohort-vcf-templates/CombineBatches/cluster-background-fail-lists/ref_panel_1kg.chr1.sr_background_fail.updated2.txt", @@ -1329,6 +1329,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrX.regenotyped.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.chrY.regenotyped.vcf.gz.tbi" ], + "complex_refined_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", + "complex_refined_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.cpx_refined.vcf.gz", "complex_resolve_bothside_pass_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_bothside_pass.updated3.txt", "complex_resolve_background_fail_list": "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.all.sr_background_fail.updated3.txt", "complex_resolve_vcfs": [ @@ -1383,8 +1385,8 @@ "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrX.resharded.vcf.gz.tbi", "gs://gatk-sv-ref-panel-1kg/outputs/mw-vcf-reshard/ref_panel_1kg.reshard_vcf.chrY.resharded.vcf.gz.tbi" ], - "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz", - "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.concordance.vcf.gz.tbi", + "concordance_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz", + "concordance_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.concordance.vcf.gz.tbi", "contig_ploidy_model_tar": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/gcnv/ref_panel_1kg_v2-contig-ploidy-model.tar.gz", "counts": [ "gs://gatk-sv-ref-panel-1kg/outputs/tws-no-cram-conversion/GatherSampleEvidenceBatch/HG00096.counts.tsv.gz", @@ -1852,8 +1854,8 @@ "genotype_pesr_pesr_sepcutoff": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart1/GenotypePESRPart1/40ec6d76-dd1c-432d-bfab-bc4426d0b1ec/call-TrainRDGenotyping/TrainRDGenotyping/e5540a96-9072-4719-bcfb-afccdfec15c6/call-UpdateCutoff/cacheCopy/ref_panel_1kg.pesr.pesr_sepcutoff.txt", "genotyped_depth_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypeDepthPart2/GenotypeDepthPart2/0aafd752-e606-4196-86ac-41c1c3ce1eb2/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.depth.vcf.gz", "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", - "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz", - "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes.vcf.gz.tbi", + "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz", + "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi", "genotype_filtered_qc_tarball": 
"gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", diff --git a/src/sv-pipeline/scripts/apply_sl_filter.py b/src/sv-pipeline/scripts/apply_sl_filter.py index c8640b861..917b605fb 100644 --- a/src/sv-pipeline/scripts/apply_sl_filter.py +++ b/src/sv-pipeline/scripts/apply_sl_filter.py @@ -114,8 +114,6 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol n_samples += 1 sl = gt.get('SL', None) if sl is None: - if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']): - n_no_call += 1 continue gt_is_ref = _is_hom_ref(gt['GT']) # SL metrics are over non-ref / no-call samples @@ -130,7 +128,7 @@ def _apply_filter(record, sl_threshold, ploidy_dict, apply_hom_ref, ncr_threshol gt['GT_FILTER'] = _gt_to_filter_status(gt['GT'], fails_filter) if fails_filter: gt['GT'] = _filter_gt(gt['GT'], allele) - if _is_no_call(gt['GT']): + if record.info['SVTYPE'] != 'CNV' and _is_no_call(gt['GT']): n_no_call += 1 # Annotate metrics record.info['NCN'] = n_no_call From 50f78bc7ff124fc0fb458a967b7316c504879bc7 Mon Sep 17 00:00:00 2001 From: Emma Pierce-Hoffman Date: Mon, 28 Oct 2024 18:32:02 -0400 Subject: [PATCH 28/28] include multiallelics in default qc; update qc tarball for ref panel --- inputs/values/ref_panel_1kg.json | 2 +- wdl/FilterGenotypes.wdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json index e50439b34..f317967a5 100644 --- a/inputs/values/ref_panel_1kg.json +++ b/inputs/values/ref_panel_1kg.json @@ -1856,7 +1856,7 @@ "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz", "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi", - "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", + "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz", "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", "manta_vcfs": [ diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl index 74375aec5..8926bc31e 100644 --- a/wdl/FilterGenotypes.wdl +++ b/wdl/FilterGenotypes.wdl @@ -35,7 +35,7 @@ workflow FilterGenotypes { Array[Array[String]]? sample_level_comparison_datasets # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs] File? 
From 50f78bc7ff124fc0fb458a967b7316c504879bc7 Mon Sep 17 00:00:00 2001
From: Emma Pierce-Hoffman
Date: Mon, 28 Oct 2024 18:32:02 -0400
Subject: [PATCH 28/28] include multiallelics in default qc; update qc tarball
 for ref panel

---
 inputs/values/ref_panel_1kg.json | 2 +-
 wdl/FilterGenotypes.wdl          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/inputs/values/ref_panel_1kg.json b/inputs/values/ref_panel_1kg.json
index e50439b34..f317967a5 100644
--- a/inputs/values/ref_panel_1kg.json
+++ b/inputs/values/ref_panel_1kg.json
@@ -1856,7 +1856,7 @@
   "genotyped_pesr_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz",
   "genotype_filtered_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz",
   "genotype_filtered_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes.sanitized.vcf.gz.tbi",
-  "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/mw-filter-genotypes/1kgp/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz",
+  "genotype_filtered_qc_tarball": "gs://gatk-sv-ref-panel-1kg/outputs/eph-refine-cpx/ref_panel_1kg.filter_genotypes_SV_VCF_QC_output.tar.gz",
   "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz",
   "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi",
   "manta_vcfs": [
diff --git a/wdl/FilterGenotypes.wdl b/wdl/FilterGenotypes.wdl
index 74375aec5..8926bc31e 100644
--- a/wdl/FilterGenotypes.wdl
+++ b/wdl/FilterGenotypes.wdl
@@ -35,7 +35,7 @@ workflow FilterGenotypes {
     Array[Array[String]]? sample_level_comparison_datasets  # Array of two-element arrays, one per dataset, each of format [prefix, gs:// path to per-sample tarballs]
     File? sample_renaming_tsv  # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort
     Boolean run_qc = true
-    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\"'"
+    String qc_bcftools_preprocessing_options = "-i 'FILTER=\"PASS\" || FILTER=\"MULTIALLELIC\"'"
     Int qc_sv_per_shard = 2500
     Int qc_samples_per_shard = 600
     RuntimeAttr? runtime_override_plot_qc_per_family
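The final default QC filter keeps both PASS and MULTIALLELIC records. A rough pysam rendering, for intuition only (the path is a placeholder), of the bcftools include expression `-i 'FILTER="PASS" || FILTER="MULTIALLELIC"'`:

```python
import pysam

vcf = pysam.VariantFile("filter_genotypes.vcf.gz")
for rec in vcf:
    filters = set(rec.filter.keys())
    if "PASS" in filters or "MULTIALLELIC" in filters:
        pass  # record would be included in MainVcfQc
```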