From 968ab5bcb041ec23f3d05b09b96c4377a852db8c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 6 Aug 2024 14:21:37 -0400 Subject: [PATCH 01/29] Initial commit --- .../scripts/identify_duplicates.py | 131 ++++++++++++++++++ src/sv-pipeline/scripts/merge_duplicates.py | 74 ++++++++++ wdl/MainVcfQc.wdl | 122 ++++++++++++++++ 3 files changed, 327 insertions(+) create mode 100644 src/sv-pipeline/scripts/identify_duplicates.py create mode 100644 src/sv-pipeline/scripts/merge_duplicates.py diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py new file mode 100644 index 000000000..e92d7a3a4 --- /dev/null +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -0,0 +1,131 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Identify and classify duplicated variants from a sorted input VCF +""" + +from typing import List, Text, Optional +from collections import defaultdict +from itertools import combinations + +import argparse +import sys +import pysam + + +def process_duplicates(vcf, fout): + # Initialize counters and buffers + counts = defaultdict(int) + exact_buffer = [] + ins_buffer = [] + current_chrom = None + current_pos = None + + # Create output files + with open(f"{fout}_records.tsv", 'w') as f_records, open(f"{fout}_counts.tsv", 'w') as f_counts: + f_records.write("TYPE\tDUP_RECORDS\n") + f_counts.write("TYPE\tDUP_COUNTS\n") + + # Iterate through all records + for record in vcf.fetch(): + # Process current buffers if we've reached a new chrom or pos + if record.chrom != current_chrom or record.pos != current_pos: + process_buffers(exact_buffer, ins_buffer, counts, f_records) + exact_buffer = [] + ins_buffer = [] + current_chrom = record.chrom + current_pos = record.pos + + # Update buffers with new record + exact_key = ( + record.chrom, + record.pos, + record.stop, + record.info.get('SVTYPE'), + record.info.get('SVLEN'), + record.info.get('CHR2'), + record.info.get('END2'), + record.info.get('STRANDS'), + record.info.get('CPX_TYPE'), + record.info.get('CPX_INTERVALS') + ) + exact_buffer.append((exact_key, record.id)) + + if record.info.get('SVTYPE') == 'INS': + insert_key = ( + record.id, + record.info.get('SVLEN'), + record.alts[0] + ) + ins_buffer.append(insert_key) + + # Process remaining records in the buffer + process_buffers(exact_buffer, ins_buffer, counts, f_records) + + # Write counts to file + for match_type in sorted(counts.keys()): + f_counts.write(f"{match_type}\t{counts[match_type]}\n") + + +def process_buffers(exact_buffer, ins_buffer, counts, f_records): + # Process exact matches + exact_matches = defaultdict(list) + for key, record_id in exact_buffer: + exact_matches[key].append(record_id) + + for records in exact_matches.values(): + if len(records) > 1: + counts['Exact'] += 1 + f_records.write(f"Exact\t{','.join(sorted(records))}\n") + + # Process INS matches + for i in range(len(ins_buffer)): + for j in range(i+1, len(ins_buffer)): + rec1, rec2 = ins_buffer[i], ins_buffer[j] + + # Size comparison + if rec1[1] == rec2[1]: + counts['INS 100%'] += 1 + f_records.write(f"INS 100%\t{rec1[0]},{rec2[0]}\n") + elif abs(rec1[1] - rec2[1]) <= 0.5 * max(rec1[1], rec2[1]): + counts['INS 50%'] += 1 + f_records.write(f"INS 50%\t{rec1[0]},{rec2[0]}\n") + else: + counts['INS 0%'] += 1 + f_records.write(f"INS 0%\t{rec1[0]},{rec2[0]}\n") + + # ALT comparison + if rec1[2] == rec2[2]: + counts['INS ALT Identical'] += 1 + f_records.write(f"INS ALT Identical\t{rec1[0]},{rec2[0]}\n") + elif ('' in (rec1[2], rec2[2])) and (' 
argparse.Namespace: + parser = argparse.ArgumentParser( + description="Identify duplicated records from a sorted input VCF", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('-v', '--vcf', required=True, help='Input VCF.') + parser.add_argument('-f', '--fout', required=True, help='Output file name.') + parsed_arguments = parser.parse_args(argv[1:]) + return parsed_arguments + + +def main(argv: Optional[List[Text]] = None): + if argv is None: + argv = sys.argv + args = _parse_arguments(argv) + + vcf = pysam.VariantFile(args.vcf) + process_duplicates(vcf, args.fout) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/scripts/merge_duplicates.py b/src/sv-pipeline/scripts/merge_duplicates.py new file mode 100644 index 000000000..86b9dbf6e --- /dev/null +++ b/src/sv-pipeline/scripts/merge_duplicates.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Merge duplicate records and counts files across multiple TSVs +""" + +from typing import List, Text, Optional +from collections import defaultdict + +import argparse +import sys +import csv + + +def merge_duplicates(record_files: List[str], count_files: List[str], fout: str): + # Merge records + with open(f"{fout}_records.tsv", 'w', newline='') as out_records: + writer = csv.writer(out_records, delimiter='\t') + + # Write header + writer.writerow(['TYPE', 'DUP_RECORDS']) + + # Append records from each file + for record_file in record_files: + with open(record_file, 'r') as f: + reader = csv.reader(f, delimiter='\t') + next(reader) + for row in reader: + writer.writerow(row) + + # Sum counts + counts = defaultdict(int) + for count_file in count_files: + with open(count_file, 'r') as f: + reader = csv.reader(f, delimiter='\t') + next(reader) + for row in reader: + counts[row[0]] += int(row[1]) + + # Merge counts + with open(f"{fout}_counts.tsv", 'w', newline='') as out_counts: + writer = csv.writer(out_counts, delimiter='\t') + + # Write header + writer.writerow(['TYPE', 'DUP_COUNTS']) + + # Append each row from merged counts + for category, count in counts.items(): + writer.writerow([category, count]) + + +def _parse_arguments(argv: List[Text]) -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Merge duplicate records and counts files across multiple TSVs", + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument('-r', '--records', nargs='+', required=True, help='Input duplicated record TSV files.') + parser.add_argument('-c', '--counts', nargs='+', required=True, help='Input duplicated counts TSV files.') + parser.add_argument('-f', '--fout', required=True, help='Output file name.') + parsed_arguments = parser.parse_args(argv[1:]) + return parsed_arguments + + +def main(argv: Optional[List[Text]] = None): + if argv is None: + argv = sys.argv + args = _parse_arguments(argv) + + merge_duplicates(args.records, args.counts, args.fout) + + +if __name__ == '__main__': + main() diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 04f642e62..7b10372be 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -39,6 +39,8 @@ workflow MainVcfQc { RuntimeAttr? runtime_override_plot_qc_per_sample RuntimeAttr? runtime_override_plot_qc_per_family RuntimeAttr? runtime_override_per_sample_benchmark_plot + RuntimeAttr? runtime_override_identify_duplicates + RuntimeAttr? runtime_override_merge_duplicates RuntimeAttr? 
runtime_override_sanitize_outputs # overrides for MiniTasks or Utils @@ -275,6 +277,25 @@ workflow MainVcfQc { } } + # Identify all duplicates + call IdentifyDuplicates { + input: + prefix=prefix, + vcfs=vcfs_for_qc, + sv_pipeline_qc_docker=sv_pipeline_qc_docker, + runtime_attr_override=runtime_override_identify_duplicates + } + + # Merge duplicates + call MergeDuplicates { + input: + prefix=prefix, + tsvs=IdentifyDuplicates.duplicates, + tsvs_counts=IdentifyDuplicates.duplicates_counts, + sv_pipeline_qc_docker=sv_pipeline_qc_docker, + runtime_attr_override=runtime_override_merge_duplicates + } + # Sanitize all outputs call SanitizeOutputs { input: @@ -296,6 +317,8 @@ workflow MainVcfQc { output { File sv_vcf_qc_output = SanitizeOutputs.vcf_qc_tarball File vcf2bed_output = MergeVcf2Bed.merged_bed_file + File duplicates_output = MergeDuplicates.duplicates + File duplicates_counts_output = MergeDuplicates.duplicates_counts } } @@ -858,3 +881,102 @@ task SanitizeOutputs { } } + +# Identify all duplicates +task IdentifyDuplicates { + input { + String prefix + Array[File] vcfs + String sv_pipeline_qc_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: 5 + ceil(size(vcfs, "GiB")), + cpu_cores: 1, + preemptible_tries: 1, + max_retries: 1, + boot_disk_gb: 10 + } + + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_qc_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + for vcf in ~{sep=' ' vcfs}; do + vcf_name=$(basename "$vcf" .vcf.gz) + fout_name="~{prefix}_duplicate_${vcf_name}" + + echo "Processing $vcf..." + python /opt/sv-pipeline/scripts/identify_duplicates.py \ + --vcf "$vcf" \ + --fout "$fout_name" + done + + echo "All VCFs processed." + >>> + + output { + Array[File] duplicates = glob("~{prefix}_duplicate_records_*.tsv") + Array[File] duplicates_counts = glob("~{prefix}_duplicate_counts_*.tsv") + } +} + + +# Aggregate distinct duplicate summary files +task MergeDuplicates { + input { + String prefix + Array[File] tsvs + Array[File] tsvs_counts + String sv_pipeline_qc_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: 5 + ceil(size(tsvs, "GiB")) + ceil(size(tsvs_counts, "GiB")), + cpu_cores: 1, + preemptible_tries: 1, + max_retries: 1, + boot_disk_gb: 10 + } + + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_qc_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + python /opt/sv-pipeline/scripts/merge_duplicates.py \ + --records ~{sep=' ' tsvs} \ + --counts ~{sep=' ' tsvs_counts} \ + --fout "~{prefix}_agg_duplicate" + + echo "All TSVs processed." + >>> + + output { + File duplicates = "~{prefix}_agg_duplicates.tsv" + File duplicates_counts = "~{prefix}_agg_duplicates_counts.tsv" + } +} From 239d7f923aa9c36e3248d11959dacd7b35ee9b4a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 6 Aug 2024 15:28:34 -0400 Subject: [PATCH 02/29] Added branch name to dockstore file --- .github/.dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index a934e932b..6f3d07451 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -150,6 +150,7 @@ workflows: filters: branches: - main + - kj/701_vcf_qc_duplicates tags: - /.*/ From aa921e2efaa6ebac1538d7397a4145a054598d1b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 6 Aug 2024 15:32:23 -0400 Subject: [PATCH 03/29] Updated script definitions --- src/sv-pipeline/scripts/identify_duplicates.py | 2 +- src/sv-pipeline/scripts/merge_duplicates.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index e92d7a3a4..d8f9588b3 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Identify and classify duplicated variants from a sorted input VCF +Identify and classify duplicated variants from an input VCF. """ from typing import List, Text, Optional diff --git a/src/sv-pipeline/scripts/merge_duplicates.py b/src/sv-pipeline/scripts/merge_duplicates.py index 86b9dbf6e..29f66144d 100644 --- a/src/sv-pipeline/scripts/merge_duplicates.py +++ b/src/sv-pipeline/scripts/merge_duplicates.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- """ -Merge duplicate records and counts files across multiple TSVs +Merge duplicate records and counts files across multiple TSVs. """ from typing import List, Text, Optional From 2317dd7a0bd67dbc09cd8fe714836f20ef825150 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 6 Aug 2024 15:53:51 -0400 Subject: [PATCH 04/29] Updated WDL to use custom script --- wdl/MainVcfQc.wdl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 7b10372be..6b95952e7 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -17,6 +17,8 @@ workflow MainVcfQc { File? ped_file File? list_of_samples_to_include File? 
sample_renaming_tsv # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort + File? identify_duplicates_custom + File? merge_duplicates_custom Int max_trios = 1000 String prefix Int sv_per_shard @@ -283,6 +285,7 @@ workflow MainVcfQc { prefix=prefix, vcfs=vcfs_for_qc, sv_pipeline_qc_docker=sv_pipeline_qc_docker, + custom_script=identify_duplicates_custom, runtime_attr_override=runtime_override_identify_duplicates } @@ -293,6 +296,7 @@ workflow MainVcfQc { tsvs=IdentifyDuplicates.duplicates, tsvs_counts=IdentifyDuplicates.duplicates_counts, sv_pipeline_qc_docker=sv_pipeline_qc_docker, + custom_script=merge_duplicates_custom, runtime_attr_override=runtime_override_merge_duplicates } @@ -888,6 +892,7 @@ task IdentifyDuplicates { String prefix Array[File] vcfs String sv_pipeline_qc_docker + File? custom_script RuntimeAttr? runtime_attr_override } @@ -914,12 +919,14 @@ task IdentifyDuplicates { command <<< set -euo pipefail + SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/identify_duplicates.py" custom_script}" + for vcf in ~{sep=' ' vcfs}; do vcf_name=$(basename "$vcf" .vcf.gz) fout_name="~{prefix}_duplicate_${vcf_name}" echo "Processing $vcf..." - python /opt/sv-pipeline/scripts/identify_duplicates.py \ + python "$SCRIPT_PATH" \ --vcf "$vcf" \ --fout "$fout_name" done @@ -941,6 +948,7 @@ task MergeDuplicates { Array[File] tsvs Array[File] tsvs_counts String sv_pipeline_qc_docker + File? custom_script RuntimeAttr? runtime_attr_override } @@ -967,7 +975,9 @@ task MergeDuplicates { command <<< set -euo pipefail - python /opt/sv-pipeline/scripts/merge_duplicates.py \ + SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/merge_duplicates.py" custom_script}" + + python "$SCRIPT_PATH" \ --records ~{sep=' ' tsvs} \ --counts ~{sep=' ' tsvs_counts} \ --fout "~{prefix}_agg_duplicate" From bf4d7bcaa531a18e37f39b6935b24f3a2b4042e2 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 09:35:13 -0400 Subject: [PATCH 05/29] Modified output file paths --- .../scripts/identify_duplicates.py | 2 +- src/sv-pipeline/scripts/merge_duplicates.py | 4 +-- wdl/MainVcfQc.wdl | 34 +++++++++++-------- 3 files changed, 22 insertions(+), 18 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index d8f9588b3..56cd226a9 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -23,7 +23,7 @@ def process_duplicates(vcf, fout): current_pos = None # Create output files - with open(f"{fout}_records.tsv", 'w') as f_records, open(f"{fout}_counts.tsv", 'w') as f_counts: + with open(f"{fout}_duplicate_records.tsv", 'w') as f_records, open(f"{fout}_duplicate_counts.tsv", 'w') as f_counts: f_records.write("TYPE\tDUP_RECORDS\n") f_counts.write("TYPE\tDUP_COUNTS\n") diff --git a/src/sv-pipeline/scripts/merge_duplicates.py b/src/sv-pipeline/scripts/merge_duplicates.py index 29f66144d..3d9a71ce1 100644 --- a/src/sv-pipeline/scripts/merge_duplicates.py +++ b/src/sv-pipeline/scripts/merge_duplicates.py @@ -15,7 +15,7 @@ def merge_duplicates(record_files: List[str], count_files: List[str], fout: str): # Merge records - with open(f"{fout}_records.tsv", 'w', newline='') as out_records: + with open(f"{fout}_duplicate_records.tsv", 'w', newline='') as out_records: writer = csv.writer(out_records, delimiter='\t') # Write header @@ -39,7 +39,7 @@ def merge_duplicates(record_files: List[str], count_files: List[str], fout: str) counts[row[0]] += int(row[1]) # Merge counts - 
with open(f"{fout}_counts.tsv", 'w', newline='') as out_counts: + with open(f"{fout}_duplicate_counts.tsv", 'w', newline='') as out_counts: writer = csv.writer(out_counts, delimiter='\t') # Write header diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 6b95952e7..97b092bb2 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -293,8 +293,8 @@ workflow MainVcfQc { call MergeDuplicates { input: prefix=prefix, - tsvs=IdentifyDuplicates.duplicates, - tsvs_counts=IdentifyDuplicates.duplicates_counts, + tsv_records=IdentifyDuplicates.duplicate_records, + tsv_counts=IdentifyDuplicates.duplicate_counts, sv_pipeline_qc_docker=sv_pipeline_qc_docker, custom_script=merge_duplicates_custom, runtime_attr_override=runtime_override_merge_duplicates @@ -321,8 +321,8 @@ workflow MainVcfQc { output { File sv_vcf_qc_output = SanitizeOutputs.vcf_qc_tarball File vcf2bed_output = MergeVcf2Bed.merged_bed_file - File duplicates_output = MergeDuplicates.duplicates - File duplicates_counts_output = MergeDuplicates.duplicates_counts + File duplicate_records_output = MergeDuplicates.duplicate_records + File duplicate_counts_output = MergeDuplicates.duplicate_counts } } @@ -923,7 +923,7 @@ task IdentifyDuplicates { for vcf in ~{sep=' ' vcfs}; do vcf_name=$(basename "$vcf" .vcf.gz) - fout_name="~{prefix}_duplicate_${vcf_name}" + fout_name="~{prefix}_${vcf_name}" echo "Processing $vcf..." python "$SCRIPT_PATH" \ @@ -931,12 +931,16 @@ task IdentifyDuplicates { --fout "$fout_name" done + echo "Listing generated files:" + ls -l ~{prefix}_*_duplicate_records.tsv + ls -l ~{prefix}_*_duplicate_counts.tsv + echo "All VCFs processed." >>> output { - Array[File] duplicates = glob("~{prefix}_duplicate_records_*.tsv") - Array[File] duplicates_counts = glob("~{prefix}_duplicate_counts_*.tsv") + Array[File] duplicate_records = glob("~{prefix}_*_duplicate_records.tsv") + Array[File] duplicate_counts = glob("~{prefix}_*_duplicate_counts.tsv") } } @@ -945,8 +949,8 @@ task IdentifyDuplicates { task MergeDuplicates { input { String prefix - Array[File] tsvs - Array[File] tsvs_counts + Array[File] tsv_records + Array[File] tsv_counts String sv_pipeline_qc_docker File? custom_script RuntimeAttr? runtime_attr_override @@ -954,7 +958,7 @@ task MergeDuplicates { RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: 5 + ceil(size(tsvs, "GiB")) + ceil(size(tsvs_counts, "GiB")), + disk_gb: 5 + ceil(size(tsv_records, "GiB")) + ceil(size(tsv_counts, "GiB")), cpu_cores: 1, preemptible_tries: 1, max_retries: 1, @@ -978,15 +982,15 @@ task MergeDuplicates { SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/merge_duplicates.py" custom_script}" python "$SCRIPT_PATH" \ - --records ~{sep=' ' tsvs} \ - --counts ~{sep=' ' tsvs_counts} \ - --fout "~{prefix}_agg_duplicate" + --records ~{sep=' ' tsv_records} \ + --counts ~{sep=' ' tsv_counts} \ + --fout "~{prefix}_agg" echo "All TSVs processed." 
>>> output { - File duplicates = "~{prefix}_agg_duplicates.tsv" - File duplicates_counts = "~{prefix}_agg_duplicates_counts.tsv" + File duplicate_records = "~{prefix}_agg_duplicate_records.tsv" + File duplicate_counts = "~{prefix}_agg_duplicates_counts.tsv" } } From f734014b21fe99933c4a635bf8e947bbf7983003 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 10:27:02 -0400 Subject: [PATCH 06/29] Added more logging --- src/sv-pipeline/scripts/identify_duplicates.py | 1 + wdl/MainVcfQc.wdl | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index 56cd226a9..fb63bcfae 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -66,6 +66,7 @@ def process_duplicates(vcf, fout): # Write counts to file for match_type in sorted(counts.keys()): f_counts.write(f"{match_type}\t{counts[match_type]}\n") + print(f"{match_type}: {counts[match_type]}") def process_buffers(exact_buffer, ins_buffer, counts, f_records): diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 97b092bb2..1c6494fed 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -931,9 +931,13 @@ task IdentifyDuplicates { --fout "$fout_name" done + echo "Listing working directory:" + pwd + echo "Listing files in working directory:" + ls echo "Listing generated files:" - ls -l ~{prefix}_*_duplicate_records.tsv - ls -l ~{prefix}_*_duplicate_counts.tsv + ls -l "~{prefix}_*_duplicate_records.tsv" + ls -l "~{prefix}_*_duplicate_counts.tsv" echo "All VCFs processed." >>> @@ -991,6 +995,6 @@ task MergeDuplicates { output { File duplicate_records = "~{prefix}_agg_duplicate_records.tsv" - File duplicate_counts = "~{prefix}_agg_duplicates_counts.tsv" + File duplicate_counts = "~{prefix}_agg_duplicate_counts.tsv" } } From e7e9ddcc8a70a88959de4f6d7d9a558dd2abda25 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 10:32:40 -0400 Subject: [PATCH 07/29] Used dot notation --- wdl/MainVcfQc.wdl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 1c6494fed..021597732 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -923,7 +923,7 @@ task IdentifyDuplicates { for vcf in ~{sep=' ' vcfs}; do vcf_name=$(basename "$vcf" .vcf.gz) - fout_name="~{prefix}_${vcf_name}" + fout_name="~{prefix}.${vcf_name}" echo "Processing $vcf..." python "$SCRIPT_PATH" \ @@ -936,15 +936,15 @@ task IdentifyDuplicates { echo "Listing files in working directory:" ls echo "Listing generated files:" - ls -l "~{prefix}_*_duplicate_records.tsv" - ls -l "~{prefix}_*_duplicate_counts.tsv" + ls -l "~{prefix}.*_duplicate_records.tsv" + ls -l "~{prefix}.*_duplicate_counts.tsv" echo "All VCFs processed." >>> output { - Array[File] duplicate_records = glob("~{prefix}_*_duplicate_records.tsv") - Array[File] duplicate_counts = glob("~{prefix}_*_duplicate_counts.tsv") + Array[File] duplicate_records = glob("~{prefix}.*_duplicate_records.tsv") + Array[File] duplicate_counts = glob("~{prefix}.*_duplicate_counts.tsv") } } @@ -988,13 +988,13 @@ task MergeDuplicates { python "$SCRIPT_PATH" \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ - --fout "~{prefix}_agg" + --fout "~{prefix}.agg" echo "All TSVs processed." 
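    # Both outputs are two-column TSVs. An illustrative ~{prefix}.agg_duplicate_counts.tsv
    # (the counts here are made up; the TYPE categories come from identify_duplicates.py):
    #   TYPE                DUP_COUNTS
    #   Exact               12
    #   INS 100%            3
    #   INS 50%             7
    #   INS ALT Identical   5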
>>> output { - File duplicate_records = "~{prefix}_agg_duplicate_records.tsv" - File duplicate_counts = "~{prefix}_agg_duplicate_counts.tsv" + File duplicate_records = "~{prefix}.agg_duplicate_records.tsv" + File duplicate_counts = "~{prefix}.agg_duplicate_counts.tsv" } } From 8d98cd1073b6642c1edecfb884bde8a9fc28e2d5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 11:18:42 -0400 Subject: [PATCH 08/29] Minor logging changes --- src/sv-pipeline/scripts/identify_duplicates.py | 1 - wdl/MainVcfQc.wdl | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index fb63bcfae..56cd226a9 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -66,7 +66,6 @@ def process_duplicates(vcf, fout): # Write counts to file for match_type in sorted(counts.keys()): f_counts.write(f"{match_type}\t{counts[match_type]}\n") - print(f"{match_type}: {counts[match_type]}") def process_buffers(exact_buffer, ins_buffer, counts, f_records): diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 021597732..a984c7b78 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -930,11 +930,7 @@ task IdentifyDuplicates { --vcf "$vcf" \ --fout "$fout_name" done - - echo "Listing working directory:" - pwd - echo "Listing files in working directory:" - ls + echo "Listing generated files:" ls -l "~{prefix}.*_duplicate_records.tsv" ls -l "~{prefix}.*_duplicate_counts.tsv" From cfe266e7e485713d88107e0a2e1b96463ca2221c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 14:26:11 -0400 Subject: [PATCH 09/29] Removed glob() references --- wdl/MainVcfQc.wdl | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index a984c7b78..095ecdb47 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -280,13 +280,15 @@ workflow MainVcfQc { } # Identify all duplicates - call IdentifyDuplicates { - input: - prefix=prefix, - vcfs=vcfs_for_qc, - sv_pipeline_qc_docker=sv_pipeline_qc_docker, - custom_script=identify_duplicates_custom, - runtime_attr_override=runtime_override_identify_duplicates + scatter(vcf in vcfs_for_qc) { + call IdentifyDuplicates { + input: + prefix=prefix, + vcf=vcf, + sv_pipeline_qc_docker=sv_pipeline_qc_docker, + custom_script=identify_duplicates_custom, + runtime_attr_override=runtime_override_identify_duplicates + } } # Merge duplicates @@ -890,7 +892,7 @@ task SanitizeOutputs { task IdentifyDuplicates { input { String prefix - Array[File] vcfs + File vcf String sv_pipeline_qc_docker File? custom_script RuntimeAttr? runtime_attr_override @@ -898,7 +900,7 @@ task IdentifyDuplicates { RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: 5 + ceil(size(vcfs, "GiB")), + disk_gb: 2 + ceil(size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 1, max_retries: 1, @@ -921,26 +923,21 @@ task IdentifyDuplicates { SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/identify_duplicates.py" custom_script}" - for vcf in ~{sep=' ' vcfs}; do - vcf_name=$(basename "$vcf" .vcf.gz) - fout_name="~{prefix}.${vcf_name}" + vcf_name=$(basename "$vcf" .vcf.gz) + fout_name="~{prefix}.${vcf_name}" - echo "Processing $vcf..." 
- python "$SCRIPT_PATH" \ - --vcf "$vcf" \ - --fout "$fout_name" - done - - echo "Listing generated files:" - ls -l "~{prefix}.*_duplicate_records.tsv" - ls -l "~{prefix}.*_duplicate_counts.tsv" + echo "Processing ~{vcf} into ${fout_name}..." + + python "$SCRIPT_PATH" \ + --vcf "$vcf" \ + --fout "$fout_name" - echo "All VCFs processed." + echo "Finishing processing VCF." >>> output { - Array[File] duplicate_records = glob("~{prefix}.*_duplicate_records.tsv") - Array[File] duplicate_counts = glob("~{prefix}.*_duplicate_counts.tsv") + File duplicate_records = "~{prefix}.${basename(vcf, '.vcf.gz')}_duplicated_records.tsv" + File duplicate_counts = "~{prefix}.${basename(vcf, '.vcf.gz')}_duplicated_counts.tsv" } } From 9d4fcce18dd50595c1a1577bdb5a74bd5ac89853 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 15:15:31 -0400 Subject: [PATCH 10/29] Standardized variable input structure --- wdl/MainVcfQc.wdl | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 095ecdb47..71742c920 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -898,6 +898,12 @@ task IdentifyDuplicates { RuntimeAttr? runtime_attr_override } + File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File active_script = select_first([custom_script, default_script]) + + String vcf_basename = basename(vcf, ".vcf.gz") + String full_prefix = "~{prefix}.~{vcf_basename}" + RuntimeAttr runtime_default = object { mem_gb: 3.75, disk_gb: 2 + ceil(size(vcf, "GiB")), @@ -921,23 +927,18 @@ task IdentifyDuplicates { command <<< set -euo pipefail - SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/identify_duplicates.py" custom_script}" - - vcf_name=$(basename "$vcf" .vcf.gz) - fout_name="~{prefix}.${vcf_name}" + echo "Processing ~{vcf} into ~{full_prefix}..." - echo "Processing ~{vcf} into ${fout_name}..." - - python "$SCRIPT_PATH" \ - --vcf "$vcf" \ - --fout "$fout_name" + python ~{active_script} \ + --vcf ~{vcf} \ + --fout ~{full_prefix} echo "Finishing processing VCF." >>> output { - File duplicate_records = "~{prefix}.${basename(vcf, '.vcf.gz')}_duplicated_records.tsv" - File duplicate_counts = "~{prefix}.${basename(vcf, '.vcf.gz')}_duplicated_counts.tsv" + File duplicate_records = "~{full_prefix}_duplicated_records.tsv" + File duplicate_counts = "~{full_prefix}_duplicated_counts.tsv" } } @@ -953,6 +954,9 @@ task MergeDuplicates { RuntimeAttr? 
runtime_attr_override } + File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File active_script = select_first([custom_script, default_script]) + RuntimeAttr runtime_default = object { mem_gb: 3.75, disk_gb: 5 + ceil(size(tsv_records, "GiB")) + ceil(size(tsv_counts, "GiB")), @@ -976,9 +980,7 @@ task MergeDuplicates { command <<< set -euo pipefail - SCRIPT_PATH="${default="/opt/sv-pipeline/scripts/merge_duplicates.py" custom_script}" - - python "$SCRIPT_PATH" \ + python ~{active_script} \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" From 4ff559c5c15b1fed75840bbe101f133dee3c92f8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 15:52:21 -0400 Subject: [PATCH 11/29] Removed direct python script call --- wdl/MainVcfQc.wdl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 71742c920..1db8d10c3 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -291,6 +291,7 @@ workflow MainVcfQc { } } + # Merge duplicates call MergeDuplicates { input: @@ -377,7 +378,6 @@ task PlotQcVcfWide { } } - # Task to merge VID lists across shards task TarShardVidLists { input { @@ -888,7 +888,7 @@ task SanitizeOutputs { } -# Identify all duplicates +# Identify all duplicates in a single file task IdentifyDuplicates { input { String prefix @@ -898,8 +898,8 @@ task IdentifyDuplicates { RuntimeAttr? runtime_attr_override } - File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" - File active_script = select_first([custom_script, default_script]) + # File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + # File active_script = select_first([custom_script, default_script]) String vcf_basename = basename(vcf, ".vcf.gz") String full_prefix = "~{prefix}.~{vcf_basename}" @@ -929,7 +929,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." - python ~{active_script} \ + python ~{custom_script} \ --vcf ~{vcf} \ --fout ~{full_prefix} @@ -954,8 +954,8 @@ task MergeDuplicates { RuntimeAttr? 
runtime_attr_override } - File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" - File active_script = select_first([custom_script, default_script]) + # File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + # File active_script = select_first([custom_script, default_script]) RuntimeAttr runtime_default = object { mem_gb: 3.75, @@ -980,7 +980,7 @@ task MergeDuplicates { command <<< set -euo pipefail - python ~{active_script} \ + python ~{custom_script} \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" From b72e9d6ba2278250d265c74bad91b53d5e69160f Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 16:14:44 -0400 Subject: [PATCH 12/29] Renamed output files to be duplicate instead of duplicated --- wdl/MainVcfQc.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 1db8d10c3..776403f6e 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -291,7 +291,6 @@ workflow MainVcfQc { } } - # Merge duplicates call MergeDuplicates { input: @@ -937,8 +936,8 @@ task IdentifyDuplicates { >>> output { - File duplicate_records = "~{full_prefix}_duplicated_records.tsv" - File duplicate_counts = "~{full_prefix}_duplicated_counts.tsv" + File duplicate_records = "~{full_prefix}_duplicate_records.tsv" + File duplicate_counts = "~{full_prefix}_duplicate_counts.tsv" } } From 2b4c476585cf4bb195f9da9840244ce70edd4d4c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 16:37:01 -0400 Subject: [PATCH 13/29] Removed call to MergeDuplicates --- wdl/MainVcfQc.wdl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 776403f6e..d350e7b7b 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -292,15 +292,15 @@ workflow MainVcfQc { } # Merge duplicates - call MergeDuplicates { - input: - prefix=prefix, - tsv_records=IdentifyDuplicates.duplicate_records, - tsv_counts=IdentifyDuplicates.duplicate_counts, - sv_pipeline_qc_docker=sv_pipeline_qc_docker, - custom_script=merge_duplicates_custom, - runtime_attr_override=runtime_override_merge_duplicates - } + # call MergeDuplicates { + # input: + # prefix=prefix, + # tsv_records=IdentifyDuplicates.duplicate_records, + # tsv_counts=IdentifyDuplicates.duplicate_counts, + # sv_pipeline_qc_docker=sv_pipeline_qc_docker, + # custom_script=merge_duplicates_custom, + # runtime_attr_override=runtime_override_merge_duplicates + # } # Sanitize all outputs call SanitizeOutputs { @@ -323,8 +323,10 @@ workflow MainVcfQc { output { File sv_vcf_qc_output = SanitizeOutputs.vcf_qc_tarball File vcf2bed_output = MergeVcf2Bed.merged_bed_file - File duplicate_records_output = MergeDuplicates.duplicate_records - File duplicate_counts_output = MergeDuplicates.duplicate_counts + Array[File] duplicate_records_output = IdentifyDuplicates.duplicate_records + Array[File] duplicate_counts_output = IdentifyDuplicates.duplicate_counts + # File duplicate_records_output = MergeDuplicates.duplicate_records + # File duplicate_counts_output = MergeDuplicates.duplicate_counts } } From ae2f33ebe424bc37b77701bd253df30ae0af0f4d Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 7 Aug 2024 17:04:19 -0400 Subject: [PATCH 14/29] Undo commenting out of MergeDuplicates --- wdl/MainVcfQc.wdl | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index d350e7b7b..776403f6e 100644 --- 
a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -292,15 +292,15 @@ workflow MainVcfQc { } # Merge duplicates - # call MergeDuplicates { - # input: - # prefix=prefix, - # tsv_records=IdentifyDuplicates.duplicate_records, - # tsv_counts=IdentifyDuplicates.duplicate_counts, - # sv_pipeline_qc_docker=sv_pipeline_qc_docker, - # custom_script=merge_duplicates_custom, - # runtime_attr_override=runtime_override_merge_duplicates - # } + call MergeDuplicates { + input: + prefix=prefix, + tsv_records=IdentifyDuplicates.duplicate_records, + tsv_counts=IdentifyDuplicates.duplicate_counts, + sv_pipeline_qc_docker=sv_pipeline_qc_docker, + custom_script=merge_duplicates_custom, + runtime_attr_override=runtime_override_merge_duplicates + } # Sanitize all outputs call SanitizeOutputs { @@ -323,10 +323,8 @@ workflow MainVcfQc { output { File sv_vcf_qc_output = SanitizeOutputs.vcf_qc_tarball File vcf2bed_output = MergeVcf2Bed.merged_bed_file - Array[File] duplicate_records_output = IdentifyDuplicates.duplicate_records - Array[File] duplicate_counts_output = IdentifyDuplicates.duplicate_counts - # File duplicate_records_output = MergeDuplicates.duplicate_records - # File duplicate_counts_output = MergeDuplicates.duplicate_counts + File duplicate_records_output = MergeDuplicates.duplicate_records + File duplicate_counts_output = MergeDuplicates.duplicate_counts } } From 6a270e5947b7484ad984a71e8930029a12d8c1a3 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 8 Aug 2024 09:25:36 -0400 Subject: [PATCH 15/29] Removed direct reference to vcfs_for_qc --- wdl/MainVcfQc.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 776403f6e..d6b87dcf7 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -280,7 +280,7 @@ workflow MainVcfQc { } # Identify all duplicates - scatter(vcf in vcfs_for_qc) { + scatter(vcf in select_first([SubsetVcfBySamplesList.vcf_subset, vcfs])) { call IdentifyDuplicates { input: prefix=prefix, @@ -979,6 +979,10 @@ task MergeDuplicates { command <<< set -euo pipefail + echo "Merging all TSV files into one..." 
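+    # Note: ~{sep=', ' tsv_records} joins the input array with commas for readable logging,
+    # while the space-separated ~{sep=' ' tsv_records} form below is what is passed to the script.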
+ echo "Records: ~{sep=', ' tsv_records}" + echo "Counts: ~{sep=', ' tsv_counts}" + python ~{custom_script} \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ From 0f5a1b6fab4fce9ef6d376b67e32bdce7cea572b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 8 Aug 2024 14:58:08 -0400 Subject: [PATCH 16/29] Minor changes --- wdl/MainVcfQc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index d6b87dcf7..d4a710ccf 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -280,7 +280,7 @@ workflow MainVcfQc { } # Identify all duplicates - scatter(vcf in select_first([SubsetVcfBySamplesList.vcf_subset, vcfs])) { + scatter(vcf in vcfs_for_qc) { call IdentifyDuplicates { input: prefix=prefix, @@ -975,7 +975,7 @@ task MergeDuplicates { docker: sv_pipeline_qc_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - + command <<< set -euo pipefail From 899ff79ee62ca868c06143f7bc5034980fd43de8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 9 Aug 2024 09:16:44 -0400 Subject: [PATCH 17/29] Added option for both default and custom script --- wdl/MainVcfQc.wdl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index d4a710ccf..b3795405f 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -897,8 +897,8 @@ task IdentifyDuplicates { RuntimeAttr? runtime_attr_override } - # File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" - # File active_script = select_first([custom_script, default_script]) + File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File active_script = select_first([custom_script, default_script]) String vcf_basename = basename(vcf, ".vcf.gz") String full_prefix = "~{prefix}.~{vcf_basename}" @@ -928,7 +928,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." - python ~{custom_script} \ + python ~{active_script} \ --vcf ~{vcf} \ --fout ~{full_prefix} @@ -953,8 +953,8 @@ task MergeDuplicates { RuntimeAttr? runtime_attr_override } - # File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" - # File active_script = select_first([custom_script, default_script]) + File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File active_script = select_first([custom_script, default_script]) RuntimeAttr runtime_default = object { mem_gb: 3.75, @@ -980,10 +980,8 @@ task MergeDuplicates { set -euo pipefail echo "Merging all TSV files into one..." - echo "Records: ~{sep=', ' tsv_records}" - echo "Counts: ~{sep=', ' tsv_counts}" - python ~{custom_script} \ + python ~{active_script} \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" From f9af76719aa1cd6f28caafc29eb405c16240af42 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 9 Aug 2024 09:49:26 -0400 Subject: [PATCH 18/29] Modified path to python scripts --- wdl/MainVcfQc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index b3795405f..b7c64ce3b 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -897,7 +897,7 @@ task IdentifyDuplicates { RuntimeAttr? 
runtime_attr_override } - File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" File active_script = select_first([custom_script, default_script]) String vcf_basename = basename(vcf, ".vcf.gz") @@ -953,7 +953,7 @@ task MergeDuplicates { RuntimeAttr? runtime_attr_override } - File default_script = "/opt/sv-pipeline/scripts/merge_duplicates.py" + File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" File active_script = select_first([custom_script, default_script]) RuntimeAttr runtime_default = object { From e9e5872d1cf4b419e6be9d1e9e49038b7a3891b4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 9 Aug 2024 10:07:26 -0400 Subject: [PATCH 19/29] Removed unused import --- src/sv-pipeline/scripts/identify_duplicates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index 56cd226a9..3d1c4fba4 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -7,7 +7,6 @@ from typing import List, Text, Optional from collections import defaultdict -from itertools import combinations import argparse import sys From 37e20c28cf731660d03ff24f290272388b9e71cb Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 9 Aug 2024 10:11:33 -0400 Subject: [PATCH 20/29] Resoled all flake8 linting errors --- .../scripts/identify_duplicates.py | 20 +++++++++---------- src/sv-pipeline/scripts/merge_duplicates.py | 6 ++---- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index 3d1c4fba4..db4276edd 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -38,14 +38,14 @@ def process_duplicates(vcf, fout): # Update buffers with new record exact_key = ( - record.chrom, - record.pos, + record.chrom, + record.pos, record.stop, - record.info.get('SVTYPE'), + record.info.get('SVTYPE'), record.info.get('SVLEN'), - record.info.get('CHR2'), + record.info.get('CHR2'), record.info.get('END2'), - record.info.get('STRANDS'), + record.info.get('STRANDS'), record.info.get('CPX_TYPE'), record.info.get('CPX_INTERVALS') ) @@ -53,8 +53,8 @@ def process_duplicates(vcf, fout): if record.info.get('SVTYPE') == 'INS': insert_key = ( - record.id, - record.info.get('SVLEN'), + record.id, + record.info.get('SVLEN'), record.alts[0] ) ins_buffer.append(insert_key) @@ -80,9 +80,9 @@ def process_buffers(exact_buffer, ins_buffer, counts, f_records): # Process INS matches for i in range(len(ins_buffer)): - for j in range(i+1, len(ins_buffer)): + for j in range(i + 1, len(ins_buffer)): rec1, rec2 = ins_buffer[i], ins_buffer[j] - + # Size comparison if rec1[1] == rec2[1]: counts['INS 100%'] += 1 @@ -93,7 +93,7 @@ def process_buffers(exact_buffer, ins_buffer, counts, f_records): else: counts['INS 0%'] += 1 f_records.write(f"INS 0%\t{rec1[0]},{rec2[0]}\n") - + # ALT comparison if rec1[2] == rec2[2]: counts['INS ALT Identical'] += 1 diff --git a/src/sv-pipeline/scripts/merge_duplicates.py b/src/sv-pipeline/scripts/merge_duplicates.py index 3d9a71ce1..fda0a84a4 100644 --- a/src/sv-pipeline/scripts/merge_duplicates.py +++ b/src/sv-pipeline/scripts/merge_duplicates.py @@ -17,15 +17,13 @@ def merge_duplicates(record_files: List[str], count_files: List[str], fout: str) # Merge records with open(f"{fout}_duplicate_records.tsv", 'w', 
newline='') as out_records: writer = csv.writer(out_records, delimiter='\t') - # Write header writer.writerow(['TYPE', 'DUP_RECORDS']) - # Append records from each file for record_file in record_files: with open(record_file, 'r') as f: reader = csv.reader(f, delimiter='\t') - next(reader) + next(reader) for row in reader: writer.writerow(row) @@ -48,7 +46,7 @@ def merge_duplicates(record_files: List[str], count_files: List[str], fout: str) # Append each row from merged counts for category, count in counts.items(): writer.writerow([category, count]) - + def _parse_arguments(argv: List[Text]) -> argparse.Namespace: parser = argparse.ArgumentParser( From 89cd795340449fc92d72616fb090f6154ea801d0 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 9 Aug 2024 10:21:13 -0400 Subject: [PATCH 21/29] Rolled back to solely use custom scripts for time being --- wdl/MainVcfQc.wdl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index b7c64ce3b..81c5f5f0b 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -897,8 +897,8 @@ task IdentifyDuplicates { RuntimeAttr? runtime_attr_override } - File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" - File active_script = select_first([custom_script, default_script]) + # File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" + # File active_script = select_first([custom_script, default_script]) String vcf_basename = basename(vcf, ".vcf.gz") String full_prefix = "~{prefix}.~{vcf_basename}" @@ -928,7 +928,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." - python ~{active_script} \ + python ~{custom_script} \ --vcf ~{vcf} \ --fout ~{full_prefix} @@ -953,8 +953,8 @@ task MergeDuplicates { RuntimeAttr? runtime_attr_override } - File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" - File active_script = select_first([custom_script, default_script]) + # File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" + # File active_script = select_first([custom_script, default_script]) RuntimeAttr runtime_default = object { mem_gb: 3.75, @@ -981,7 +981,7 @@ task MergeDuplicates { echo "Merging all TSV files into one..." 
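    # With the in-image default script path commented out above, this task now runs only if
    # the optional custom_script file is actually supplied.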
- python ~{active_script} \ + python ~{custom_script} \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" From f6a3025d38cfc756264fe930051d64cf8db8e50a Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 25 Sep 2024 13:56:40 -0400 Subject: [PATCH 22/29] Update wdl/MainVcfQc.wdl Co-authored-by: Mark Walker --- wdl/MainVcfQc.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 81c5f5f0b..5f7ad2422 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -905,7 +905,7 @@ task IdentifyDuplicates { RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: 2 + ceil(size(vcf, "GiB")), + disk_gb: 10 + ceil(size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 1, max_retries: 1, From 8b5bd124ba274e23df98f26d0cbaa37ca254dded Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 25 Sep 2024 18:20:18 -0400 Subject: [PATCH 23/29] Updated files --- .github/.dockstore.yml | 9 + .github/workflows/testwdls.yaml | 5 +- README.md | 19 +- dockerfiles/sv-base/Dockerfile | 6 +- .../cohort_mode_workspace_dashboard.md.tmpl | 7 +- inputs/values/dockers.json | 10 +- inputs/values/dockers_azure.json | 10 +- inputs/values/google_cloud.json | 3 - scripts/cromwell/launch_wdl.sh | 11 -- scripts/inputs/build_default_inputs.sh | 21 +- scripts/inputs/build_inputs.py | 5 +- src/RdTest/RdTestV2.R | 2 +- .../scripts/SR_genotype.opt_part1.sh | 52 ++--- .../scripts/split_variants.py | 48 +++-- .../scripts/identify_duplicates.py | 34 ++-- .../scripts/make_evidence_qc_table.py | 4 +- .../scripts/vcf_qc/collectQC.vcf_wide.sh | 2 +- src/svtk/README.md | 9 +- src/svtk/svtk/pesr/pe_test.py | 3 +- src/svtk/svtk/pesr/sr_test.py | 5 +- wdl/BAFTestChromosome.wdl | 3 +- wdl/BatchEvidenceMerging.wdl | 2 +- wdl/GenotypeCpxCnvsPerBatch.wdl | 2 +- wdl/MainVcfQc.wdl | 16 +- wdl/MatrixQC.wdl | 6 +- wdl/PETestChromosome.wdl | 3 +- wdl/RDTestChromosome.wdl | 3 +- wdl/ResolveCpxSv.wdl | 3 +- wdl/SRTestChromosome.wdl | 3 +- wdl/SetSampleIdLegacy.wdl | 4 +- wdl/TasksGenotypeBatch.wdl | 7 +- wdl/TrainGCNV.wdl | 2 +- website/docs/advanced/build_inputs.md | 30 +-- website/docs/gs/quick_start.md | 3 +- website/docs/gs/runtime_env.md | 2 +- website/docs/modules/evidence_qc.md | 33 +++- website/docs/modules/gather_batch_evidence.md | 179 ++++++++++++++++-- .../docs/modules/gather_sample_evidence.md | 71 ++++++- website/docs/modules/index.md | 15 ++ website/docs/modules/train_gcnv.md | 133 +++++++++---- website/src/components/highlight.js | 25 +++ website/src/css/custom.css | 10 + 42 files changed, 555 insertions(+), 265 deletions(-) delete mode 100644 inputs/values/google_cloud.json create mode 100644 website/src/components/highlight.js diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 6f3d07451..3aa477636 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -162,3 +162,12 @@ workflows: - main tags: - /.*/ + + - subclass: WDL + name: SingleSamplePipeline + primaryDescriptorPath: /wdl/GATKSVPipelineSingleSample.wdl + filters: + branches: + - main + tags: + - /.*/ diff --git a/.github/workflows/testwdls.yaml b/.github/workflows/testwdls.yaml index 3f1fdf353..49d0d7f6d 100644 --- a/.github/workflows/testwdls.yaml +++ b/.github/workflows/testwdls.yaml @@ -54,10 +54,7 @@ jobs: # Setup for running womtool pip install jinja2==3.1.2 wget -O womtool.jar https://github.com/broadinstitute/cromwell/releases/download/84/womtool-84.jar - echo \ - '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": 
"my-terra-billing-project" }' \ - > inputs/values/google_cloud.my_project.json - scripts/inputs/build_default_inputs.sh -d . -c google_cloud.my_project + scripts/inputs/build_default_inputs.sh -d . - name: Test with Miniwdl run: | diff --git a/README.md b/README.md index 4ccce1c17..96d5f8356 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,7 @@ Sample IDs should not: The same requirements apply to family IDs in the PED file, as well as batch IDs and the cohort ID provided as workflow inputs. -Sample IDs are provided to [GatherSampleEvidence](#gather-sample-evidence) directly and need not match sample names from the BAM/CRAM headers. `GetSampleID.wdl` can be used to fetch BAM sample IDs and also generates a set of alternate IDs that are considered safe for this pipeline; alternatively, [this script](https://github.com/talkowski-lab/gnomad_sv_v3/blob/master/sample_id/convert_sample_ids.py) transforms a list of sample IDs to fit these requirements. Currently, sample IDs can be replaced again in [GatherBatchEvidence](#gather-batch-evidence). +Sample IDs are provided to [GatherSampleEvidence](#gather-sample-evidence) directly and need not match sample names from the BAM/CRAM headers. `GetSampleID.wdl` can be used to fetch BAM sample IDs and also generates a set of alternate IDs that are considered safe for this pipeline; alternatively, [this script](https://github.com/talkowski-lab/gnomad_sv_v3/blob/master/sample_id/convert_sample_ids.py) transforms a list of sample IDs to fit these requirements. Currently, sample IDs can be replaced again in [GatherBatchEvidence](#gather-batch-evidence) - to do so, set the parameter `rename_samples = True` and provide updated sample IDs via the `samples` parameter. The following inputs will need to be updated with the transformed sample IDs: * Sample ID list for [GatherSampleEvidence](#gather-sample-evidence) or [GatherBatchEvidence](#gather-batch-evidence) @@ -125,14 +125,6 @@ Example workflow inputs can be found in `/inputs`. Build using `scripts/inputs/b generates input jsons in `/inputs/build`. Except the MELT docker image, all required resources are available in public Google buckets. -Some workflows require a Google Cloud Project ID to be defined in a cloud environment parameter group. Workspace builds -require a Terra billing project ID as well. An example is provided at `/inputs/values/google_cloud.json` but should -not be used, as modifying this file will cause tracked changes in the repository. Instead, create a copy in the same -directory with the format `google_cloud.my_project.json` and modify as necessary. - -Note that these inputs are required only when certain data are located in requester pays buckets. If this does not -apply, users may use placeholder values for the cloud configuration and simply delete the inputs manually. - #### MELT **Important**: The example input files contain MELT inputs that are NOT public (see [Requirements](#requirements)). These include: @@ -150,8 +142,7 @@ We recommend running the pipeline on a dedicated [Cromwell](https://github.com/b > cp $GATK_SV_ROOT/wdl/*.wdl . > zip dep.zip *.wdl > cd .. 
-> echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json -> bash scripts/inputs/build_default_inputs.sh -d $GATK_SV_ROOT -c google_cloud.my_project +> bash scripts/inputs/build_default_inputs.sh -d $GATK_SV_ROOT > cp $GATK_SV_ROOT/inputs/build/ref_panel_1kg/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json GATKSVPipelineBatch.my_run.json > cromshell submit wdl/GATKSVPipelineBatch.wdl GATKSVPipelineBatch.my_run.json cromwell_config.json wdl/dep.zip ``` @@ -231,14 +222,12 @@ Here is an example of how to generate workflow input jsons from `GATKSVPipelineB --final-workflow-outputs-dir gs://my-outputs-bucket \ metadata.json \ > inputs/values/my_ref_panel.json -> # Define your google project id (for Cromwell inputs) and Terra billing project (for workspace inputs) -> echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json -> # Build test files for batched workflows (google cloud project id required) +> # Build test files for batched workflows > python scripts/inputs/build_inputs.py \ inputs/values \ inputs/templates/test \ inputs/build/my_ref_panel/test \ - -a '{ "test_batch" : "ref_panel_1kg", "cloud_env": "google_cloud.my_project" }' + -a '{ "test_batch" : "ref_panel_1kg" }' > # Build test files for the single-sample workflow > python scripts/inputs/build_inputs.py \ inputs/values \ diff --git a/dockerfiles/sv-base/Dockerfile b/dockerfiles/sv-base/Dockerfile index ac0608caf..bf1cdc19e 100644 --- a/dockerfiles/sv-base/Dockerfile +++ b/dockerfiles/sv-base/Dockerfile @@ -1,7 +1,7 @@ # This is the base dockerfile for the GATK SV pipeline that adds R, a few R packages, and GATK ARG SAMTOOLS_CLOUD_IMAGE=samtools-cloud:latest ARG VIRTUAL_ENV_IMAGE=sv-base-virtual-env:latest -ARG GATK_COMMIT="a33bf19dd3188af0af1bd17bce015eb20ba73227" +ARG GATK_COMMIT="64348bc9750ebf6cc473ecb8c1ced3fc66f05488" ARG GATK_JAR="/opt/gatk.jar" ARG R_INSTALL_PATH=/opt/R @@ -14,8 +14,8 @@ FROM $SAMTOOLS_CLOUD_IMAGE as samtools_cloud FROM $VIRTUAL_ENV_IMAGE as virtual_env_image RUN rm_unneeded_r_library_files.sh -ARG GATK_BUILD_DEP="git git-lfs openjdk-8-jdk" -ARG GATK_RUN_DEP="openjdk-8-jre-headless libgomp1" +ARG GATK_BUILD_DEP="git git-lfs openjdk-17-jdk" +ARG GATK_RUN_DEP="openjdk-17-jre-headless libgomp1" ARG GATK_COMMIT ARG GATK_JAR ARG DEBIAN_FRONTEND=noninteractive diff --git a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl index 7523f1ac1..b210e9b53 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/cohort_mode_workspace_dashboard.md.tmpl @@ -166,7 +166,7 @@ Read the full EvidenceQC documentation [here](https://github.com/broadinstitute/ Read the full TrainGCNV documentation [here](https://github.com/broadinstitute/gatk-sv#gcnv-training-1). * Before running this workflow, create the batches (~100-500 samples) you will use for the rest of the pipeline based on sample coverage, WGD score (from `02-EvidenceQC`), and PCR status. These will likely not be the same as the batches you used for `02-EvidenceQC`. -* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. 
If your `sample_set` contains fewer than 100 samples (not recommended), you will need to edit the `n_samples_subsample` parameter to be less than or equal to the number of samples. +* By default, `03-TrainGCNV` is configured to be run once per `sample_set` on 100 randomly-chosen samples from that set to create a gCNV model for each batch. To modify this behavior, you can set the `n_samples_subsample` parameter to the number of samples to use for training. #### 04-GatherBatchEvidence @@ -190,7 +190,10 @@ These two workflows make up FilterBatch; they are subdivided in this workspace t #### 09-MergeBatchSites Read the full MergeBatchSites documentation [here](https://github.com/broadinstitute/gatk-sv#merge-batch-sites). -* `09-MergeBatchSites` is a cohort-level workflow, so it is run on a `sample_set_set` containing all of the batches in the cohort. You can create this `sample_set_set` while you are launching the `09-MergeBatchSites` workflow: click "Select Data", choose "Create new sample_set_set [...]", check all the batches to include (all of the ones used in `03-TrainGCNV` through `08-FilterBatchSamples`), and give it a name that follows the **Sample ID requirements**. +* `09-MergeBatchSites` is a cohort-level workflow, so it is run on a `sample_set_set` containing all of the batches in the cohort. Navigate to the Data tab of your workspace. If there is no `sample_set_set` data table, you will need to create it. To do this, select the `sample_set` data table, then select (with the check boxes) all of the batches (`sample_set`) in your cohort. These should be the `sample_sets` that you used to run steps `03-TrainGCNV` through `08-FilterBatchSamples`. Then click the "Edit" icon above the table and choose "Save selection as set." Enter a name that follows the **Sample ID requirements**. This will create a new `sample_set_set` containing all of the `sample_sets` in your cohort. When you launch MergeBatchSites, you can now select this `sample_set_set`.

+ [screenshots: "selecting batches" and "creating a new set"]
+* If there is already a `sample_set_set` data table in your workspace, you can create this `sample_set_set` while you are launching the `09-MergeBatchSites` workflow: click "Select Data", choose "Create new sample_set_set [...]", check all the batches to include (all of the ones used in `03-TrainGCNV` through `08-FilterBatchSamples`), and give it a name that follows the **Sample ID requirements**.
creating a cohort sample_set_set diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index c841e1515..8bc4db59d 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -1,6 +1,6 @@ { "name": "dockers", - "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-06-04-v0.28.5-beta-a8dfecba", + "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2024-08-27-v0.29-beta-6b27c39f", "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker": "us.gcr.io/broad-dsde-methods/eph/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT", "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", @@ -10,10 +10,10 @@ "melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:a85c92f", "scramble_docker": "us.gcr.io/broad-dsde-methods/markw/scramble:mw-scramble-99af4c50", "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7", + "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-08-27-v0.29-beta-6b27c39f", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/markw/sv-utils:mw-train-genotype-filtering-a9479501", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-07-02-v0.28.5-beta-d9530265" + "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-09-25-v0.29-beta-f064b2d7" } \ No newline at end of file diff --git a/inputs/values/dockers_azure.json b/inputs/values/dockers_azure.json index 454718cf3..80fac2ca5 100644 --- a/inputs/values/dockers_azure.json +++ b/inputs/values/dockers_azure.json @@ -1,6 +1,6 @@ { "name": "dockers", - "cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2024-06-04-v0.28.5-beta-a8dfecba", + "cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2024-08-27-v0.29-beta-6b27c39f", "condense_counts_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker": "vahid.azurecr.io/gatk-sv/gatk:2024-07-02-4.6.0.0-1-g4af2b49e9-NIGHTLY-SNAPSHOT", "gatk_docker_pesr_override": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432", @@ -10,10 +10,10 @@ "melt_docker": "vahid.azurecr.io/melt:a85c92f", "scramble_docker": "vahid.azurecr.io/scramble:mw-scramble-99af4c50", "samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2024-01-24-v0.28.4-beta-9debd6d7", + "sv_base_docker": 
"vahid.azurecr.io/gatk-sv/sv-base:2024-08-27-v0.29-beta-6b27c39f", "sv_base_mini_docker": "vahid.azurecr.io/gatk-sv/sv-base-mini:2024-01-24-v0.28.4-beta-9debd6d7", - "sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265", - "sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-07-02-v0.28.5-beta-d9530265", + "sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7", + "sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2024-09-25-v0.29-beta-f064b2d7", "wham_docker": "vahid.azurecr.io/gatk-sv/wham:2024-01-24-v0.28.4-beta-9debd6d7", "igv_docker": "vahid.azurecr.io/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "vahid.azurecr.io/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "vahid.azurecr.io/gatk-sv/sv-utils:2024-01-24-v0.28.4-beta-9debd6d7", "gq_recalibrator_docker": "vahid.azurecr.io/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "vahid.azurecr.io/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-07-02-v0.28.5-beta-d9530265" + "denovo": "vahid.azurecr.io/gatk-sv/denovo:2024-09-25-v0.29-beta-f064b2d7" } \ No newline at end of file diff --git a/inputs/values/google_cloud.json b/inputs/values/google_cloud.json deleted file mode 100644 index 64603ae3e..000000000 --- a/inputs/values/google_cloud.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "terra_billing_project_id": "your-terra-billing-project-id-see-readme" -} diff --git a/scripts/cromwell/launch_wdl.sh b/scripts/cromwell/launch_wdl.sh index 0d67ab859..23aa8db4b 100755 --- a/scripts/cromwell/launch_wdl.sh +++ b/scripts/cromwell/launch_wdl.sh @@ -15,15 +15,6 @@ done WDL_FILENAME=$(basename "$WDL") WDL_NAME=${WDL_FILENAME%.*} -CLOUD_ENV="$GATK_SV_ROOT/inputs/values/google_cloud.my_project.json" -echo "CLOUD_ENV=$CLOUD_ENV" -cat << EOF > "$CLOUD_ENV" -{ - "google_project_id": "broad-dsde-methods", - "terra_billing_project_id": "broad-dsde-methods" -} -EOF - RUN_DIR="$GATK_SV_ROOT/runs/$WDL_NAME" DEPS_ZIP="$RUN_DIR/deps.zip" @@ -34,10 +25,8 @@ zip "$DEPS_ZIP" *.wdl &> /dev/null cd "$GATK_SV_ROOT" "$GATK_SV_ROOT/scripts/inputs/build_default_inputs.sh" \ -d "$GATK_SV_ROOT" \ - -c google_cloud.my_project \ > /dev/null -rm -f $CLOUD_ENV echo "Available input jsons:" printf "%d\t%s\n" 0 "none (skip cromwell submit)" diff --git a/scripts/inputs/build_default_inputs.sh b/scripts/inputs/build_default_inputs.sh index 6397de18a..b184f692a 100755 --- a/scripts/inputs/build_default_inputs.sh +++ b/scripts/inputs/build_default_inputs.sh @@ -2,9 +2,8 @@ function usage() { printf "Usage: \n \ - %s -d -c \n \ - \t path to gatk-sv base directory \n \ - \t name of cloud environment json (e.g. 'google_cloud.my' for inputs/values/google_cloud.my.json)" "$1" + %s -d \n \ + \t path to gatk-sv base directory" "$1" } if [[ "$#" == 0 ]]; then @@ -14,10 +13,9 @@ fi ################################################# # Parsing arguments ################################################# -while getopts "d:c:" option; do +while getopts "d:" option; do case "$option" in d) BASE_DIR="$OPTARG" ;; - c) CLOUD_ENV="$OPTARG" ;; *) usage "$0" && exit 1 ;; esac done @@ -28,12 +26,6 @@ if [ -z "$BASE_DIR" ] ; then exit 1 fi -if [ -z "$CLOUD_ENV" ] ; then - echo "xy" - usage "$0" - exit 1 -fi - if [[ ! 
-d "$BASE_DIR" ]]; then echo "Invalid directory: $BASE_DIR" exit 1 @@ -45,17 +37,16 @@ bash scripts/inputs/clean_default_inputs.sh -d ${BASE_DIR} echo "########## Building ref_panel_1kg test ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/test ${BASE_DIR}/inputs/build/ref_panel_1kg/test \ - -a '{ "test_batch" : "ref_panel_1kg", "cloud_env" : "'$CLOUD_ENV'" }' + -a '{ "test_batch" : "ref_panel_1kg" }' echo "########## Building ref_panel_1kg cohort Terra workspace ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/terra_workspaces/cohort_mode ${BASE_DIR}/inputs/build/ref_panel_1kg/terra \ - -a '{ "test_batch" : "ref_panel_1kg", "cloud_env" : "'$CLOUD_ENV'" }' + -a '{ "test_batch" : "ref_panel_1kg" }' echo "########## Building hgdp test ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/test ${BASE_DIR}/inputs/build/hgdp/test \ - -a '{ "test_batch" : "hgdp", "cloud_env" : "'$CLOUD_ENV'" }' + -a '{ "test_batch" : "hgdp" }' -# Note CLOUD_ENV is not currently required for the single-sample workflow echo "########## Building NA19240 single-sample test ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/test/GATKSVPipelineSingleSample ${BASE_DIR}/inputs/build/NA19240/test \ -a '{ "single_sample" : "test_single_sample_NA19240", "ref_panel" : "ref_panel_1kg" }' diff --git a/scripts/inputs/build_inputs.py b/scripts/inputs/build_inputs.py index e097f4d46..57ea26505 100755 --- a/scripts/inputs/build_inputs.py +++ b/scripts/inputs/build_inputs.py @@ -117,15 +117,12 @@ def main(): raw_input_bundles['test_batch_empty']['name'] = 'test_batch' raw_input_bundles['single_sample_none'] = {} raw_input_bundles['single_sample_none']['name'] = 'single_sample' - raw_input_bundles['cloud_env_none'] = {} - raw_input_bundles['cloud_env_none']['name'] = 'cloud_env' default_aliases = {'dockers': 'dockers', 'ref_panel': 'ref_panel_empty', 'reference_resources': 'resources_hg38', 'test_batch': 'test_batch_empty', - 'single_sample': 'single_sample_none', - 'cloud_env': 'cloud_env_none'} + 'single_sample': 'single_sample_none'} # prepare the input_dict using default, document default, and user-specified aliases input_dict = {} diff --git a/src/RdTest/RdTestV2.R b/src/RdTest/RdTestV2.R index 595826b3b..e2c859992 100644 --- a/src/RdTest/RdTestV2.R +++ b/src/RdTest/RdTestV2.R @@ -1450,7 +1450,7 @@ intervals[, c(2:3)] <- }) intervals <- data.frame(lapply(intervals, as.character), stringsAsFactors=FALSE) results<-apply(intervals,1,runRdTest) -if(class(results)=="list") { +if(is.list(results)) { results<-do.call(rbind,results) } else { results <- t(results) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh index 02c9de5d1..be29b1f5a 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/SR_genotype.opt_part1.sh @@ -37,22 +37,25 @@ awk '{if ($NF~"SR") print $4}' int.bed> pass.srtest.txt echo "step1" # Join RD and SR genotypes and filter same as PE -cat $petrainfile|fgrep -wf pass.srtest.txt > sr.train.include.txt +cat $petrainfile \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' pass.srtest.txt - \ + > sr.train.include.txt -join -j 1 -a 1 -e "2" -o 1.2 1.3 1.4 2.2 \ +join -j 1 -a 1 -e "2" -o 1.2 1.3 1.4 2.2 \ <(zcat ${SR_sum} \ - | fgrep -wf 
sr.train.include.txt \ + | awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' sr.train.include.txt - \ | awk '{print $1"@"$2 "\t" $0}' \ - | fgrep -wf two.sided.pass.txt \ + | awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' two.sided.pass.txt - \ | sort -k1,1 ) \ - <(zcat $RD_melted_genotypes|fgrep -wf sr.train.include.txt \ + <(zcat $RD_melted_genotypes \ + | awk 'ARGIND==1{ids[$1]; next} ($4 in ids)' sr.train.include.txt - \ | awk '{print $4"@"$5 "\t" $6}' \ - | fgrep -wf two.sided.pass.txt \ + | awk 'ARGIND==1{ids[$1]; next} ($1 in ids)' two.sided.pass.txt - \ | sort -k1,1) \ | tr ' ' '\t' \ - > SR.RD.merged.txt + > SR.RD.merged.txt -# Get cutoffs to filter out incorrectly label hom in R and treat combine het (1 and 3) and hom (0 and 4) copy states +# Get cutoffs to filter out incorrectly label hom in R and treat combine het (1 and 3) and hom (0 and 4) copy states # throw out any copy state calls that have reads less than with p=0.05 away from copy state 1 or 3 het_cutoff=$(awk '{print $1"@"$2"\t" $3 "\t" $4}' SR.RD.merged.txt \ @@ -74,7 +77,7 @@ median_hom=$(awk '{if ($NF==0 || $NF==4) print $3}' SR.RD.hetfilter.merged.txt -e 'median(d)' \ | tr '\n' '\t' \ | awk '{print $NF}') -##get std from 1 && 3 for hom restriction### +##get std from 1 && 3 for hom restriction### sd_het=$(awk '{if ($NF==1 || $NF==3) print $3}' SR.RD.hetfilter.merged.txt \ | Rscript -e 'd<-scan("stdin", quiet=TRUE)' \ -e 'mad(d)' \ @@ -84,20 +87,20 @@ sd_het=$(awk '{if ($NF==1 || $NF==3) print $3}' SR.RD.hetfilter.merged.txt \ ##Genotype SR genotype (0-ref, then estimate copy state based on copy state that is 1 sd from sd_het )## zcat ${SR_sum} \ | awk '{print $0 "\t" $1"@"$2}' \ - | fgrep -wf two.sided.pass.txt \ + | awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($4 in ids)' two.sided.pass.txt - \ | cut -f1-3 \ | awk -v var=$sr_count -v var1=$median_hom -v var2=$sd_het '{if ($3 sr.geno.final.txt zcat ${SR_sum} \ | awk '{print $0 "\t" $1"@"$2}' \ - | fgrep -wvf two.sided.pass.txt \ + | awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($4 in ids))' two.sided.pass.txt - \ | cut -f1-3 \ | awk '{print $1,$2,$3,0}' \ >> sr.geno.final.txt -gzip sr.geno.final.txt +gzip -f sr.geno.final.txt zcat ${SR_sum} \ | awk '{print $0 "\t" $1"@"$2}' \ @@ -105,7 +108,7 @@ zcat ${SR_sum} \ | awk -v var=$sr_count -v var1=$median_hom -v var2=$sd_het '{if ($3 sr.geno.final.oneside.txt.gz - + echo "step3" ##filter by quality of site by looking at % of calls with ## ##Allow just one side## @@ -140,15 +143,18 @@ echo "step4" ##pull out cnvs gt1kb and not located on x or y## zcat $RD_melted_genotypes|egrep -v "^X|^Y"|awk '{if ($3-$2>=1000) print $4}'|sort -u>idsgt1kb.txt +awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \ + <(zcat $RD_melted_genotypes|awk -F'\t' -v OFS='\t' '{if ($6!=2) print $4,$5}') \ + > nonref_rd.txt zcat $pegenotypes \ - |fgrep -wf <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \ - |awk '{if ($NF>0) print $1"@"$2}' \ - |cat - <(fgrep -wf <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) \ - <(zcat $RD_melted_genotypes|awk '{if ($6!=2) print $4"@"$5}')) \ - |fgrep -wf idsgt1kb.txt \ - |fgrep -wf pass.srtest.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' <(cut -d '@' -f1 sr.final.ids.oneside.txt|sort -u) - \ + |awk -F'\t' -v OFS='\t' '{if ($NF>0) print $1,$2}' \ + |cat - nonref_rd.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' idsgt1kb.txt - \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($1 in ids)' 
pass.srtest.txt - \ |sort -u \ + |tr '\t' '@' \ >pass.pe_rd.txt ##look for optimal cutoffs for SR variants using a 1% freq cutoff## @@ -159,28 +165,28 @@ cat recover.txt \ |sort -k1,1 \ |join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \ |tr ' ' '\t' \ - |fgrep -wf pass.pe_rd.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($5 in ids)' pass.pe_rd.txt - \ >recover.single.txt cat recover.bothsides.txt \ |sort -k1,1 \ |join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \ |tr ' ' '\t' \ - |fgrep -wf pass.pe_rd.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} ($5 in ids)' pass.pe_rd.txt - \ >recover.both.txt cat recover.txt \ |sort -k1,1 \ |join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \ |tr ' ' '\t' \ - |fgrep -wvf pass.pe_rd.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($5 in ids))' pass.pe_rd.txt - \ >recover.single.fail.txt cat recover.bothsides.txt \ |sort -k1,1 \ |join -j 1 - <(zcat sr.geno.final.oneside.txt.gz|awk '{if ($NF>0) print $1 "\t" $1"@"$2 }'|sort -k1,1) \ |tr ' ' '\t' \ - |fgrep -wvf pass.pe_rd.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{ids[$1]; next} (!($5 in ids))' pass.pe_rd.txt - \ >recover.both.fail.txt echo "step5" diff --git a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py index ec0418459..0fabae4d0 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/split_variants.py @@ -3,15 +3,17 @@ import logging -def process_bed_file(input_bed, n_per_split, bca=True): +def process_bed_file(input_bed, n_per_split, bca=True, digits=9): SVTYPE_FIELD = 5 END_FIELD = 2 START_FIELD = 1 - # Check the conditions to generate prefixes for the output files + # Conditions for each category of variants condition_prefixes = { - 'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)}, - 'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)}, + 'gt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and + (int(line[END_FIELD]) - int(line[START_FIELD]) >= 5000)}, + 'lt5kb': {'condition': lambda line: (line[SVTYPE_FIELD] == 'DEL' or line[SVTYPE_FIELD] == 'DUP') and + (int(line[END_FIELD]) - int(line[START_FIELD]) < 5000)}, 'bca': {'condition': lambda line: bca and line[SVTYPE_FIELD] not in ['DEL', 'DUP'] and not line[SVTYPE_FIELD].startswith('INS')}, 'ins': {'condition': lambda line: bca and line[SVTYPE_FIELD].startswith('INS')} } @@ -19,7 +21,7 @@ def process_bed_file(input_bed, n_per_split, bca=True): # Create trackers for the current file information current_lines = {prefix: [] for prefix in condition_prefixes.keys()} current_counts = {prefix: 0 for prefix in condition_prefixes.keys()} - current_suffixes = {prefix: 'a' for prefix in condition_prefixes.keys()} + current_suffixes = {prefix: 0 for prefix in condition_prefixes.keys()} with open(input_bed, 'r') as infile: for line in infile: @@ -27,50 +29,44 @@ def process_bed_file(input_bed, n_per_split, bca=True): # This line swaps the last two columns so the sample names are in the fifth column and SV type in the last line[4], line[5] = line[5], line[4] for 
prefix, conditions in condition_prefixes.items(): - # If a line matches a condition add it to the appropriate file + # If a line matches a condition add it to the appropriate category if conditions['condition'](line): current_lines[prefix].append('\t'.join(line)) current_counts[prefix] += 1 - # If a file has met the number of records per file create a new file with the next suffix and write - # the current line to that new file + # If a category has the specified number of records, create a new file and write the current records if current_counts[prefix] == n_per_split: - output_suffix = current_suffixes[prefix].rjust(6, 'a') - output_file = f"{prefix}.{output_suffix}.bed" + output_file = get_file_name(prefix, current_suffixes[prefix], digits) with open(output_file, 'w') as outfile: outfile.write('\n'.join(current_lines[prefix])) - # Keep track of which files have been written after reaching the max number of files + # Log the file name that was created logging.info(f"File '{output_file}' written.") # Update the tracking information current_lines[prefix] = [] current_counts[prefix] = 0 - current_suffixes[prefix] = increment_suffix(current_suffixes[prefix]) - # Handle the samples after files with the given number of lines per file have been written + current_suffixes[prefix] = current_suffixes[prefix] + 1 + # Handle the remaining records for prefix, lines in current_lines.items(): if lines: - output_suffix = current_suffixes[prefix].rjust(6, 'a') - output_file = f"{prefix}.{output_suffix}.bed" + output_file = get_file_name(prefix, current_suffixes[prefix], digits) with open(output_file, 'w') as outfile: outfile.write('\n'.join(lines)) logging.info(f"File '{output_file}' written.") -# Create a function to appropriately add a suffix to each corresponding file -def increment_suffix(suffix): - alphabet = 'abcdefghijklmnopqrstuvwxyz' - if suffix == 'z' * 6: - raise ValueError('All possible files generated.') - else: - index = alphabet.index(suffix[0]) - next_char = alphabet[(index + 1) % 26] - return next_char + suffix[1:] +def get_file_name(prefix, suffix, digits): + if len(str(suffix)) > digits: + raise ValueError('No more files can be generated with the current naming scheme. 
' + 'Increase the digits parameter or the n parameter to proceed.') + return f"{prefix}.{str(suffix).zfill(digits)}.bed" def main(): parser = argparse.ArgumentParser() parser.add_argument("--bed", help="Path to input bed file", required=True) - parser.add_argument("--n", help="number of variants per file", required=True, type=int) + parser.add_argument("--n", help="number of variants per output file", required=True, type=int) parser.add_argument("--bca", default=False, help="Flag to set to True if the VCF contains BCAs", action='store_true') + parser.add_argument("--digits", "-d", default=9, type=int, help="Number of digits in filename suffix") parser.add_argument("--log-level", required=False, default="INFO", help="Specify level of logging information") args = parser.parse_args() @@ -79,7 +75,7 @@ def main(): if not isinstance(numeric_level, int): raise ValueError('Invalid log level: %s' % log_level) logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s') - process_bed_file(args.bed, args.n, args.bca) + process_bed_file(args.bed, args.n, args.bca, args.digits) if __name__ == '__main__': diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index db4276edd..7b03c0caa 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -7,6 +7,7 @@ from typing import List, Text, Optional from collections import defaultdict +from itertools import groupby import argparse import sys @@ -69,41 +70,42 @@ def process_duplicates(vcf, fout): def process_buffers(exact_buffer, ins_buffer, counts, f_records): # Process exact matches - exact_matches = defaultdict(list) - for key, record_id in exact_buffer: - exact_matches[key].append(record_id) + sorted_buffer = sorted(exact_buffer, key=lambda x: x[0]) + exact_matches = { + key: [record for _, record in group] for key, group in groupby(sorted_buffer, key=lambda x: x[0]) + } for records in exact_matches.values(): if len(records) > 1: counts['Exact'] += 1 f_records.write(f"Exact\t{','.join(sorted(records))}\n") - # Process INS matches + # Process insert matches for i in range(len(ins_buffer)): for j in range(i + 1, len(ins_buffer)): rec1, rec2 = ins_buffer[i], ins_buffer[j] # Size comparison if rec1[1] == rec2[1]: - counts['INS 100%'] += 1 - f_records.write(f"INS 100%\t{rec1[0]},{rec2[0]}\n") + counts['ins_size_similarity_100'] += 1 + f_records.write(f"ins_size_similarity_100\t{rec1[0]},{rec2[0]}\n") elif abs(rec1[1] - rec2[1]) <= 0.5 * max(rec1[1], rec2[1]): - counts['INS 50%'] += 1 - f_records.write(f"INS 50%\t{rec1[0]},{rec2[0]}\n") + counts['ins_size_similarity_50'] += 1 + f_records.write(f"ins_size_similarity_50\t{rec1[0]},{rec2[0]}\n") else: - counts['INS 0%'] += 1 - f_records.write(f"INS 0%\t{rec1[0]},{rec2[0]}\n") + counts['ins_size_similarity_0'] += 1 + f_records.write(f"ins_size_similarity_0\t{rec1[0]},{rec2[0]}\n") # ALT comparison if rec1[2] == rec2[2]: - counts['INS ALT Identical'] += 1 - f_records.write(f"INS ALT Identical\t{rec1[0]},{rec2[0]}\n") + counts['ins_alt_identical'] += 1 + f_records.write(f"ins_alt_identical\t{rec1[0]},{rec2[0]}\n") elif ('' in (rec1[2], rec2[2])) and (' argparse.Namespace: diff --git a/src/sv-pipeline/scripts/make_evidence_qc_table.py b/src/sv-pipeline/scripts/make_evidence_qc_table.py index 9b1c3b633..8642cc7a5 100644 --- a/src/sv-pipeline/scripts/make_evidence_qc_table.py +++ b/src/sv-pipeline/scripts/make_evidence_qc_table.py @@ -147,10 +147,10 @@ def read_all_outlier(outlier_manta_df: 
pd.DataFrame, outlier_melt_df: pd.DataFra merged_dicts.update(counted) all_outliers = dict(merged_dicts) if len(all_outliers) == 0: - all_outliers_df = pd.DataFrame(columns=[ID_COL, outlier_type + "_overall_outliers"]) + all_outliers_df = pd.DataFrame(columns=[ID_COL, "overall_" + outlier_type + "_outlier"]) else: all_outliers_df = pd.DataFrame.from_dict(all_outliers, orient="index").reset_index() - all_outliers_df.columns = [ID_COL, outlier_type + "_overall_outliers"] + all_outliers_df.columns = [ID_COL, "overall_" + outlier_type + "_outlier"] return all_outliers_df diff --git a/src/sv-pipeline/scripts/vcf_qc/collectQC.vcf_wide.sh b/src/sv-pipeline/scripts/vcf_qc/collectQC.vcf_wide.sh index 38f328feb..811c63124 100755 --- a/src/sv-pipeline/scripts/vcf_qc/collectQC.vcf_wide.sh +++ b/src/sv-pipeline/scripts/vcf_qc/collectQC.vcf_wide.sh @@ -185,7 +185,7 @@ awk -v FS="\t" -v OFS="\t" -v nsamp=${nsamp} '{ for(i=10;i<=NF;i++) { split($i,a,":"); if (a[CN]==2) {homref++} - else if (a[CN]==0) {homalt++} + else if (a[CN]==0 || a[CN]>=4) {homalt++} else {het++} }; AC=(2*homalt)+het; AN=2*(nsamp-(unknown+other)); AF=AC/AN; print $3, nsamp-unknown, homref, het, homalt, other, unknown, AC, AN, AF }' >> \ diff --git a/src/svtk/README.md b/src/svtk/README.md index a49f3f475..7709df62e 100644 --- a/src/svtk/README.md +++ b/src/svtk/README.md @@ -1,14 +1,13 @@ # SVTK -Utilities for consolidating, filtering, resolving, and annotating structural -variants. +Utilities for consolidating, filtering, resolving, and annotating structural variants. ## Installation ``` -$ git clone https://github.com/talkowski-lab/svtk.git -$ cd svtk -$ pip install -e . +$ git clone https://github.com/broadinstitute/gatk-sv.git +$ cd gatk-sv +$ pip install -e ./src/svtk ``` ## Available commands diff --git a/src/svtk/svtk/pesr/pe_test.py b/src/svtk/svtk/pesr/pe_test.py index a737e2c0a..bde88cb59 100644 --- a/src/svtk/svtk/pesr/pe_test.py +++ b/src/svtk/svtk/pesr/pe_test.py @@ -89,7 +89,8 @@ def _get_coords(pos, strand): startA, endA = _get_coords(record.pos, strandA) startB, endB = _get_coords(record.stop, strandB) - region = '{0}:{1}-{2}'.format(record.chrom, startA, endA) + # Add 1 because evidence is stored/indexed with 0-based coordinates + region = '{0}:{1}-{2}'.format(record.chrom, startA + 1, endA + 1) try: pairs = self.discfile.fetch(region=region, parser=pysam.asTuple()) diff --git a/src/svtk/svtk/pesr/sr_test.py b/src/svtk/svtk/pesr/sr_test.py index 4fefd6dca..612127de3 100644 --- a/src/svtk/svtk/pesr/sr_test.py +++ b/src/svtk/svtk/pesr/sr_test.py @@ -82,7 +82,7 @@ def test_record(self, record, called, background): # Clean up columns results['name'] = record.id results['bg_frac'] = results.called / \ - (results.background + results.called) + (results.background + results.called) results['bg_frac'] = results.bg_frac.fillna(0) cols = 'name coord pos log_pval called background bg_frac'.split() @@ -120,7 +120,8 @@ def load_counts(self, chrom, pos, strand): """Load pandas DataFrame from tabixfile""" if pos > 0: - region = '{0}:{1}-{1}'.format(chrom, pos) + # Add 1 because evidence is stored/indexed with 0-based coordinates + region = '{0}:{1}-{1}'.format(chrom, pos + 1) try: lines = self.countfile.fetch(region) except ValueError: diff --git a/wdl/BAFTestChromosome.wdl b/wdl/BAFTestChromosome.wdl index 1e53f1229..c4f42da77 100644 --- a/wdl/BAFTestChromosome.wdl +++ b/wdl/BAFTestChromosome.wdl @@ -113,7 +113,6 @@ task BAFTest { set -o pipefail java -Xmx~{java_mem_mb}M -jar ${GATK_JAR} PrintSVEvidence \ - --skip-header \ 
--sequence-dictionary ~{ref_dict} \ --evidence-file ~{baf_metrics} \ -L "${chrom}:${start}-${end}" \ @@ -121,9 +120,9 @@ task BAFTest { else touch local.BAF.txt bgzip local.BAF.txt + tabix -0 -s1 -b2 -e2 local.BAF.txt.gz fi - tabix -s1 -b2 -e2 local.BAF.txt.gz svtk baf-test ~{bed} local.BAF.txt.gz --batch batch.key > ~{prefix}.metrics >>> diff --git a/wdl/BatchEvidenceMerging.wdl b/wdl/BatchEvidenceMerging.wdl index 4225784af..7f6f04ad4 100644 --- a/wdl/BatchEvidenceMerging.wdl +++ b/wdl/BatchEvidenceMerging.wdl @@ -158,7 +158,7 @@ task MergeEvidence { fi awk '/txt\.gz$/' evidence.list | while read fil; do - tabix -f -s1 -b2 -e2 $fil + tabix -f -0 -s1 -b2 -e2 $fil done /gatk/gatk --java-options "-Xmx~{java_heap_size_mb}m" PrintSVEvidence -F evidence.list --sample-names samples.list --sequence-dictionary ~{reference_dict} -O "~{batch}.~{evidence}.txt.gz" diff --git a/wdl/GenotypeCpxCnvsPerBatch.wdl b/wdl/GenotypeCpxCnvsPerBatch.wdl index aca267819..0db32f435 100644 --- a/wdl/GenotypeCpxCnvsPerBatch.wdl +++ b/wdl/GenotypeCpxCnvsPerBatch.wdl @@ -250,9 +250,9 @@ task RdTestGenotype { else touch local.RD.txt bgzip local.RD.txt + tabix -p bed local.RD.txt.gz fi - tabix -p bed local.RD.txt.gz tabix -p bed ~{bin_exclude} Rscript /opt/RdTest/RdTest.R \ diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 81c5f5f0b..558865a5b 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -17,8 +17,6 @@ workflow MainVcfQc { File? ped_file File? list_of_samples_to_include File? sample_renaming_tsv # File with mapping to rename per-sample benchmark sample IDs for compatibility with cohort - File? identify_duplicates_custom - File? merge_duplicates_custom Int max_trios = 1000 String prefix Int sv_per_shard @@ -286,7 +284,6 @@ workflow MainVcfQc { prefix=prefix, vcf=vcf, sv_pipeline_qc_docker=sv_pipeline_qc_docker, - custom_script=identify_duplicates_custom, runtime_attr_override=runtime_override_identify_duplicates } } @@ -298,7 +295,6 @@ workflow MainVcfQc { tsv_records=IdentifyDuplicates.duplicate_records, tsv_counts=IdentifyDuplicates.duplicate_counts, sv_pipeline_qc_docker=sv_pipeline_qc_docker, - custom_script=merge_duplicates_custom, runtime_attr_override=runtime_override_merge_duplicates } @@ -893,13 +889,9 @@ task IdentifyDuplicates { String prefix File vcf String sv_pipeline_qc_docker - File? custom_script RuntimeAttr? runtime_attr_override } - # File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" - # File active_script = select_first([custom_script, default_script]) - String vcf_basename = basename(vcf, ".vcf.gz") String full_prefix = "~{prefix}.~{vcf_basename}" @@ -928,7 +920,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." - python ~{custom_script} \ + python opt/sv-pipeline/scripts/merge_duplicates.py \ --vcf ~{vcf} \ --fout ~{full_prefix} @@ -949,13 +941,9 @@ task MergeDuplicates { Array[File] tsv_records Array[File] tsv_counts String sv_pipeline_qc_docker - File? custom_script RuntimeAttr? runtime_attr_override } - # File default_script = "/src/sv-pipeline/scripts/merge_duplicates.py" - # File active_script = select_first([custom_script, default_script]) - RuntimeAttr runtime_default = object { mem_gb: 3.75, disk_gb: 5 + ceil(size(tsv_records, "GiB")) + ceil(size(tsv_counts, "GiB")), @@ -981,7 +969,7 @@ task MergeDuplicates { echo "Merging all TSV files into one..." 
- python ~{custom_script} \ + python opt/sv-pipeline/scripts/merge_duplicates.py \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" diff --git a/wdl/MatrixQC.wdl b/wdl/MatrixQC.wdl index 902161ac0..e4681f961 100644 --- a/wdl/MatrixQC.wdl +++ b/wdl/MatrixQC.wdl @@ -158,10 +158,9 @@ task PESRBAF_QC { else touch ~{print_ev_output} bgzip ~{print_ev_output} + tabix -f -0 -s 1 -b 2 -e 2 ~{print_ev_output} fi - tabix -f -s 1 -b 2 -e 2 ~{print_ev_output} - /opt/sv-pipeline/00_preprocessing/misc_scripts/nonRD_matrix_QC.sh \ -d ~{distance} \ ~{print_ev_output} \ @@ -238,10 +237,9 @@ task RD_QC { else touch local.RD.txt bgzip local.RD.txt + tabix -f -p bed ~{print_ev_output} fi - tabix -f -p bed ~{print_ev_output} - /opt/sv-pipeline/00_preprocessing/misc_scripts/RD_matrix_QC.sh \ -d ~{distance} \ ~{print_ev_output} \ diff --git a/wdl/PETestChromosome.wdl b/wdl/PETestChromosome.wdl index a573cce13..360db4bf0 100644 --- a/wdl/PETestChromosome.wdl +++ b/wdl/PETestChromosome.wdl @@ -217,7 +217,6 @@ task PETest { if [ -s region.merged.bed ]; then java -Xmx~{java_mem_mb}M -jar ${GATK_JAR} PrintSVEvidence \ - --skip-header \ --sequence-dictionary ~{ref_dict} \ --evidence-file ~{discfile} \ -L region.merged.bed \ @@ -225,9 +224,9 @@ task PETest { else touch local.PE.txt bgzip local.PE.txt + tabix -0 -s1 -b2 -e2 local.PE.txt.gz fi - tabix -s1 -b2 -e2 local.PE.txt.gz svtk pe-test -o ~{window} ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.PE.txt.gz ~{prefix}.stats >>> runtime { diff --git a/wdl/RDTestChromosome.wdl b/wdl/RDTestChromosome.wdl index df11fabc4..0668fe5d5 100644 --- a/wdl/RDTestChromosome.wdl +++ b/wdl/RDTestChromosome.wdl @@ -176,10 +176,9 @@ task RDTest { else touch local.RD.txt bgzip local.RD.txt + tabix -p bed local.RD.txt.gz fi - tabix -p bed local.RD.txt.gz - Rscript /opt/RdTest/RdTest.R \ -b ~{bed} \ -n ~{prefix} \ diff --git a/wdl/ResolveCpxSv.wdl b/wdl/ResolveCpxSv.wdl index f63e1979e..cba568831 100644 --- a/wdl/ResolveCpxSv.wdl +++ b/wdl/ResolveCpxSv.wdl @@ -345,7 +345,6 @@ task ResolvePrep { if [ -s regions.bed ]; then java -Xmx~{java_mem_mb}M -jar ${GATK_JAR} PrintSVEvidence \ - --skip-header \ --sequence-dictionary ~{ref_dict} \ --evidence-file $GS_PATH_TO_DISC_FILE \ -L regions.bed \ @@ -385,7 +384,7 @@ task ResolvePrep { > discfile.PE.txt.gz fi - tabix -s 1 -b 2 -e 2 -f discfile.PE.txt.gz + tabix -0 -s 1 -b 2 -e 2 -f discfile.PE.txt.gz >>> output { diff --git a/wdl/SRTestChromosome.wdl b/wdl/SRTestChromosome.wdl index 0f945a972..83987975d 100644 --- a/wdl/SRTestChromosome.wdl +++ b/wdl/SRTestChromosome.wdl @@ -218,7 +218,6 @@ task SRTest { if [ -s region.merged.bed ]; then java -Xmx~{java_mem_mb}M -jar ${GATK_JAR} PrintSVEvidence \ - --skip-header \ --sequence-dictionary ~{ref_dict} \ --evidence-file ~{splitfile} \ -L region.merged.bed \ @@ -226,9 +225,9 @@ task SRTest { else touch local.SR.txt bgzip local.SR.txt + tabix -0 -s1 -b2 -e2 local.SR.txt.gz fi - tabix -s1 -b2 -e2 local.SR.txt.gz svtk sr-test -w 50 --log ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.SR.txt.gz ~{prefix}.stats >>> runtime { diff --git a/wdl/SetSampleIdLegacy.wdl b/wdl/SetSampleIdLegacy.wdl index bcc114582..17957d3a3 100644 --- a/wdl/SetSampleIdLegacy.wdl +++ b/wdl/SetSampleIdLegacy.wdl @@ -122,13 +122,13 @@ task SetSampleId { output_name="~{sample_name}.~{file_type}.txt.gz" if [ ! 
-f "~{evidence_file}.tbi" ]; then - tabix -s1 -b2 -e2 ~{evidence_file} + tabix -0 -s1 -b2 -e2 ~{evidence_file} fi mkfifo $fifo_name /gatk/gatk --java-options "-Xmx2000m" PrintSVEvidence -F ~{evidence_file} --sequence-dictionary ~{reference_dict} -O $fifo_name & awk '{$~{sample_column}="~{sample_name}"}' < $fifo_name | bgzip -c > $output_name - tabix -s1 -b2 -e2 $output_name + tabix -0 -s1 -b2 -e2 $output_name >>> runtime { diff --git a/wdl/TasksGenotypeBatch.wdl b/wdl/TasksGenotypeBatch.wdl index 7a945ff48..aa1221b3e 100644 --- a/wdl/TasksGenotypeBatch.wdl +++ b/wdl/TasksGenotypeBatch.wdl @@ -344,10 +344,9 @@ task RDTestGenotype { else touch local.RD.txt bgzip local.RD.txt + tabix -p bed local.RD.txt.gz fi - tabix -p bed local.RD.txt.gz - Rscript /opt/RdTest/RdTest.R \ -b ~{bed} \ -c local.RD.txt.gz \ @@ -435,9 +434,9 @@ task CountPE { else touch local.PE.txt bgzip local.PE.txt + tabix -0 -s1 -b2 -e2 local.PE.txt.gz fi - tabix -s1 -b2 -e2 local.PE.txt.gz svtk count-pe -s ~{write_lines(samples)} --medianfile ~{medianfile} ~{vcf} local.PE.txt.gz ~{prefix}.pe_counts.txt gzip ~{prefix}.pe_counts.txt @@ -511,9 +510,9 @@ task CountSR { else touch local.SR.txt bgzip local.SR.txt + tabix -0 -s1 -b2 -e2 local.SR.txt.gz fi - tabix -s1 -b2 -e2 local.SR.txt.gz svtk count-sr -s ~{write_lines(samples)} --medianfile ~{medianfile} ~{vcf} local.SR.txt.gz ~{prefix}.sr_counts.txt /opt/sv-pipeline/04_variant_resolution/scripts/sum_SR.sh ~{prefix}.sr_counts.txt ~{prefix}.sr_sum.txt.gz gzip ~{prefix}.sr_counts.txt diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl index 579d1255e..43c41e8fa 100644 --- a/wdl/TrainGCNV.wdl +++ b/wdl/TrainGCNV.wdl @@ -112,7 +112,7 @@ workflow TrainGCNV { } } - if (defined(n_samples_subsample) && !defined(sample_ids_training_subset)) { + if (defined(n_samples_subsample) && (select_first([n_samples_subsample]) < length(samples)) && !defined(sample_ids_training_subset)) { call util.RandomSubsampleStringArray { input: strings = write_lines(samples), diff --git a/website/docs/advanced/build_inputs.md b/website/docs/advanced/build_inputs.md index b986919c0..2beb1993a 100644 --- a/website/docs/advanced/build_inputs.md +++ b/website/docs/advanced/build_inputs.md @@ -21,20 +21,10 @@ You may run the following commands to get these example inputs. git clone https://github.com/broadinstitute/gatk-sv && cd gatk-sv ``` -2. Create a JSON file containing the Terra billing project (for use on Terra) - or the Google project ID (for use on Cromwell) that you will use to run - the workflows with the test input. You may create this file by running - the following command and replacing `"my-google-project-id"` and - `"my-terra-billing-project"` with your project and billing IDs. +2. Create test inputs. ```shell - echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json - ``` - -3. Create test inputs. - - ```shell - bash scripts/inputs/build_default_inputs.sh -d . -c google_cloud.my_project + bash scripts/inputs/build_default_inputs.sh -d . ``` Running this command generates test inputs in `gatk-sv/inputs/build` with the following structure. 
@@ -62,7 +52,7 @@ python scripts/inputs/build_inputs.py \ inputs/values \ inputs/templates/test/GATKSVPipelineSingleSample \ inputs/build/NA19240/test \ - -a '{ "test_batch" : "ref_panel_1kg", "cloud_env": "google_cloud.my_project" }' + -a '{ "test_batch" : "ref_panel_1kg" }' ``` @@ -98,24 +88,18 @@ Here is an example of how to generate workflow input jsons from `GATKSVPipelineB metadata.json \ > inputs/values/my_ref_panel.json ``` - -3. Define your google project id (for Cromwell inputs) and Terra billing project (for workspace inputs). - - ```shell - echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json - ``` -4. Build test files for batched workflows (google cloud project id required). +3. Build test files for batched workflows (google cloud project id required). ```shell python scripts/inputs/build_inputs.py \ inputs/values \ inputs/templates/test \ inputs/build/my_ref_panel/test \ - -a '{ "test_batch" : "ref_panel_1kg", "cloud_env": "google_cloud.my_project" }' + -a '{ "test_batch" : "ref_panel_1kg" }' ``` -5. Build test files for the single-sample workflow +4. Build test files for the single-sample workflow ```shell python scripts/inputs/build_inputs.py \ @@ -125,7 +109,7 @@ Here is an example of how to generate workflow input jsons from `GATKSVPipelineB -a '{ "single_sample" : "test_single_sample_NA19240", "ref_panel" : "my_ref_panel" }' ``` -6. Build files for a Terra workspace. +5. Build files for a Terra workspace. ```shell python scripts/inputs/build_inputs.py \ diff --git a/website/docs/gs/quick_start.md b/website/docs/gs/quick_start.md index 282b8e72e..b225f7837 100644 --- a/website/docs/gs/quick_start.md +++ b/website/docs/gs/quick_start.md @@ -44,8 +44,7 @@ The input values are provided only as an example and are not publicly accessible > cp $GATK_SV_ROOT/wdl/*.wdl . > zip dep.zip *.wdl > cd .. -> echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json -> bash scripts/inputs/build_default_inputs.sh -d $GATK_SV_ROOT -c google_cloud.my_project +> bash scripts/inputs/build_default_inputs.sh -d $GATK_SV_ROOT > cp $GATK_SV_ROOT/inputs/build/ref_panel_1kg/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json GATKSVPipelineBatch.my_run.json > cromshell submit wdl/GATKSVPipelineBatch.wdl GATKSVPipelineBatch.my_run.json cromwell_config.json wdl/dep.zip ``` diff --git a/website/docs/gs/runtime_env.md b/website/docs/gs/runtime_env.md index 10b8c6234..65aee6032 100644 --- a/website/docs/gs/runtime_env.md +++ b/website/docs/gs/runtime_env.md @@ -48,4 +48,4 @@ and share code on forked repositories. Here are a some considerations: - The GATK-SV pipeline takes advantage of the massive parallelization possible in the cloud. Local backends may not have the resources to execute all of the workflows. Workflows that use fewer resources or that are less parallelized may be more successful. - For instance, some users have been able to run [GatherSampleEvidence](#gather-sample-evidence) on a SLURM cluster. + For instance, some users have been able to run [GatherSampleEvidence](../modules/gse) on a SLURM cluster. 
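As a side note on the input-building documentation changes above (which drop the `cloud_env` value bundle and its alias), here is a minimal sketch of how the alias resolution behaves after this patch: user-supplied `-a` aliases overlay the defaults shown in the `build_inputs.py` diff, and each resolved bundle name selects a JSON value file under `inputs/values/`. The helper name and the file-existence handling are assumptions for illustration, not the actual `scripts/inputs/build_inputs.py` code.

```python
# Hypothetical sketch (not the real build_inputs.py): -a aliases overlay the
# defaults shown in the diff above, and each resolved bundle name maps to a
# JSON value file under inputs/values/.
import json
from pathlib import Path

def resolve_value_bundles(values_dir, user_aliases):
    default_aliases = {
        "dockers": "dockers",
        "ref_panel": "ref_panel_empty",
        "reference_resources": "resources_hg38",
        "test_batch": "test_batch_empty",
        "single_sample": "single_sample_none",
    }
    aliases = {**default_aliases, **user_aliases}
    bundles = {}
    for alias, bundle_name in aliases.items():
        bundle_file = Path(values_dir) / f"{bundle_name}.json"
        # Placeholder bundles (e.g. test_batch_empty) are built in code, not read from disk.
        bundles[alias] = json.loads(bundle_file.read_text()) if bundle_file.is_file() else {}
    return bundles

# Example mirroring the docs above:
# bundles = resolve_value_bundles("inputs/values", {"test_batch": "ref_panel_1kg"})
```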
diff --git a/website/docs/modules/evidence_qc.md b/website/docs/modules/evidence_qc.md index 085945f87..5177636da 100644 --- a/website/docs/modules/evidence_qc.md +++ b/website/docs/modules/evidence_qc.md @@ -5,6 +5,8 @@ sidebar_position: 2 slug: eqc --- +import { Highlight, HighlightOptionalArg } from "../../src/components/highlight.js" + Runs ploidy estimation, dosage scoring, and optionally VCF QC. The results from this module can be used for QC and batching. @@ -17,9 +19,36 @@ for further guidance on creating batches. We also recommend using sex assignments generated from the ploidy estimates and incorporating them into the PED file, with sex = 0 for sex aneuploidies. -### Prerequisites +The following diagram illustrates the upstream and downstream workflows of the `EvidenceQC` workflow +in the recommended invocation order. You may refer to +[this diagram](https://github.com/broadinstitute/gatk-sv/blob/main/terra_pipeline_diagram.jpg) +for the overall recommended invocation order. + +
+ +```mermaid + +stateDiagram + direction LR + + classDef inModules stroke-width:0px,fill:#00509d,color:#caf0f8 + classDef thisModule font-weight:bold,stroke-width:0px,fill:#ff9900,color:white + classDef outModules stroke-width:0px,fill:#caf0f8,color:#00509d + + gse: GatherSampleEvidence + eqc: EvidenceQC + batching: Batching, sample QC, and sex assignment + + gse --> eqc + eqc --> batching + + class eqc thisModule + class gse inModules + class batching outModules +``` + +
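The "Batching, sample QC, and sex assignment" step in the diagram above is typically performed by the analyst rather than by a workflow, using EvidenceQC outputs (dosage/WGD scores and ploidy-based sex assignments) together with coverage and PCR status, as the dashboard guidance earlier in this patch recommends. The following is a purely illustrative sketch of one way such batching could be done; the function name, batch size, and grouping strategy are assumptions, not pipeline code.

```python
# Illustrative only: grouping samples into batches of roughly 100-500 using
# EvidenceQC outputs (WGD dosage score) plus PCR status, as recommended above.
from typing import Dict, List

def make_batches(wgd_scores: Dict[str, float],
                 pcr_status: Dict[str, str],
                 target_size: int = 200) -> List[List[str]]:
    batches = []
    # Keep PCR+ and PCR- samples in separate batches.
    for status in sorted(set(pcr_status.values())):
        group = [s for s in wgd_scores if pcr_status.get(s) == status]
        # Sort by dosage score so each batch stays relatively homogeneous.
        group.sort(key=lambda s: wgd_scores[s])
        for i in range(0, len(group), target_size):
            batches.append(group[i:i + target_size])
    return batches
```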
-- [Gather Sample Evidence](./gse) ### Inputs diff --git a/website/docs/modules/gather_batch_evidence.md b/website/docs/modules/gather_batch_evidence.md index d6de3948f..0bdf8a7a9 100644 --- a/website/docs/modules/gather_batch_evidence.md +++ b/website/docs/modules/gather_batch_evidence.md @@ -5,25 +5,174 @@ sidebar_position: 4 slug: gbe --- -Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample -raw evidence into a batch. See above for more information on batching. +Runs CNV callers ([cn.MOPS](https://academic.oup.com/nar/article/40/9/e69/1136601), GATK-gCNV) +and combines single-sample raw evidence into a batch. -### Prerequisites +The following diagram illustrates the downstream workflows of the `GatherBatchEvidence` workflow +in the recommended invocation order. You may refer to +[this diagram](https://github.com/broadinstitute/gatk-sv/blob/main/terra_pipeline_diagram.jpg) +for the overall recommended invocation order. -- GatherSampleEvidence -- (Recommended) EvidenceQC -- gCNV training. +```mermaid -### Inputs -- PED file (updated with EvidenceQC sex assignments, including sex = 0 - for sex aneuploidies. Calls will not be made on sex chromosomes - when sex = 0 in order to avoid generating many confusing calls - or upsetting normalized copy numbers for the batch.) -- Read count, BAF, PE, SD, and SR files (GatherSampleEvidence) -- Caller VCFs (GatherSampleEvidence) -- Contig ploidy model and gCNV model files (gCNV training) +stateDiagram + direction LR + + classDef inModules stroke-width:0px,fill:#00509d,color:#caf0f8 + classDef thisModule font-weight:bold,stroke-width:0px,fill:#ff9900,color:white + classDef outModules stroke-width:0px,fill:#caf0f8,color:#00509d -### Outputs + gbe: GatherBatchEvidence + t: TrainGCNV + cb: ClusterBatch + t --> gbe + gbe --> cb + + class gbe thisModule + class t inModules + class cb outModules +``` + +## Inputs +This workflow takes as input the read counts, BAF, PE, SD, SR, and per-caller VCF files +produced in the GatherSampleEvidence workflow, and contig ploidy and gCNV models from +the TrainGCNV workflow. +The following is the list of the inputs the GatherBatchEvidence workflow takes. + + +#### `batch` +An identifier for the batch. + + +#### `samples` +Sets the list of sample IDs. + + +#### `counts` +Set to the [`GatherSampleEvidence.coverage_counts`](./gse#coverage-counts) output. + + +#### Raw calls + +The following inputs set the per-caller raw SV calls, and should be set +if the caller was run in the [`GatherSampleEvidence`](./gse) workflow. +You may set each of the following inputs to the linked output from +the GatherSampleEvidence workflow. + + +- `manta_vcfs`: [`GatherSampleEvidence.manta_vcf`](./gse#manta-vcf); +- `melt_vcfs`: [`GatherSampleEvidence.melt_vcf`](./gse#melt-vcf); +- `scramble_vcfs`: [`GatherSampleEvidence.scramble_vcf`](./gse#scramble-vcf); +- `wham_vcfs`: [`GatherSampleEvidence.wham_vcf`](./gse#wham-vcf). + +#### `PE_files` +Set to the [`GatherSampleEvidence.pesr_disc`](./gse#pesr-disc) output. + +#### `SR_files` +Set to the [`GatherSampleEvidence.pesr_split`](./gse#pesr-split) + + +#### `SD_files` +Set to the [`GatherSampleEvidence.pesr_sd`](./gse#pesr-sd) + + +#### `matrix_qc_distance` +You may refer to [this file](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GatherBatchEvidence.json.tmpl) +for an example value. + + +#### `min_svsize` +Sets the minimum size of SVs to include. 
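To make the input wiring above concrete, the snippet below sketches how a batch-level input JSON might assemble the per-sample GatherSampleEvidence outputs listed in this section. The fully-qualified key names and bucket paths are illustrative assumptions; the authoritative input names should be taken from the GatherBatchEvidence WDL or the linked Terra template.

```python
# Hypothetical sketch of assembling a batch-level input JSON from per-sample
# GatherSampleEvidence outputs. Key names and bucket paths are assumptions;
# check the WDL / Terra template for the authoritative input names.
import json

samples = ["sample_01", "sample_02"]
inputs = {
    "GatherBatchEvidence.batch": "batch_01",
    "GatherBatchEvidence.samples": samples,
    "GatherBatchEvidence.counts": [f"gs://my-bucket/{s}.counts.tsv.gz" for s in samples],
    "GatherBatchEvidence.PE_files": [f"gs://my-bucket/{s}.pe.txt.gz" for s in samples],
    "GatherBatchEvidence.SR_files": [f"gs://my-bucket/{s}.sr.txt.gz" for s in samples],
    "GatherBatchEvidence.SD_files": [f"gs://my-bucket/{s}.sd.txt.gz" for s in samples],
    "GatherBatchEvidence.manta_vcfs": [f"gs://my-bucket/{s}.manta.vcf.gz" for s in samples],
}

with open("GatherBatchEvidence.batch_01.json", "w") as f:
    json.dump(inputs, f, indent=2)
```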
+ + +#### `ped_file` +A pedigree file describing the familial relationships between the samples in the cohort. +Please refer to [this section](./#ped_file) for details. + + +#### `run_matrix_qc` +Enables or disables running optional QC tasks. + + +#### `gcnv_qs_cutoff` +You may refer to [this file](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GatherBatchEvidence.json.tmpl) +for an example value. + +#### cn.MOPS files +The workflow needs the following cn.MOPS files. + +- `cnmops_chrom_file` and `cnmops_allo_file`: FASTA index files (`.fai`) for + non-sex chromosomes (autosomes) and chromosomes X and Y (allosomes), respectively. + The file format is explained [on this page](https://www.htslib.org/doc/faidx.html). + + You may use the following files for these fields: + + ```json + "cnmops_chrom_file": "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/autosome.fai" + "cnmops_allo_file": "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/allosome.fai" + ``` + +- `cnmops_exclude_list`: + You may use [this file](https://github.com/broadinstitute/gatk-sv/blob/d66f760865a89f30dbce456a3f720dec8b70705c/inputs/values/resources_hg38.json#L10) + for this field. + +#### GATK-gCNV inputs + +The following inputs are configured based on the outputs generated in the [`TrainGCNV`](./gcnv) workflow. + +- `contig_ploidy_model_tar`: [`TrainGCNV.cohort_contig_ploidy_model_tar`](./gcnv#contig-ploidy-model-tarball) +- `gcnv_model_tars`: [`TrainGCNV.cohort_gcnv_model_tars`](./gcnv#model-tarballs) + + +The workflow also enables setting a few optional arguments of gCNV. +The arguments and their default values are provided +[here](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GatherBatchEvidence.json.tmpl) +as the following, and each argument is documented on +[this page](https://gatk.broadinstitute.org/hc/en-us/articles/360037593411-PostprocessGermlineCNVCalls) +and +[this page](https://gatk.broadinstitute.org/hc/en-us/articles/360047217671-GermlineCNVCaller). + + +#### Docker images + +The workflow needs the following Docker images, the latest versions of which are in +[this file](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/dockers.json). + + - `cnmops_docker`; + - `condense_counts_docker`; + - `linux_docker`; + - `sv_base_docker`; + - `sv_base_mini_docker`; + - `sv_pipeline_docker`; + - `sv_pipeline_qc_docker`; + - `gcnv_gatk_docker`; + - `gatk_docker`. + +#### Static inputs + +You may refer to [this reference file](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/resources_hg38.json) +for values of the following inputs. + + - `primary_contigs_fai`; + - `cytoband`; + - `ref_dict`; + - `mei_bed`; + - `genome_file`; + - `sd_locs_vcf`. + + +#### Optional Inputs +The following is the list of a few optional inputs of the +workflow, with an example of possible values. 

+ +- `"allosomal_contigs": [["chrX", "chrY"]]` +- `"ploidy_sample_psi_scale": 0.001` + + + + + +## Outputs - Combined read count matrix, SR, PE, and BAF files - Standardized call VCFs diff --git a/website/docs/modules/gather_sample_evidence.md b/website/docs/modules/gather_sample_evidence.md index 918a1b27d..fcc951be3 100644 --- a/website/docs/modules/gather_sample_evidence.md +++ b/website/docs/modules/gather_sample_evidence.md @@ -6,20 +6,77 @@ slug: gse --- Runs raw evidence collection on each sample with the following SV callers: -Manta, Wham, and/or MELT. For guidance on pre-filtering prior to GatherSampleEvidence, +Manta, Wham, Scramble, and/or MELT. For guidance on pre-filtering prior to GatherSampleEvidence, refer to the Sample Exclusion section. -Note: a list of sample IDs must be provided. Refer to the sample ID -requirements for specifications of allowable sample IDs. +The following diagram illustrates the downstream workflows of the `GatherSampleEvidence` workflow +in the recommended invocation order. You may refer to +[this diagram](https://github.com/broadinstitute/gatk-sv/blob/main/terra_pipeline_diagram.jpg) +for the overall recommended invocation order. + + +```mermaid + +stateDiagram + direction LR + + classDef inModules stroke-width:0px,fill:#00509d,color:#caf0f8 + classDef thisModule font-weight:bold,stroke-width:0px,fill:#ff9900,color:white + classDef outModules stroke-width:0px,fill:#caf0f8,color:#00509d + + gse: GatherSampleEvidence + eqc: EvidenceQC + gse --> eqc + + class gse thisModule + class eqc outModules +``` + + +## Inputs + +#### `bam_or_cram_file` +A BAM or CRAM file aligned to hg38. Index file (.bai) must be provided if using BAM. + +#### `sample_id` +Refer to the [sample ID requirements](/docs/gs/inputs#sampleids) for specifications of allowable sample IDs. IDs that do not meet these requirements may cause errors. -### Inputs +#### `preprocessed_intervals` +Picard interval list. + +#### `sd_locs_vcf` +(`sd`: site depth) +A VCF file containing allele counts at common SNP loci of the genome, which is used for calculating BAF. +For human genome, you may use [`dbSNP`](https://www.ncbi.nlm.nih.gov/snp/) +that contains a complete list of common and clinical human single nucleotide variations, +microsatellites, and small-scale insertions and deletions. +You may find a link to the file in +[this reference](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/resources_hg38.json). -- Per-sample BAM or CRAM files aligned to hg38. Index files (.bai) must be provided if using BAMs. -### Outputs +## Outputs -- Caller VCFs (Manta, MELT, and/or Wham) - Binned read counts file - Split reads (SR) file - Discordant read pairs (PE) file + +#### `manta_vcf` {#manta-vcf} +A VCF file containing variants called by Manta. + +#### `melt_vcf` {#melt-vcf} +A VCF file containing variants called by MELT. + +#### `scramble_vcf` {#scramble-vcf} +A VCF file containing variants called by Scramble. + +#### `wham_vcf` {#wham-vcf} +A VCF file containing variants called by Wham. + +#### `coverage_counts` {#coverage-counts} + +#### `pesr_disc` {#pesr-disc} + +#### `pesr_split` {#pesr-split} + +#### `pesr_sd` {#pesr-sd} \ No newline at end of file diff --git a/website/docs/modules/index.md b/website/docs/modules/index.md index 2ffea615f..a5f4ad7c5 100644 --- a/website/docs/modules/index.md +++ b/website/docs/modules/index.md @@ -36,3 +36,18 @@ consisting of multiple modules to be executed in the following order. 
- **Module 09 (in development)** Visualization, including scripts that generates IGV screenshots and rd plots. - Additional modules to be added: de novo and mosaic scripts + + +## Pipeline Parameters + +Several inputs are shared across different modules of the pipeline, which are explained in this section. + +#### `ped_file` + +A pedigree file describing the familial relationships between the samples in the cohort. +The file needs to be in the +[PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). +Updated with [EvidenceQC](./eqc) sex assignments, including +`sex = 0` for sex aneuploidies; +genotypes on chrX and chrY for samples with `sex = 0` in the PED file will be set to +`./.` and these samples will be excluded from sex-specific training steps. diff --git a/website/docs/modules/train_gcnv.md b/website/docs/modules/train_gcnv.md index 7bbd8e934..45c7b3cac 100644 --- a/website/docs/modules/train_gcnv.md +++ b/website/docs/modules/train_gcnv.md @@ -5,37 +5,102 @@ sidebar_position: 3 slug: gcnv --- -Trains a gCNV model for use in GatherBatchEvidence. -The WDL can be found at /wdl/TrainGCNV.wdl. - -Both the cohort and single-sample modes use the -GATK gCNV depth calling pipeline, which requires a -trained model as input. The samples used for training -should be technically homogeneous and similar to the -samples to be processed (i.e. same sample type, -library prep protocol, sequencer, sequencing center, etc.). -The samples to be processed may comprise all or a subset -of the training set. For small, relatively homogenous cohorts, -a single gCNV model is usually sufficient. If a cohort -contains multiple data sources, we recommend training a separate -model for each batch or group of batches with similar dosage -score (WGD). The model may be trained on all or a subset of -the samples to which it will be applied; a reasonable default -is 100 randomly-selected samples from the batch (the random -selection can be done as part of the workflow by specifying -a number of samples to the n_samples_subsample input -parameter in /wdl/TrainGCNV.wdl). - -### Prerequisites - -- GatherSampleEvidence -- (Recommended) EvidenceQC - -### Inputs - -- Read count files (GatherSampleEvidence) - -### Outputs - -- Contig ploidy model tarball -- gCNV model tarballs \ No newline at end of file +import { Highlight, HighlightOptionalArg } from "../../src/components/highlight.js" + +[GATK-gCNV](https://www.nature.com/articles/s41588-023-01449-0) +is a method for detecting rare germline copy number variants (CNVs) +from short-read sequencing read-depth information. +The [TrainGCNV](https://github.com/broadinstitute/gatk-sv/blob/main/wdl/TrainGCNV.wdl) +module trains a gCNV model for use in the [GatherBatchEvidence](./gbe) workflow. +The upstream and downstream dependencies of the TrainGCNV module are illustrated in the following diagram. + + +The samples used for training should be homogeneous (concerning sequencing platform, +coverage, library preparation, etc.) and similar +to the samples on which the model will be applied in terms of sample type, +library preparation protocol, sequencer, sequencing center, etc. + + +For small, relatively homogeneous cohorts, a single gCNV model is usually sufficient. +However, for larger cohorts, especially those with multiple data sources, +we recommend training a separate model for each batch or group of batches (see +[batching section](/docs/run/joint#batching) for details). 
+The model can be trained on all or a subset of the samples to which it will be applied. +A subset of 100 randomly selected samples from the batch is a reasonable +input size for training the model; when the `n_samples_subsample` input is provided, +the `TrainGCNV` workflow can automatically perform this random selection. + +The following diagram illustrates the upstream and downstream workflows of the `TrainGCNV` workflow +in the recommended invocation order. You may refer to +[this diagram](https://github.com/broadinstitute/gatk-sv/blob/main/terra_pipeline_diagram.jpg) +for the overall recommended invocation order. + +```mermaid + +stateDiagram + direction LR + + classDef inModules stroke-width:0px,fill:#00509d,color:#caf0f8 + classDef thisModule font-weight:bold,stroke-width:0px,fill:#ff9900,color:white + classDef outModules stroke-width:0px,fill:#caf0f8,color:#00509d + + batching: Batching, sample QC, and sex assignment + t: TrainGCNV + gbe: GatherBatchEvidence + + batching --> t + t --> gbe + + class t thisModule + class batching inModules + class gbe outModules +``` + +## Inputs + +This section provides a brief description on the _required_ inputs of the TrainGCNV workflow. +For a description on the _optional_ inputs and their default values, you may refer to the +[source code](https://github.com/broadinstitute/gatk-sv/blob/main/wdl/TrainGCNV.wdl) of the TrainGCNV workflow. +Additionally, the majority of the optional inputs of the workflow map to the optional arguments of the +tool the workflow uses, `GATK GermlineCNVCaller`; hence, you may refer to the +[documentation](https://gatk.broadinstitute.org/hc/en-us/articles/360040097712-GermlineCNVCaller) +of the tool for a description on these optional inputs. + +#### `samples` +A list of sample IDs. +The order of IDs in this list should match the order of files in `count_files`. + +#### `count_files` +A list of per-sample coverage counts generated in the [GatherSampleEvidence](./gse#outputs) workflow. + +#### `contig_ploidy_priors` +A tabular file with ploidy prior probability per contig. +You may find the link to this input from +[this reference](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/resources_hg38.json) +and a description to the file format +[here](https://gatk.broadinstitute.org/hc/en-us/articles/360037224772-DetermineGermlineContigPloidy). + + +#### `reference_fasta` +`reference_fasta`, `reference_index`, `reference_dict` are respectively the +reference genome sequence in the FASTA format, its index file, and a corresponding +[dictionary file](https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format). +You may find links to these files from +[this reference](https://github.com/broadinstitute/gatk-sv/blob/main/inputs/values/resources_hg38.json). + + +## Outputs + +#### Optional `annotated_intervals` {#annotated-intervals} + +The count files from [GatherSampleEvidence](./gse) with adjacent intervals combined into +locus-sorted `DepthEvidence` files using `GATK CondenseDepthEvidence` tool, which are +annotated with GC content, mappability, and segmental-duplication content using +[`GATK AnnotateIntervals`](https://gatk.broadinstitute.org/hc/en-us/articles/360041416652-AnnotateIntervals) +tool. This output is generated if the optional input `do_explicit_gc_correction` is set to `True`. 
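Looping back to the `n_samples_subsample` input described above, and to the guard added to `TrainGCNV.wdl` earlier in this patch (subsampling is skipped when the requested count is not smaller than the batch), the behavior can be summarized with the small illustrative sketch below; the function name and seed handling are assumptions, and this is not the WDL task itself.

```python
# Illustration of the subsampling behavior described above: subsample only when
# n_samples_subsample is set and smaller than the batch (mirroring the updated
# TrainGCNV.wdl guard), keeping sample IDs and count files paired in order.
import random
from typing import List, Optional, Tuple

def subsample_training_set(samples: List[str],
                           count_files: List[str],
                           n_samples_subsample: Optional[int] = None,
                           seed: int = 42) -> Tuple[List[str], List[str]]:
    assert len(samples) == len(count_files), "samples and count_files must be paired"
    if n_samples_subsample is None or n_samples_subsample >= len(samples):
        # No subsampling: use the full batch for training.
        return samples, count_files
    picked = sorted(random.Random(seed).sample(range(len(samples)), n_samples_subsample))
    return [samples[i] for i in picked], [count_files[i] for i in picked]
```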
+ +#### Optional `filtered_intervals_cnv` {#filtered-intervals-cnv} + +#### Optional `cohort_contig_ploidy_model_tar` {#contig-ploidy-model-tarball} + +#### Optional `cohort_gcnv_model_tars` {#model-tarballs} diff --git a/website/src/components/highlight.js b/website/src/components/highlight.js new file mode 100644 index 000000000..8f41722bc --- /dev/null +++ b/website/src/components/highlight.js @@ -0,0 +1,25 @@ +const Highlight = ({children, color}) => ( + + {children} + +); + +const HighlightOptionalArg = ({children}) => ( + + {children} + +); + +export { Highlight, HighlightOptionalArg }; diff --git a/website/src/css/custom.css b/website/src/css/custom.css index 2bc6a4cfd..e34580355 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -15,6 +15,11 @@ --ifm-color-primary-lightest: #3cad6e; --ifm-code-font-size: 95%; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.1); + + --highlight-text-color: black; + --highlight-background-color: #7091e6; + --highlight-optional-arg-text-color: black; + --highlight-optional-arg-background-color: #7091e6; } /* For readability concerns, you should choose a lighter palette in dark mode. */ @@ -27,4 +32,9 @@ --ifm-color-primary-lighter: #32d8b4; --ifm-color-primary-lightest: #4fddbf; --docusaurus-highlighted-code-line-bg: rgba(0, 0, 0, 0.3); + + --highlight-text-color: black; + --highlight-background-color: #7091e6; + --highlight-optional-arg-text-color: black; + --highlight-optional-arg-background-color: #7091e6; } From be4f722eb5324a0b50d25bc55a10e3712a5dab88 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 08:37:33 -0400 Subject: [PATCH 24/29] Re-added extra line --- wdl/MainVcfQc.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 138c66d05..7967a8995 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -198,6 +198,7 @@ workflow MainVcfQc { } } + # Merge all VID lists into single output directory and tar it call TarShardVidLists { input: From ad609e1d13439f3f9ec4cf9fcbc59461aeb110d7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 08:39:40 -0400 Subject: [PATCH 25/29] Corrected line add to test docker image publishing --- wdl/MainVcfQc.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 7967a8995..e8a40c6b7 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -198,7 +198,6 @@ workflow MainVcfQc { } } - # Merge all VID lists into single output directory and tar it call TarShardVidLists { input: @@ -374,6 +373,7 @@ task PlotQcVcfWide { } } + # Task to merge VID lists across shards task TarShardVidLists { input { From f5b2ed01970fbfd8449be2f4ac745bfc5d06b03b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 08:54:04 -0400 Subject: [PATCH 26/29] Corrected to use identify_duplicates in task --- wdl/MainVcfQc.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index e8a40c6b7..9d7dae885 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -921,7 +921,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." 
- python opt/sv-pipeline/scripts/merge_duplicates.py \ + python opt/sv-pipeline/scripts/identify_duplicates.py \ --vcf ~{vcf} \ --fout ~{full_prefix} From 3fb16a4a7d85d6f85b7270eab53159a38fbc5610 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 11:13:27 -0400 Subject: [PATCH 27/29] Added root path for python files --- wdl/MainVcfQc.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index 9d7dae885..7c11f3f87 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -921,7 +921,7 @@ task IdentifyDuplicates { echo "Processing ~{vcf} into ~{full_prefix}..." - python opt/sv-pipeline/scripts/identify_duplicates.py \ + python /opt/sv-pipeline/scripts/identify_duplicates.py \ --vcf ~{vcf} \ --fout ~{full_prefix} @@ -970,7 +970,7 @@ task MergeDuplicates { echo "Merging all TSV files into one..." - python opt/sv-pipeline/scripts/merge_duplicates.py \ + python /opt/sv-pipeline/scripts/merge_duplicates.py \ --records ~{sep=' ' tsv_records} \ --counts ~{sep=' ' tsv_counts} \ --fout "~{prefix}.agg" From 577e6b1e556c784d67b39fb622ca86281382f398 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 11:58:25 -0400 Subject: [PATCH 28/29] Minor script update - use lowercase exact --- src/sv-pipeline/scripts/identify_duplicates.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/sv-pipeline/scripts/identify_duplicates.py b/src/sv-pipeline/scripts/identify_duplicates.py index 7b03c0caa..1ca56ac5c 100644 --- a/src/sv-pipeline/scripts/identify_duplicates.py +++ b/src/sv-pipeline/scripts/identify_duplicates.py @@ -74,11 +74,10 @@ def process_buffers(exact_buffer, ins_buffer, counts, f_records): exact_matches = { key: [record for _, record in group] for key, group in groupby(sorted_buffer, key=lambda x: x[0]) } - for records in exact_matches.values(): if len(records) > 1: - counts['Exact'] += 1 - f_records.write(f"Exact\t{','.join(sorted(records))}\n") + counts['exact'] += 1 + f_records.write(f"exact\t{','.join(sorted(records))}\n") # Process insert matches for i in range(len(ins_buffer)): From 25a146746aec3f7a0989b8dc65e76079bed9df55 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 26 Sep 2024 11:59:09 -0400 Subject: [PATCH 29/29] Removed dockstore push --- .github/.dockstore.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 3aa477636..3bcad4198 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -150,7 +150,6 @@ workflows: filters: branches: - main - - kj/701_vcf_qc_duplicates tags: - /.*/
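A side note on the `identify_duplicates.py` change in PATCH 28/29 above: the exact-match grouping relies on `itertools.groupby`, which only merges adjacent items, so the buffer must be sorted by the same key before grouping. The following standalone sketch illustrates that pattern with toy record tuples rather than the script's real key fields.

```python
# Standalone illustration of the sorted-buffer + groupby pattern used above.
# The (key, record_id) tuples here are toy data, not real VCF-derived keys.
from itertools import groupby

buffer = [
    (("chr1", 1000, "INS"), "var_a"),
    (("chr1", 1000, "DEL"), "var_b"),
    (("chr1", 1000, "INS"), "var_c"),
]

# groupby only merges adjacent items, so sort by key before grouping.
sorted_buffer = sorted(buffer, key=lambda x: x[0])
exact_matches = {
    key: [record for _, record in group]
    for key, group in groupby(sorted_buffer, key=lambda x: x[0])
}

for key, records in exact_matches.items():
    if len(records) > 1:
        print(f"exact\t{','.join(sorted(records))}")
# prints: exact    var_a,var_c
```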