Add check for duplicate records to MainVcfQc #705

Merged · 31 commits · Oct 28, 2024
Commits
968ab5b
Initial commit
kjaisingh Aug 6, 2024
239d7f9
Added branch name to dockstore file
kjaisingh Aug 6, 2024
aa921e2
Updated script definitions
kjaisingh Aug 6, 2024
2317dd7
Updated WDL to use custom script
kjaisingh Aug 6, 2024
bf4d7bc
Modified output file paths
kjaisingh Aug 7, 2024
f734014
Added more logging
kjaisingh Aug 7, 2024
e7e9ddc
Used dot notation
kjaisingh Aug 7, 2024
8d98cd1
Minor logging changes
kjaisingh Aug 7, 2024
cfe266e
Removed glob() references
kjaisingh Aug 7, 2024
9d4fcce
Standardized variable input structure
kjaisingh Aug 7, 2024
4ff559c
Removed direct python script call
kjaisingh Aug 7, 2024
b72e9d6
Renamed output files to be duplicate instead of duplicated
kjaisingh Aug 7, 2024
2b4c476
Removed call to MergeDuplicates
kjaisingh Aug 7, 2024
ae2f33e
Undo commenting out of MergeDuplicates
kjaisingh Aug 7, 2024
6a270e5
Removed direct reference to vcfs_for_qc
kjaisingh Aug 8, 2024
0f5a1b6
Minor changes
kjaisingh Aug 8, 2024
899ff79
Added option for both default and custom script
kjaisingh Aug 9, 2024
f9af767
Modified path to python scripts
kjaisingh Aug 9, 2024
e9e5872
Removed unused import
kjaisingh Aug 9, 2024
37e20c2
Resolved all flake8 linting errors
kjaisingh Aug 9, 2024
89cd795
Rolled back to solely use custom scripts for time being
kjaisingh Aug 9, 2024
f6a3025
Update wdl/MainVcfQc.wdl
kjaisingh Sep 25, 2024
8b5bd12
Updated files
kjaisingh Sep 25, 2024
3ae646f
Merge branch 'kj/701_vcf_qc_duplicates' of https://github.com/broadin…
kjaisingh Sep 25, 2024
b709ec9
Merge branch 'main' into kj/701_vcf_qc_duplicates
kjaisingh Sep 25, 2024
be4f722
Re-added extra line
kjaisingh Sep 26, 2024
ad609e1
Corrected line add to test docker image publishing
kjaisingh Sep 26, 2024
f5b2ed0
Corrected to use identify_duplicates in task
kjaisingh Sep 26, 2024
3fb16a4
Added root path for python files
kjaisingh Sep 26, 2024
577e6b1
Minor script update - use lowercase exact
kjaisingh Sep 26, 2024
25a1467
Removed dockstore push
kjaisingh Sep 26, 2024
131 changes: 131 additions & 0 deletions src/sv-pipeline/scripts/identify_duplicates.py
@@ -0,0 +1,131 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Identify and classify duplicated variants from an input VCF.
"""

from typing import List, Text, Optional
from collections import defaultdict
from itertools import groupby

import argparse
import sys
import pysam


def process_duplicates(vcf, fout):
# Initialize counters and buffers
counts = defaultdict(int)
exact_buffer = []
ins_buffer = []
current_chrom = None
current_pos = None

# Create output files
with open(f"{fout}_duplicate_records.tsv", 'w') as f_records, open(f"{fout}_duplicate_counts.tsv", 'w') as f_counts:
f_records.write("TYPE\tDUP_RECORDS\n")
f_counts.write("TYPE\tDUP_COUNTS\n")

# Iterate through all records
for record in vcf.fetch():
# Process current buffers if we've reached a new chrom or pos
if record.chrom != current_chrom or record.pos != current_pos:
process_buffers(exact_buffer, ins_buffer, counts, f_records)
exact_buffer = []
ins_buffer = []
current_chrom = record.chrom
current_pos = record.pos

# Update buffers with new record
exact_key = (
record.chrom,
record.pos,
record.stop,
record.info.get('SVTYPE'),
record.info.get('SVLEN'),
record.info.get('CHR2'),
record.info.get('END2'),
record.info.get('STRANDS'),
record.info.get('CPX_TYPE'),
record.info.get('CPX_INTERVALS')
)
exact_buffer.append((exact_key, record.id))

if record.info.get('SVTYPE') == 'INS':
insert_key = (
record.id,
record.info.get('SVLEN'),
record.alts[0]
)
ins_buffer.append(insert_key)

# Process remaining records in the buffer
process_buffers(exact_buffer, ins_buffer, counts, f_records)

# Write counts to file
for match_type in sorted(counts.keys()):
f_counts.write(f"{match_type}\t{counts[match_type]}\n")


def process_buffers(exact_buffer, ins_buffer, counts, f_records):
# Process exact matches
sorted_buffer = sorted(exact_buffer, key=lambda x: x[0])
exact_matches = {
key: [record for _, record in group] for key, group in groupby(sorted_buffer, key=lambda x: x[0])
}
for records in exact_matches.values():
if len(records) > 1:
counts['exact'] += 1
f_records.write(f"exact\t{','.join(sorted(records))}\n")

# Process insert matches
for i in range(len(ins_buffer)):
for j in range(i + 1, len(ins_buffer)):
rec1, rec2 = ins_buffer[i], ins_buffer[j]

# Size comparison
if rec1[1] == rec2[1]:
counts['ins_size_similarity_100'] += 1
f_records.write(f"ins_size_similarity_100\t{rec1[0]},{rec2[0]}\n")
elif abs(rec1[1] - rec2[1]) <= 0.5 * max(rec1[1], rec2[1]):
counts['ins_size_similarity_50'] += 1
f_records.write(f"ins_size_similarity_50\t{rec1[0]},{rec2[0]}\n")
else:
counts['ins_size_similarity_0'] += 1
f_records.write(f"ins_size_similarity_0\t{rec1[0]},{rec2[0]}\n")

# ALT comparison
if rec1[2] == rec2[2]:
counts['ins_alt_identical'] += 1
f_records.write(f"ins_alt_identical\t{rec1[0]},{rec2[0]}\n")
elif ('<INS>' in (rec1[2], rec2[2])) and ('<INS:' in (rec1[2] + rec2[2])):
counts['ins_alt_same_subtype'] += 1
f_records.write(f"ins_alt_same_subtype\t{rec1[0]},{rec2[0]}\n")
elif rec1[2].startswith('<INS:') and rec2[2].startswith('<INS:') and rec1[2] != rec2[2]:
counts['ins_alt_different_subtype'] += 1
f_records.write(f"ins_alt_different_subtype\t{rec1[0]},{rec2[0]}\n")


def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Identify duplicated records from a sorted input VCF",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-v', '--vcf', required=True, help='Input VCF.')
parser.add_argument('-f', '--fout', required=True, help='Output file name.')
parsed_arguments = parser.parse_args(argv[1:])
return parsed_arguments


def main(argv: Optional[List[Text]] = None):
if argv is None:
argv = sys.argv
args = _parse_arguments(argv)

vcf = pysam.VariantFile(args.vcf)
process_duplicates(vcf, args.fout)


if __name__ == '__main__':
main()
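
For reference, a minimal standalone sketch (not part of this PR) of the exact-duplicate grouping performed in process_buffers(): records buffered at the same position whose full key tuples match are reported as one duplicate group. The key fields and record IDs below are made up.

from itertools import groupby

# (exact_key, record_id) pairs, as assembled in process_duplicates()
exact_buffer = [
    (("chr1", 1000, 1500, "DEL", 500, None, None, None, None, None), "var_A"),
    (("chr1", 1000, 1500, "DEL", 500, None, None, None, None, None), "var_B"),
    (("chr1", 1000, 2000, "DUP", 1000, None, None, None, None, None), "var_C"),
]

# Sort by the key tuple so identical keys are adjacent, then group them
sorted_buffer = sorted(exact_buffer, key=lambda x: x[0])
for key, group in groupby(sorted_buffer, key=lambda x: x[0]):
    ids = [record_id for _, record_id in group]
    if len(ids) > 1:
        print("exact\t" + ",".join(sorted(ids)))  # prints: exact	var_A,var_B
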
72 changes: 72 additions & 0 deletions src/sv-pipeline/scripts/merge_duplicates.py
@@ -0,0 +1,72 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Merge duplicate records and counts files across multiple TSVs.
"""

from typing import List, Text, Optional
from collections import defaultdict

import argparse
import sys
import csv


def merge_duplicates(record_files: List[str], count_files: List[str], fout: str):
# Merge records
with open(f"{fout}_duplicate_records.tsv", 'w', newline='') as out_records:
writer = csv.writer(out_records, delimiter='\t')
# Write header
writer.writerow(['TYPE', 'DUP_RECORDS'])
# Append records from each file
for record_file in record_files:
with open(record_file, 'r') as f:
reader = csv.reader(f, delimiter='\t')
next(reader)
for row in reader:
writer.writerow(row)

# Sum counts
counts = defaultdict(int)
for count_file in count_files:
with open(count_file, 'r') as f:
reader = csv.reader(f, delimiter='\t')
next(reader)
for row in reader:
counts[row[0]] += int(row[1])

# Merge counts
with open(f"{fout}_duplicate_counts.tsv", 'w', newline='') as out_counts:
writer = csv.writer(out_counts, delimiter='\t')

# Write header
writer.writerow(['TYPE', 'DUP_COUNTS'])

# Append each row from merged counts
for category, count in counts.items():
writer.writerow([category, count])


def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Merge duplicate records and counts files across multiple TSVs",
formatter_class=argparse.ArgumentDefaultsHelpFormatter
)
parser.add_argument('-r', '--records', nargs='+', required=True, help='Input duplicated record TSV files.')
parser.add_argument('-c', '--counts', nargs='+', required=True, help='Input duplicated counts TSV files.')
parser.add_argument('-f', '--fout', required=True, help='Output file name.')
parsed_arguments = parser.parse_args(argv[1:])
return parsed_arguments


def main(argv: Optional[List[Text]] = None):
if argv is None:
argv = sys.argv
args = _parse_arguments(argv)

merge_duplicates(args.records, args.counts, args.fout)


if __name__ == '__main__':
main()
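
As a quick illustration (not part of this PR) of the counts-merging behaviour in merge_duplicates(): per-shard counts are summed per TYPE, while records are simply concatenated. The shard contents below are hypothetical.

import csv
import io
from collections import defaultdict

# Hypothetical per-shard *_duplicate_counts.tsv contents
shard_counts = [
    "TYPE\tDUP_COUNTS\nexact\t2\nins_size_similarity_100\t1\n",
    "TYPE\tDUP_COUNTS\nexact\t3\nins_alt_identical\t4\n",
]

counts = defaultdict(int)
for text in shard_counts:
    reader = csv.reader(io.StringIO(text), delimiter='\t')
    next(reader)  # skip header, as merge_duplicates() does
    for row in reader:
        counts[row[0]] += int(row[1])

print(dict(counts))
# {'exact': 5, 'ins_size_similarity_100': 1, 'ins_alt_identical': 4}
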
125 changes: 125 additions & 0 deletions wdl/MainVcfQc.wdl
@@ -39,6 +39,8 @@ workflow MainVcfQc {
RuntimeAttr? runtime_override_plot_qc_per_sample
RuntimeAttr? runtime_override_plot_qc_per_family
RuntimeAttr? runtime_override_per_sample_benchmark_plot
RuntimeAttr? runtime_override_identify_duplicates
RuntimeAttr? runtime_override_merge_duplicates
RuntimeAttr? runtime_override_sanitize_outputs

# overrides for MiniTasks or Utils
@@ -275,6 +277,27 @@
}
}

# Identify all duplicates
scatter(vcf in vcfs_for_qc) {
call IdentifyDuplicates {
input:
prefix=prefix,
vcf=vcf,
sv_pipeline_qc_docker=sv_pipeline_qc_docker,
runtime_attr_override=runtime_override_identify_duplicates
}
}

# Merge duplicates
call MergeDuplicates {
input:
prefix=prefix,
tsv_records=IdentifyDuplicates.duplicate_records,
tsv_counts=IdentifyDuplicates.duplicate_counts,
sv_pipeline_qc_docker=sv_pipeline_qc_docker,
runtime_attr_override=runtime_override_merge_duplicates
}

# Sanitize all outputs
call SanitizeOutputs {
input:
@@ -296,6 +319,8 @@
output {
File sv_vcf_qc_output = SanitizeOutputs.vcf_qc_tarball
File vcf2bed_output = MergeVcf2Bed.merged_bed_file
File duplicate_records_output = MergeDuplicates.duplicate_records
File duplicate_counts_output = MergeDuplicates.duplicate_counts
}
}

@@ -858,3 +883,103 @@ task SanitizeOutputs {
}
}


# Identify all duplicates in a single file
task IdentifyDuplicates {
input {
String prefix
File vcf
String sv_pipeline_qc_docker
RuntimeAttr? runtime_attr_override
}

String vcf_basename = basename(vcf, ".vcf.gz")
String full_prefix = "~{prefix}.~{vcf_basename}"

RuntimeAttr runtime_default = object {
mem_gb: 3.75,
disk_gb: 10 + ceil(size(vcf, "GiB")),
cpu_cores: 1,
preemptible_tries: 1,
max_retries: 1,
boot_disk_gb: 10
}

RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
runtime {
memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
docker: sv_pipeline_qc_docker
bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
}

command <<<
set -euo pipefail

echo "Processing ~{vcf} into ~{full_prefix}..."

python /opt/sv-pipeline/scripts/identify_duplicates.py \
--vcf ~{vcf} \
--fout ~{full_prefix}

echo "Finishing processing VCF."
>>>

output {
File duplicate_records = "~{full_prefix}_duplicate_records.tsv"
File duplicate_counts = "~{full_prefix}_duplicate_counts.tsv"
}
}


# Aggregate distinct duplicate summary files
task MergeDuplicates {
input {
String prefix
Array[File] tsv_records
Array[File] tsv_counts
String sv_pipeline_qc_docker
RuntimeAttr? runtime_attr_override
}

RuntimeAttr runtime_default = object {
mem_gb: 3.75,
disk_gb: 5 + ceil(size(tsv_records, "GiB")) + ceil(size(tsv_counts, "GiB")),
cpu_cores: 1,
preemptible_tries: 1,
max_retries: 1,
boot_disk_gb: 10
}

RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
runtime {
memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB"
disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
docker: sv_pipeline_qc_docker
bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
}

command <<<
set -euo pipefail

echo "Merging all TSV files into one..."

python /opt/sv-pipeline/scripts/merge_duplicates.py \
--records ~{sep=' ' tsv_records} \
--counts ~{sep=' ' tsv_counts} \
--fout "~{prefix}.agg"

echo "All TSVs processed."
>>>

output {
File duplicate_records = "~{prefix}.agg_duplicate_records.tsv"
File duplicate_counts = "~{prefix}.agg_duplicate_counts.tsv"
}
}
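
Outside of Cromwell, the scatter/gather above can be approximated locally with a small driver. This is a sketch under assumed local paths: the repository-relative script paths and shard VCF names are hypothetical, not the /opt/sv-pipeline paths baked into the docker image.

import subprocess

shard_vcfs = ["shard0.vcf.gz", "shard1.vcf.gz"]  # hypothetical shard VCFs
prefixes = []

# Scatter: one identify_duplicates.py run per shard VCF
for i, vcf in enumerate(shard_vcfs):
    prefix = f"cohort.shard{i}"
    subprocess.run(
        ["python", "src/sv-pipeline/scripts/identify_duplicates.py",
         "--vcf", vcf, "--fout", prefix],
        check=True)
    prefixes.append(prefix)

# Gather: merge the per-shard records and counts TSVs
subprocess.run(
    ["python", "src/sv-pipeline/scripts/merge_duplicates.py",
     "--records"] + [f"{p}_duplicate_records.tsv" for p in prefixes] +
    ["--counts"] + [f"{p}_duplicate_counts.tsv" for p in prefixes] +
    ["--fout", "cohort.agg"],
    check=True)
# Produces cohort.agg_duplicate_records.tsv and cohort.agg_duplicate_counts.tsv,
# mirroring the MergeDuplicates task outputs.
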