From be512149ca2a4739f4ad6706d04d1fda54ac0067 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 17 Oct 2024 12:32:21 -0400 Subject: [PATCH 01/40] Initial commit --- wdl/ResolveComplexVariants.wdl | 5 +++++ wdl/TasksMakeCohortVcf.wdl | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index f712537b6..e2d32128b 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -17,6 +17,9 @@ workflow ResolveComplexVariants { Array[File] disc_files Array[File] rf_cutoff_files + Array[String]? background_fail_columns + Array[String]? bothsides_pass_columns + File contig_list Int max_shard_size File cytobands @@ -194,6 +197,7 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", + header_columns=select_first([bothsides_pass_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -204,6 +208,7 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", + header_columns=select_first([background_fail_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index d489831e8..fef15e068 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -684,6 +684,7 @@ task UpdateSrList { File vcf File original_list String outfile + Array[String]? header_columns String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -711,6 +712,10 @@ task UpdateSrList { command <<< set -euxo pipefail + if [[ ! 
-z "~{sep=' ' header_columns}" ]]; then + echo -e "~{sep='\t' header_columns}" > ~{outfile} + fi + # append new ids to original list svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header @@ -724,7 +729,7 @@ task UpdateSrList { else print $0,$NF; \ }' int.bed ~{original_list} \ | sort -k1,1n \ - > ~{outfile} + >> ~{outfile} >>> output { From d20d5b87e46ac7e259abd4f95c456f80f7545fbd Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 15:34:37 -0400 Subject: [PATCH 02/40] E2E working in local integration tests --- .github/.dockstore.yml | 1 + dockerfiles/sv-pipeline/Dockerfile | 12 - .../scripts/clean_vcf_part2.sh | 171 -- .../svpipeline/CleanVCFPart1.java | 316 ---- .../svpipeline/CleanVCFPart1UnitTest.java | 40 - wdl/CleanVcf.wdl | 9 +- wdl/CleanVcf5.wdl | 6 +- wdl/CleanVcfChromosome.wdl | 1577 ++++++++--------- 8 files changed, 791 insertions(+), 1341 deletions(-) delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh delete mode 100644 src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java delete mode 100644 src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 974fdeee6..938508af3 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -141,6 +141,7 @@ workflows: filters: branches: - main + - kj_sv_cleanvcf tags: - /.*/ diff --git a/dockerfiles/sv-pipeline/Dockerfile b/dockerfiles/sv-pipeline/Dockerfile index d4f9aa687..5d9c759c7 100644 --- a/dockerfiles/sv-pipeline/Dockerfile +++ b/dockerfiles/sv-pipeline/Dockerfile @@ -70,13 +70,9 @@ RUN plink2 || true # -Compile StitchFragmentedCNVs Java program # -Compile StitchFragmentedCNVs unit tests # -Compile VCFParser unit tests -# -Compile and test CleanVCFPart1 Java program -# -Compile and test CleanVCFPart1 unit tests ENV STITCH_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVs.jar" ARG STITCH_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVsUnitTest.jar" ARG VCF_PARSER_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/VCFParserUnitTest.jar" -ENV CLEAN_VCF_PART_1_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1.jar" -ARG CLEAN_VCF_PART_1_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1UnitTest.jar" ARG BUILD_DEPS="openjdk-8-jdk" ARG DEBIAN_FRONTEND=noninteractive RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED_PACKAGES) && \ @@ -97,14 +93,6 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED echo "Running VCFParserUnitTest..." && \ java -enableassertions -jar $VCF_PARSER_UNIT_TEST_JAR && \ rm -rf build/classes/* $VCF_PARSER_UNIT_TEST_JAR && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1.jar "org.broadinstitute.svpipeline.CleanVCFPart1" -C build/classes . && \ - rm -rf build/classes/* && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1UnitTest.jar "org.broadinstitute.svpipeline.CleanVCFPart1UnitTest" -C build/classes . && \ - echo "Running CleanVCFPart1UnitTest..." 
&& \
-    java -enableassertions -jar $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \
-    rm -rf build/classes/* $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \
     apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \
     apt-get -qqy autoremove --purge && \
     apt-get -qqy clean && \
diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh
deleted file mode 100755
index 5d4827fa5..000000000
--- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/bin/bash
-#
-# clean_vcf_part2.sh
-#
-
-set -euo pipefail
-
-##bgzipped combined vcf from clean vcf part1b.sh##
-normal_revise_vcf_gz=$1
-##whitelist of split ids for parallelization##
-whitelist=$2
-##list of multiallelic CNVs from 1b##
-multi_cnv=$3
-##output filename##
-outputfile=$4
-
-export LC_ALL=C
-
-##subset vcf to whitelist samples##
-bcftools view $normal_revise_vcf_gz -S $whitelist --no-update \
-  | gzip \
-  > subset.vcf.gz
-
-
-##create new bed with updated genotypes###
-zcat subset.vcf.gz \
-  | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#" || $5=="<DEL>" || $5=="<DUP>") print}' \
-  | svtk vcf2bed stdin stdout \
-  | sed 1d \
-  | sort -k4,4 \
-  | gzip \
-  > int.afternormalfix.bed.gz
-
-##Find overlapping depth based variants and reassign depth based; note this is necessary because depth call >5kb genotypes are 100% driven by depth ##
-##generate a sample list based on depth for depth overlap check below. Necessary because genotype is capped at 1/1 and by direction (i.e no dels in dups)##
-##grab all samples per variant with a non normal copy state##
-zcat subset.vcf.gz \
-  | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#") print; else if ($5=="<DEL>" || $5=="<DUP>") {$1=$3; print}}' \
-  | vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \
-  | awk 'BEGIN{FS=OFS="\t"} NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1,header[j],$j }' \
-  | sort -k1,1 \
-  | awk 'BEGIN{FS=OFS="\t"} \
-    {if (($1~"DEL" && $3<2 && $3!=".") || ($1~"DUP" && $3>2 && $3!=".")) a[$1]=a[$1]?a[$1]","$2:$2} \
-    END{for (i in a) print i,a[i]}' \
-  | sort -k1,1 \
-  > afternormal.combined.RD_CN.list.txt
-
-##get a list of samples for actual variants not just those with aberrant copy states##
-zcat int.afternormalfix.bed.gz \
-  | awk 'BEGIN{FS=OFS="\t"} $6!="" {split($6,samples,","); for (i in samples) print $4"@"samples[i]}' \
-  | sort -k1,1 \
-  > fullvar.afternormal.list.txt
-
-##create bed with anything that has abnormal copy state##
-##do not compress all.bed because of bedtools bug: https://github.com/arq5x/bedtools2/issues/643##
-zcat int.afternormalfix.bed.gz \
-  | cut -f1-5 \
-  | awk 'BEGIN{FS=OFS="\t"}{if($3-$2>=5000)print $0}' \
-  | join -1 4 -2 1 -t ' ' - afternormal.combined.RD_CN.list.txt \
-  | awk 'BEGIN {FS=OFS="\t"} \
-    $6!=""{split($6,samples,","); \
-      for (i in samples) {s = samples[i]; print $2"_"s,$3,$4,$1,$5,s,$1"@"s}}' \
-  | sort -k7,7 \
-  | join -t ' ' -a 1 -1 7 -2 1 -e "NA" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.1 - fullvar.afternormal.list.txt \
-  > all.bed
-
-##intersect variants and always set larger to left##
-bedtools intersect -wa -wb -a all.bed -b all.bed \
-  | awk 'BEGIN{FS=OFS="\t"} \
-    {if ($8=="NA") { if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; } \
-     else if ($4!=$12) {
-       if ($3-$2>=$11-$10) print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15; \
-       else if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; \
-       else print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15;}}' \
-  | sort -k7,7 
\ - | uniq \ - | gzip \ - > bed.overlap.txt.gz - -zcat subset.vcf.gz \ - | grep '^#' \ - > vcf_header -zcat bed.overlap.txt.gz \ - | awk '{print $4; print $11}' \ - | sort -u \ - > overlap.events -echo '~~~' >> overlap.events -zcat subset.vcf.gz \ - | grep -v '^#' \ - | sort -k3,3 \ - | join -t ' ' -1 1 -2 3 overlap.events - \ - | awk 'BEGIN{FS=OFS="\t"}{tmp=$2;$2=$3;$3=tmp;print}' \ - | sort -k2n,2 \ - > overlap.events.vcf - -##get info for each variant## -zcat bed.overlap.txt.gz \ - | awk 'BEGIN{FS=OFS="\t"}{print $7; print $14;}' \ - | sort -u \ - > combined.bed -for var in EV RD_CN GT -do - echo '~~~' >> combined.bed - cat vcf_header overlap.events.vcf \ - | vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - | awk 'BEGIN{FS=OFS="\t"} \ - NR==1{for (i=3;i<=NF;i++) header[i]=$i} \ - NR>1 {for (j=3;j<=NF;j++) print $1"@"header[j],$j}' \ - | sort -k1,1 \ - | join -t ' ' -1 1 -2 1 combined.bed - \ - > combined.bed.tmp - mv combined.bed.tmp combined.bed -done - -zcat bed.overlap.txt.gz \ - | join -t ' ' -1 7 -2 1 - combined.bed \ - | cut -f2- \ - | sort -k13,13 \ - | join -t ' ' -1 13 -2 1 - combined.bed \ - | cut -f2- \ - | awk 'BEGIN{FS=OFS="\t"}{print $3-$2,$9-$8,$0}' \ - | sort -k1nr,1 -k2nr,2 \ - | cut -f3- \ - | gzip \ - > all.combined.bed.gz - -zcat all.combined.bed.gz \ - | awk -v multiCNVFile=$multi_cnv ' \ - function makeRevision( id, val ) { reviseCN[id] = val; if ( val == 2 ) wasRevisedToNormal[id] = 1; } \ - BEGIN \ - {FS=OFS="\t"; \ - while ( getline < multiCNVFile ) multiCNV[$0] = 1; \ - close(multiCNVFile)} \ - {chr_sample1 = $1; start1 = $2; stop1 = $3; ev1 = $4; svtype1 = $5; sample1 = $6; \ - chr_sample2 = $7; start2 = $8; stop2 = $9; ev2 = $10; svtype2 = $11; sample2 = $12; \ - support1 = $13; RD_CN1 = $14; GT1 = $15; support2 = $16; RD_CN2 = $17; GT2 = $18; \ - id1 = ev1"@"sample1; id2 = ev2"@"sample2; \ - length1 = stop1 - start1; length2 = stop2 - start2; \ - if ( id1 in wasRevisedToNormal ) next; \ - if ( id1 in reviseCN ) RD_CN1 = reviseCN[id1]; \ - if ( id2 in reviseCN ) RD_CN2 = reviseCN[id2]; \ - overlap = (stop1 < stop2 ? stop1 : stop2) - (start1 > start2 ? 
start1 : start2); \ - smallOverlap50 = overlap/length2 > .5; \ - largeOverlap50 = overlap/length1 > .5; \ - ##Call where smaller depth call is being driven by larger## \ - if ( support1 ~ /RD/ && support1 != "RD" && support2 == "RD" && smallOverlap50 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } \ - ##Smaller CNV driving larger CNV genotype## \ - else if ( support1 == "RD" && support2 ~ /RD/ && support2 != "RD" && smallOverlap50 && !(ev2 in multiCNV) && GT2 != "0/0" && largeOverlap50 ) { \ - if ( RD_CN2 == 0 ) makeRevision(id1, RD_CN1 + 2); \ - else if ( RD_CN2 == 1 ) makeRevision(id1, RD_CN1 + RD_CN2); \ - else if ( RD_CN2 > 1 ) { newCN = RD_CN1 - RD_CN2 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id1, newCN); } } \ - ##Depth only calls where smaller call is being driven by larger## \ - else if ( support1 == "RD" && support2 == "RD" && smallOverlap50 && svtype1 == svtype2 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 && RD_CN1 != RD_CN2 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 && RD_CN1 > RD_CN2 ) makeRevision(id2, 1); \ - else if ( RD_CN1 > 1 && RD_CN1 < RD_CN2 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } \ - else makeRevision(id2, 2); } \ - ##Any other time a larger call is driving a smaller call## \ - else if ( support1 ~ /RD/ && smallOverlap50 && length2 > 5000 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } } \ - END \ - {for ( id in reviseCN ) print id,reviseCN[id]; }' \ - | sed 's/@/ /' \ - | sort \ - | awk 'BEGIN{FS=OFS="\t"}{if ($3<0) $3=0; print $0}' \ - > $outputfile diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java deleted file mode 100644 index 5637154bb..000000000 --- a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java +++ /dev/null @@ -1,316 +0,0 @@ -package org.broadinstitute.svpipeline; - -import java.io.*; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.regex.Pattern; -import org.broadinstitute.svpipeline.VCFParser.*; - -public class CleanVCFPart1 { - private static final ByteSequence[] EV_VALS = { - null, - new ByteSequence("RD"), - new ByteSequence("PE"), - new ByteSequence("RD,PE"), - new ByteSequence("SR"), - new ByteSequence("RD,SR"), - new ByteSequence("PE,SR"), - new ByteSequence("RD,PE,SR") - }; - private static final ByteSequence FORMAT_LINE = new ByteSequence("FORMAT"); - private static final ByteSequence ID_KEY = new ByteSequence("ID"); - private static final ByteSequence EV_VALUE = new ByteSequence("EV"); - private static final ByteSequence TYPE_KEY = new ByteSequence("Type"); - private static final ByteSequence STRING_VALUE = new ByteSequence("String"); - private static final ByteSequence NUMBER_KEY = new ByteSequence("Number"); - private static final ByteSequence SVTYPE_KEY = new ByteSequence("SVTYPE"); - private static final ByteSequence ME_VALUE = new ByteSequence(":ME"); - private static final ByteSequence LT_VALUE = new ByteSequence("<"); - private static final ByteSequence GT_VALUE = new ByteSequence(">"); - 
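Stepping back to the overlap-revision logic in the deleted clean_vcf_part2.sh above: the `makeRevision` arithmetic is easy to misread in awk. Here is a hedged Python sketch of just the copy-number re-centering rule (a smaller RD-supported call overlapped more than 50% by a larger call has its RD_CN re-centered around the larger call's RD_CN), assuming the same semantics as the awk branches:

```python
def revise_rd_cn(rd_cn_large: int, rd_cn_small: int) -> int:
    """Re-center the smaller call's copy state around the larger overlapping call.

    Mirrors the awk branches: a large-call RD_CN of 0 or 1 shifts the small call up,
    while RD_CN > 1 subtracts the excess, flooring at 0.
    """
    if rd_cn_large == 0:
        return rd_cn_small + 2
    if rd_cn_large == 1:
        return rd_cn_small + rd_cn_large  # i.e. rd_cn_small + 1
    return max(rd_cn_small - rd_cn_large + 2, 0)

# Worked example: a larger call with RD_CN=1 over a smaller call with RD_CN=2
assert revise_rd_cn(1, 2) == 3
# A larger call with RD_CN=4 over a smaller call with RD_CN=2 floors at 0
assert revise_rd_cn(4, 2) == 0
```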
-    private static final ByteSequence N_VALUE = new ByteSequence("N");
-    private static final ByteSequence END_KEY = new ByteSequence("END");
-    private static final ByteSequence VARGQ_KEY = new ByteSequence("varGQ");
-    private static final ByteSequence MULTIALLELIC_KEY = new ByteSequence("MULTIALLELIC");
-    private static final ByteSequence UNRESOLVED_KEY = new ByteSequence("UNRESOLVED");
-    private static final ByteSequence HIGH_SR_BACKGROUND_KEY = new ByteSequence("HIGH_SR_BACKGROUND");
-    private static final ByteSequence BOTHSIDES_SUPPORT_KEY = new ByteSequence("BOTHSIDES_SUPPORT");
-    private static final ByteSequence DEL_VALUE = new ByteSequence("DEL");
-    private static final ByteSequence DUP_VALUE = new ByteSequence("DUP");
-    private static final ByteSequence RDCN_VALUE = new ByteSequence("RD_CN");
-    private static final ByteSequence MISSING_VALUE = new ByteSequence(".");
-    private static final ByteSequence MISSING_GENOTYPE = new ByteSequence("./.");
-    private static final ByteSequence GT_REF_REF = new ByteSequence("0/0");
-    private static final ByteSequence GT_REF_ALT = new ByteSequence("0/1");
-    private static final ByteSequence GT_ALT_ALT = new ByteSequence("1/1");
-
-    private static final int MIN_ALLOSOME_EVENT_SIZE = 5000;
-
-    public static void main( final String[] args ) {
-        if ( args.length != 8 ) {
-            System.err.println("Usage: java org.broadinstitute.svpipeline.CleanVCFPart1 " +
-                    "INPUTVCFFILE PEDIGREES XCHR YCHR NOISYEVENTS BOTHSIDES SAMPLESOUT REVISEDEVENTSOUT");
-            System.exit(1);
-        }
-        final VCFParser parser = new VCFParser(args[0]);
-        final ByteSequence xChrName = new ByteSequence(args[2]);
-        final ByteSequence yChrName = new ByteSequence(args[3]);
-        final Set<ByteSequence> noisyEvents = readLastColumn(args[4]);
-        final Set<ByteSequence> bothsidesSupportEvents = readLastColumn(args[5]);
-        try ( final OutputStream os
-                      = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out));
-              final OutputStream osSamples = new BufferedOutputStream(new FileOutputStream(args[6]));
-              final OutputStream osRevEvents = new BufferedOutputStream(new FileOutputStream(args[7])) ) {
-            int[] sexForSample = null;
-            while ( parser.hasMetadata() ) {
-                final Metadata metadata = parser.nextMetaData();
-                if ( metadata instanceof ColumnHeaderMetadata ) {
-                    final ColumnHeaderMetadata cols = ((ColumnHeaderMetadata)metadata);
-                    final List<ByteSequence> colNames = cols.getValue();
-                    final int nCols = colNames.size();
-                    for ( int idx = 9; idx < nCols; ++idx ) {
-                        colNames.get(idx).write(osSamples);
-                        osSamples.write('\n');
-                    }
-                    sexForSample = readPedFile(args[1], cols.getValue());
-                    os.write(("##INFO=<ID=HIGH_SR_BACKGROUND,Number=0,Type=Flag,Description=\"High number of SR splits in background samples indicating messy region\">\n")
-                            .getBytes(StandardCharsets.UTF_8));
-                    os.write("##FILTER=<ID=UNRESOLVED,Description=\"Variant is unresolved\">\n"
-                            .getBytes(StandardCharsets.UTF_8));
-                    os.write(("##INFO=<ID=BOTHSIDES_SUPPORT,Number=0,Type=Flag,Description=\"Variant has read-level support for both sides of breakpoint\">\n")
-                            .getBytes(StandardCharsets.UTF_8));
-                } else if ( metadata instanceof KeyAttributesMetadata ) {
-                    final KeyAttributesMetadata keyAttrs = (KeyAttributesMetadata)metadata;
-                    if ( keyAttrs.getKey().equals(FORMAT_LINE) ) {
-                        final List<KeyValue> kvs = keyAttrs.getValue();
-                        final int nKVs = kvs.size();
-                        if ( nKVs > 2 ) {
-                            final KeyValue kv0 = kvs.get(0);
-                            final KeyValue kv1 = kvs.get(1);
-                            final KeyValue kv2 = kvs.get(2);
-                            if ( kv0.getKey().equals(ID_KEY) && kv0.getValue().equals(EV_VALUE) ) {
-                                if ( kv1.getKey().equals(NUMBER_KEY) ) {
-                                    kvs.set(1, new KeyValue(NUMBER_KEY, MISSING_VALUE));
-                                }
-                                if ( kv2.getKey().equals(TYPE_KEY) ) {
-                                    kvs.set(2, new KeyValue(TYPE_KEY, STRING_VALUE));
-                                }
-                            }
-                        }
-                    }
-                }
-                metadata.write(os);
-            }
-            if ( sexForSample == null ) {
-                throw new RuntimeException("header line with sample names is missing.");
missing."); - } - while ( parser.hasRecord() ) { - final Record record = parser.nextRecord(); - - // replace the numeric EV value with a text value - final int evIdx = record.getFormat().indexOf(EV_VALUE); - if ( evIdx >= 0 ) { - for ( final CompoundField genotypeVals : record.getGenotypes() ) { - genotypeVals.set(evIdx, EV_VALS[genotypeVals.get(evIdx).asInt()]); - } - } - - // move the SVTYPE to the ALT field (except for MEs) - final InfoField info = record.getInfo(); - final ByteSequence svType = info.get(SVTYPE_KEY); - if ( !record.getAlt().contains(ME_VALUE) ) { - if ( svType != null ) { - record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); - } - } - record.setRef(N_VALUE); - - // move varGQ info field to quality column - final ByteSequence varGQ = info.get(VARGQ_KEY); - if ( varGQ != null ) { - record.setQuality(varGQ); - info.remove(VARGQ_KEY); - } - - // remove MULTIALLELIC flag, if present - info.remove(MULTIALLELIC_KEY); - - // remove UNRESOLVED flag and add it as a filter - if ( info.containsKey(UNRESOLVED_KEY) ) { - record.getFilter().add(UNRESOLVED_KEY); - info.remove(UNRESOLVED_KEY); - } - - // mark noisy events - if ( noisyEvents.contains(record.getID()) ) { - record.getInfo().put(HIGH_SR_BACKGROUND_KEY, null); - } - - // mark bothsides support - if ( bothsidesSupportEvents.contains(record.getID()) ) { - record.getInfo().put(BOTHSIDES_SUPPORT_KEY, null); - } - - // fix genotypes on allosomes - final boolean isY; - if ( (isY = yChrName.equals(record.getChromosome())) || - xChrName.equals(record.getChromosome())) { - final List genotypes = record.getGenotypes(); - final int rdCNIndex = record.getFormat().indexOf(RDCN_VALUE); - final ByteSequence end = info.get(END_KEY); - boolean adjustMale = false; - final boolean isDel; - if ( ((isDel = DEL_VALUE.equals(svType)) || DUP_VALUE.equals(svType)) && rdCNIndex >= 0 && end != null && - end.asInt() + 1 - record.getPosition() > MIN_ALLOSOME_EVENT_SIZE ) { - adjustMale = isRevisableEvent(genotypes, rdCNIndex, sexForSample, isY); - if ( adjustMale ) { - record.getID().write(osRevEvents); - osRevEvents.write('\n'); - } - } - CompoundField emptyGenotype = null; - final int nSamples = genotypes.size(); - for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { - final int sampleSex = sexForSample[sampleIdx]; - final CompoundField genotype = genotypes.get(sampleIdx); - if ( sampleSex == 1 ) { - if ( adjustMale ) { - final ByteSequence rdCN = genotype.get(rdCNIndex); - if ( rdCN.equals(MISSING_VALUE) ) { - continue; - } - final int rdCNVal = rdCN.asInt(); - genotype.set(rdCNIndex, new ByteSequence(Integer.toString(rdCNVal + 1))); - if ( isDel ) { - if ( rdCNVal >= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 0 ) genotype.set(0, GT_REF_ALT); - } else { - if ( rdCNVal <= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 2 ) genotype.set(0, GT_REF_ALT); - else genotype.set(0, GT_ALT_ALT); - } - } - } else if ( sampleSex == 2 ) { - if ( isY ) { - if ( emptyGenotype == null ) { - emptyGenotype = new CompoundField(MISSING_GENOTYPE, ':'); - int nFields = genotype.size(); - while ( --nFields > 0 ) { - emptyGenotype.add(MISSING_VALUE); - } - emptyGenotype.getValue(); // performance hack to put the pieces together - } - genotypes.set(sampleIdx, emptyGenotype); - } - } else { - genotype.set(0, MISSING_GENOTYPE); - } - } - } - - record.write(os); - } - } catch ( final IOException ioe ) { - throw new RuntimeException("Can't write to stdout", ioe); - } - } - - private static boolean isRevisableEvent( final List genotypes, - 
-                                             final int rdCNIndex,
-                                             final int[] sexForColumn,
-                                             final boolean isY ) {
-        // We're going to calculate the median rdCN values for males and females.
-        // We only care if the median is 0, 1, 2, or something larger, so we'll use 4 bins to
-        // sum up the counts: all values >2 go into the last bucket.
-        final int[] maleCounts = new int[4];
-        final int[] femaleCounts = new int[4];
-        final int nSamples = genotypes.size();
-        for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) {
-            final ByteSequence rdCN = genotypes.get(sampleIdx).get(rdCNIndex);
-            if ( MISSING_VALUE.equals(rdCN) ) {
-                continue;
-            }
-            int rdCNVal = rdCN.asInt();
-            if ( rdCNVal > 2 ) {
-                rdCNVal = 3;
-            }
-            final int sampleSex = sexForColumn[sampleIdx];
-            if ( sampleSex == 1 ) {
-                maleCounts[rdCNVal] += 1;
-            } else if ( sampleSex == 2 ) {
-                femaleCounts[rdCNVal] += 1;
-            }
-        }
-        final double maleMedian = calcMedian(maleCounts);
-        double femaleMedian = calcMedian(femaleCounts);
-        return maleMedian == 1. && (isY ? femaleMedian == 0. : femaleMedian == 2.);
-    }
-
-    // visible for testing
-    static double calcMedian( final int[] counts ) {
-        final double target = (counts[0] + counts[1] + counts[2] + counts[3]) / 2.;
-        if ( target == 0. ) {
-            return Double.NaN;
-        }
-        int total = 0;
-        for ( int iii = 0; iii < 4; ++iii ) {
-            total += counts[iii];
-            if ( total == target ) {
-                return iii + .5;
-            } else if ( total > target ) {
-                return (double)iii;
-            }
-        }
-        throw new IllegalStateException("we should never reach this statement");
-    }
-
-    private static Set<ByteSequence> readLastColumn( final String filename ) {
-        final Set<ByteSequence> values = new HashSet<>();
-        try {
-            final BufferedReader neRdr =
-                    new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
-            String line;
-            while ( (line = neRdr.readLine()) != null ) {
-                final String lastCol = line.substring(line.lastIndexOf('\t') + 1);
-                values.add(new ByteSequence(lastCol));
-            }
-        } catch ( final IOException ioe ) {
-            throw new RuntimeException("can't read table file " + filename);
-        }
-        return values;
-    }
-
-    private static int[] readPedFile( final String pedFilename, List<ByteSequence> sampleNames ) {
-        final int nCols = sampleNames.size() - 9;
-        final Map<ByteSequence, Integer> sexForSampleMap = new HashMap<>(2*nCols);
-        final int[] sexForSample = new int[nCols];
-        try {
-            final BufferedReader pedRdr =
-                    new BufferedReader(new InputStreamReader(new FileInputStream(pedFilename)));
-            final Pattern tabPattern = Pattern.compile("\\t");
-            String line;
-            while ( (line = pedRdr.readLine()) != null ) {
-                if ( line.startsWith("#") ) continue;
-                final Scanner scanner = new Scanner(line).useDelimiter(tabPattern);
-                scanner.next(); // family ignored
-                final String sampleName = scanner.next();
-                scanner.next(); // mom ignored
-                scanner.next(); // pop ignored
-                final int sex = scanner.nextInt();
-                sexForSampleMap.put(new ByteSequence(sampleName), sex);
-            }
-        } catch ( final IOException ioe ) {
-            throw new RuntimeException("can't read " + pedFilename, ioe);
-        }
-        for ( int col = 0; col < nCols; ++col ) {
-            final ByteSequence sampleName = sampleNames.get(col + 9);
-            final Integer sex = sexForSampleMap.get(sampleName);
-            if ( sex == null ) {
-                throw new RuntimeException("can't determine sex for sample " + sampleName);
-            }
-            sexForSample[col] = sex;
-        }
-        return sexForSample;
-    }
-}
diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java
deleted file mode 100644
index 77a6b5658..000000000
--- 
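The binned-median trick in CleanVCFPart1.isRevisableEvent above (exercised by the unit test deleted just below) is compact enough to misread. A hedged Python transcription, assuming the same four-bin convention (counts of RD_CN values 0, 1, 2, and >2):

```python
import math

def calc_median(counts: list) -> float:
    """Median over four bins holding counts of RD_CN values 0, 1, 2, and >2."""
    target = sum(counts) / 2.0
    if target == 0.0:
        return math.nan
    total = 0
    for i, c in enumerate(counts):
        total += c
        if total == target:
            return i + 0.5  # median straddles two bins
        if total > target:
            return float(i)
    raise AssertionError("unreachable for non-empty counts")

# An X-chromosome DEL/DUP is revisable when the male median is 1 and the female
# median is 2 (on Y, the female median must be 0 instead).
assert calc_median([0, 10, 0, 0]) == 1.0  # typical males on chrX
assert calc_median([0, 0, 10, 0]) == 2.0  # typical females on chrX
assert calc_median([5, 5, 0, 0]) == 0.5   # even split straddles bins 0 and 1
```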
a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.svpipeline; - -public class CleanVCFPart1UnitTest { - public static void main( final String[] args ) { - testAsserts(); - testMedianCalculation(); - System.out.println("OK"); - } - - public static void testAsserts() { - boolean caughtIt = false; - try { - assert(false); - } catch ( final AssertionError ae ) { - caughtIt = true; - } - if ( !caughtIt ) { - throw new AssertionError("assertions aren't turned on, so you're not testing anything."); - } - } - - public static void testMedianCalculation() { - final int[] counts = new int[4]; - assert(Double.isNaN(CleanVCFPart1.calcMedian(counts))); - counts[0] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.0); - counts[1] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.5); - counts[2] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.0); - counts[3] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.5); - counts[2] = 2; - assert(CleanVCFPart1.calcMedian(counts) == 2.0); - counts[3] = 4; - assert(CleanVCFPart1.calcMedian(counts) == 2.5); - counts[3] = 5; - assert(CleanVCFPart1.calcMedian(counts) == 3.0); - } -} diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index ab228ccf7..f58c7f4f1 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -141,6 +141,7 @@ workflow CleanVcf { LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, + gatk_docker="docker.io/broadinstitute/gatk:3eb5c3d38d6c8c65e71f29abe9346c98bfbb1cbe", linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, @@ -167,14 +168,6 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index 085aaa5e5..f4396df52 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -8,7 +8,6 @@ workflow CleanVcf5 { File normal_revise_vcf File revise_vcf_lines File ped_file - File sex_chr_revise File multi_ids File? outlier_samples_list @@ -44,7 +43,6 @@ workflow CleanVcf5 { revise_vcf_lines=revise_vcf_lines, normal_revise_vcf=ScatterVcf.shards[i], ped_file=ped_file, - sex_chr_revise=sex_chr_revise, multi_ids=multi_ids, outlier_samples_list=outlier_samples_list, make_clean_gq_script=make_clean_gq_script, @@ -83,7 +81,6 @@ task MakeCleanGQ { File revise_vcf_lines File normal_revise_vcf File ped_file - File sex_chr_revise File multi_ids File? outlier_samples_list File? 
make_clean_gq_script @@ -96,7 +93,7 @@ task MakeCleanGQ { # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, multi_ids, outlier_samples_list]), "GB") Float base_disk_gb = 10.0 @@ -133,7 +130,6 @@ task MakeCleanGQ { revise.vcf.lines.vcf.gz \ ~{normal_revise_vcf} \ ~{ped_file} \ - ~{sex_chr_revise} \ ~{multi_ids} \ outliers.txt \ ~{prefix} diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index a14ffa8c4..3e5332ad2 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -3,852 +3,851 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "FormatVcfForGatk.wdl" as fvcf -import "CleanVcf1b.wdl" as c1b -import "CleanVcf5.wdl" as c5 import "HailMerge.wdl" as HailMerge workflow CleanVcfChromosome { - input { - File vcf - String contig - File background_list - File ped_file - File allosome_fai - String prefix - Int max_shards_per_chrom_step1 - File bothsides_pass_list - Int min_records_per_shard_step1 - Int samples_per_step2_shard - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard - Int? clean_vcf5_threads_per_task - File? outlier_samples_list - Int? max_samples_per_shard_step3 - - File HERVK_reference - File LINE1_reference - - File ploidy_table - String chr_x - String chr_y - - File? svtk_to_gatk_script # For debugging - - Boolean use_hail - String? gcs_project - - String linux_docker - String sv_base_mini_docker - String sv_pipeline_docker - - # overrides for local tasks - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish - RuntimeAttr? runtime_override_stitch_fragmented_cnvs - RuntimeAttr? runtime_override_final_cleanup - RuntimeAttr? runtime_override_rescue_me_dels - - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - - RuntimeAttr? runtime_override_preconcat_drc - RuntimeAttr? runtime_override_hail_merge_drc - RuntimeAttr? runtime_override_fix_header_drc - - # overrides for MiniTasks - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 - RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs - RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs - RuntimeAttr? runtime_attr_format - - } - - call MiniTasks.SplitVcf as SplitVcfToClean { - input: - vcf=vcf, - contig=contig, - prefix="~{prefix}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf_to_clean - } - - scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { - call CleanVcf1a { - input: - vcf=SplitVcfToClean.vcf_shards[i], - prefix="~{prefix}.clean_vcf_1.shard_~{i}", - background_fail_list=background_list, - bothsides_pass_list=bothsides_pass_list, - ped_file=ped_file, - allosome_fai=allosome_fai, - chr_x=chr_x, - chr_y=chr_y, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_1a - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep1VcfsHail { - input: - vcfs=CleanVcf1a.intermediate_vcf, - prefix="~{prefix}.combine_step_1_vcfs", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix="~{prefix}.combine_step_1_vcfs", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } - } - - call MiniTasks.CatUncompressedFiles as CombineStep1SexChrRevisions { - input: - shards=CleanVcf1a.sex, - outfile_name="~{prefix}.combine_step_1_sex_chr_revisions.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_sex_chr_revisions - } - - call c1b.CleanVcf1b { - input: - intermediate_vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_1b", - records_per_shard=clean_vcf1b_records_per_shard, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_subset_large_cnvs=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs=runtime_override_cat_multi_cnvs_1b - } - - call MiniTasks.SplitUncompressed as SplitIncludeList { - input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - multi_cnvs=CleanVcf1b.multi, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } - } - - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { 
- input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 - } - - call CleanVcf3 { - input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - rd_cn_revise=CleanVcf3.shards[i], - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - call MiniTasks.CatUncompressedFiles as CombineRevised4 { - input: - shards=CleanVcf4.out, - outfile_name="~{prefix}.combine_revised_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_revised_4 - } - - call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { - input: - shards=CleanVcf4.multi_ids, - outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_multi_ids_4 - } - - call c5.CleanVcf5 { - input: - revise_vcf_lines=CombineRevised4.outfile, - normal_revise_vcf=CleanVcf1b.normal, - ped_file=ped_file, - sex_chr_revise=CombineStep1SexChrRevisions.outfile, - multi_ids=CombineMultiIds4.outfile, - outlier_samples_list=outlier_samples_list, - contig=contig, - prefix="~{prefix}.clean_vcf_5", - records_per_shard=clean_vcf5_records_per_shard, - threads_per_task=clean_vcf5_threads_per_task, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, - runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_attr_override_polish=runtime_override_clean_vcf_5_polish - } - - call DropRedundantCnvs { - input: - vcf=CleanVcf5.polished, - prefix="~{prefix}.drop_redundant_cnvs", - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_drop_redundant_cnvs - } - - if (use_hail) { - call HailMerge.HailMerge as SortDropRedundantCnvsHail { - input: - vcfs=[DropRedundantCnvs.out], - prefix="~{prefix}.drop_redundant_cnvs.sorted", - gcs_project=gcs_project, - reset_cnv_gts=true, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_drc, - runtime_override_hail_merge=runtime_override_hail_merge_drc, - runtime_override_fix_header=runtime_override_fix_header_drc - } - } - if (!use_hail) { - call MiniTasks.SortVcf as SortDropRedundantCnvs { - input: - vcf=DropRedundantCnvs.out, - outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_sort_drop_redundant_cnvs - } - } - - call StitchFragmentedCnvs { - input: - vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), - prefix="~{prefix}.stitch_fragmented_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_stitch_fragmented_cnvs - } - - call RescueMobileElementDeletions { - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, 
- sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } - - call FinalCleanup { - input: - vcf=RescueMobileElementDeletions.out, - contig=contig, - prefix="~{prefix}.final_cleanup", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_final_cleanup - } - - call fvcf.FormatVcf { - input: - vcf=FinalCleanup.final_cleaned_shard, - ploidy_table=ploidy_table, - args="--scale-down-gq", - output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format - } - - output { - File out = FormatVcf.out - File out_idx = FormatVcf.out_index - } + input { + File vcf + String contig + File background_list + File ped_file + File allosome_fai + String prefix + Int max_shards_per_chrom_step1 + File bothsides_pass_list + Int min_records_per_shard_step1 + Int samples_per_step2_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task + File? outlier_samples_list + Int? max_samples_per_shard_step3 + + File HERVK_reference + File LINE1_reference + + File ploidy_table + String chr_x + String chr_y + + File? svtk_to_gatk_script # For debugging + + Boolean use_hail + String? gcs_project + + String gatk_docker + String linux_docker + String sv_base_mini_docker + String sv_pipeline_docker + + # overrides for local tasks + RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b + RuntimeAttr? runtime_override_clean_vcf_2 + RuntimeAttr? runtime_override_clean_vcf_3 + RuntimeAttr? runtime_override_clean_vcf_4 + RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_override_stitch_fragmented_cnvs + RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + + # overrides for MiniTasks + RuntimeAttr? runtime_override_split_vcf_to_clean + RuntimeAttr? runtime_override_split_include_list + RuntimeAttr? runtime_override_combine_clean_vcf_2 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs + RuntimeAttr? 
runtime_attr_format + } + + call fvcf.FormatVcf as FormatVcfToClean { + input: + vcf=vcf, + ploidy_table=ploidy_table, + output_prefix="~{prefix}.formatted", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + + call MiniTasks.SplitVcf as SplitVcfToClean { + input: + vcf=FormatVcfToClean.out, + contig=contig, + prefix="~{prefix}.shard_", + n_shards=max_shards_per_chrom_step1, + min_vars_per_shard=min_records_per_shard_step1, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_split_vcf_to_clean + } + + scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { + call CleanVcf1a { + input: + vcf=SplitVcfToClean.vcf_shards[i], + prefix="~{prefix}.clean_vcf_1a.shard_~{i}", + background_fail_list=background_list, + bothsides_pass_list=bothsides_pass_list, + ped_file=ped_file, + allosome_fai=allosome_fai, + chr_x=chr_x, + chr_y=chr_y, + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_1a + } + } + + if (use_hail) { + call HailMerge.HailMerge as CombineStep1VcfsHail { + input: + vcfs=CleanVcf1a.intermediate_vcf, + prefix="~{prefix}.combine_step_1_vcfs", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep1Vcfs { + input: + vcfs=CleanVcf1a.intermediate_vcf, + vcfs_idx=CleanVcf1a.intermediate_vcf_idx, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.combine_step_1_vcfs", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } + } + + call CleanVcf1b { + input: + vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_1b", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_1b + } + + call MiniTasks.SplitUncompressed as SplitIncludeList { + input: + whole_file=CleanVcf1a.include_list[0], + lines_per_shard=samples_per_step2_shard, + shard_prefix="~{prefix}.split_include_list.", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_split_include_list + } + + scatter ( i in range(length(SplitIncludeList.shards)) ){ + call CleanVcf2 { + input: + vcf=CleanVcf1b.out, + prefix="~{prefix}.clean_vcf_2.shard_~{i}", + include_list=SplitIncludeList.shards[i], + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_2 + } + } + + call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { + input: + shards=CleanVcf2.out, + outfile_name="~{prefix}.combine_clean_vcf_2.txt", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_clean_vcf_2 + } + + call CleanVcf3 { + input: + rd_cn_revise=CombineCleanVcf2.outfile, + max_samples_shard = max_samples_per_shard_step3, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_clean_vcf_3 + } + + scatter ( i in range(length(CleanVcf3.shards)) ){ + call CleanVcf4 { + input: + vcf=CleanVcf1b.out, + prefix="~{prefix}.clean_vcf_4.shard_~{i}", + outlier_samples_list=outlier_samples_list, + rd_cn_revise=CleanVcf3.shards[i], + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_4 + } + } + + if (use_hail) { + call HailMerge.HailMerge as CombineStep4VcfsHail { + input: + 
vcfs=CleanVcf4.out, + prefix="~{prefix}.combine_revised_4", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep4Vcfs { + input: + vcfs=CleanVcf4.out, + vcfs_idx=CleanVcf4.out_idx, + naive=true, + generate_index=true, + outfile_prefix="~{prefix}.combine_revised_4", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } + } + + call CleanVcf5 { + input: + vcf=select_first([CombineStep4Vcfs.concat_vcf, CombineStep4VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_5", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_5 + } + + call DropRedundantCnvs { + input: + vcf=CleanVcf5.out, + prefix="~{prefix}.drop_redundant_cnvs", + contig=contig, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_drop_redundant_cnvs + } + + if (use_hail) { + call HailMerge.HailMerge as SortDropRedundantCnvsHail { + input: + vcfs=[DropRedundantCnvs.out], + prefix="~{prefix}.drop_redundant_cnvs.sorted", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_drc, + runtime_override_hail_merge=runtime_override_hail_merge_drc, + runtime_override_fix_header=runtime_override_fix_header_drc + } + } + if (!use_hail) { + call MiniTasks.SortVcf as SortDropRedundantCnvs { + input: + vcf=DropRedundantCnvs.out, + outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_sort_drop_redundant_cnvs + } + } + + call StitchFragmentedCnvs { + input: + vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), + prefix="~{prefix}.stitch_fragmented_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_stitch_fragmented_cnvs + } + + call RescueMobileElementDeletions { + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + + call FinalCleanup { + input: + vcf=RescueMobileElementDeletions.out, + contig=contig, + prefix="~{prefix}.final_cleanup", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_final_cleanup + } + + call fvcf.FormatVcf as FormatVcfToOutput { + input: + vcf=FinalCleanup.final_cleaned_shard, + ploidy_table=ploidy_table, + args="--scale-down-gq", + output_prefix="~{prefix}.final_format", + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + + output { + File out = FormatVcfToOutput.out + File out_idx = FormatVcfToOutput.out_index + } } task CleanVcf1a { - input { - File vcf - String prefix - File background_fail_list - File bothsides_pass_list - File ped_file - File allosome_fai - String chr_x - String chr_y - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - touch ~{prefix}.includelist.txt - touch ~{prefix}.sexchr.revise.txt - - # outputs - # includelist.txt: the names of all the samples in the input vcf - # sexchr.revise.txt: the names of the events where genotypes got tweaked on allosomes - # stdout: a revised vcf - java -jar $CLEAN_VCF_PART_1_JAR \ - ~{vcf} \ - ~{ped_file} \ - ~{chr_x} \ - ~{chr_y} \ - ~{background_fail_list} \ - ~{bothsides_pass_list} \ - ~{prefix}.includelist.txt \ - ~{prefix}.sexchr.revise.txt \ - | bgzip \ - > ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File include_list="~{prefix}.includelist.txt" - File sex="~{prefix}.sexchr.revise.txt" - File intermediate_vcf="~{prefix}.vcf.gz" - File intermediate_vcf_idx="~{prefix}.vcf.gz.tbi" - } + input { + File vcf + String prefix + File background_fail_list + File bothsides_pass_list + File ped_file + File allosome_fai + String chr_x + String chr_y + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + String output_samples_list = "~{prefix}.includelist.txt" + + command <<< + set -euo pipefail + + if [ ! 
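One recurring detail in the new GATK-based tasks introduced here (CleanVcf1a through CleanVcf5): the JVM heap passed via `-Xmx` is sized as `ceil(mem_gb * 1000 * 0.7)`, presumably to leave the remaining ~30% of task memory as headroom for off-heap and native use. A sketch of that arithmetic (the 0.7 factor is this patch's convention, not a GATK requirement):

```python
import math

def java_mem_mb(task_mem_gb: float, heap_fraction: float = 0.7) -> int:
    """Heap size for -Xmx, leaving (1 - heap_fraction) of task memory unreserved."""
    return math.ceil(task_mem_gb * 1000 * heap_fraction)

# The default 3.75 GB task memory yields -Xmx2625m
assert java_mem_mb(3.75) == 2625
```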
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1a \ + -V ~{vcf} \ + -O ~{output_vcf} \ + --fail-list ~{background_fail_list} \ + --pass-list ~{bothsides_pass_list} \ + --chr-X ~{chr_x} \ + --chr-Y ~{chr_y} \ + --output-samples-list ~{output_samples_list} + >>> + + output { + File include_list="~{output_samples_list}" + File intermediate_vcf="~{output_vcf}" + File intermediate_vcf_idx="~{output_vcf}.tbi" + } +} + +task CleanVcf1b { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_vcf = "~{prefix}.vcf.gz" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1b \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } task CleanVcf2 { - input { - File normal_revise_vcf - String prefix - File include_list - File multi_cnvs - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size([normal_revise_vcf, include_list, multi_cnvs], "GB") - Float base_disk_gb = 10.0 - Float input_disk_scale = 3.0 - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - bcftools index ~{normal_revise_vcf} - /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh \ - ~{normal_revise_vcf} \ - ~{include_list} \ - ~{multi_cnvs} \ - "~{prefix}.txt" - >>> - - output { - File out="~{prefix}.txt" - } + input { + File vcf + String prefix + File include_list + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, include_list], "GB") + Float base_disk_gb = 10.0 + Float input_disk_scale = 3.0 + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_revised_list = "~{prefix}.txt" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt2 \ + -V ~{vcf} \ + --sample-list ~{include_list} \ + --output-revised-list ~{output_revised_list} + >>> + + output { + File out="~{output_revised_list}" + } } task CleanVcf3 { - input { - File rd_cn_revise - Int? max_samples_shard - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - Int max_samples_shard_ = select_first([max_samples_shard, 7000]) - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(rd_cn_revise, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} - # Ensure there is at least one shard - touch shards/out.0_0.txt - >>> - - output { - Array[File] shards = glob("shards/*") - } + input { + File rd_cn_revise + Int? max_samples_shard + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Int max_samples_shard_ = select_first([max_samples_shard, 7000]) + Float input_size = size(rd_cn_revise, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} + # Ensure there is at least one shard + touch shards/out.0_0.txt + >>> + + output { + Array[File] shards = glob("shards/*") + } } task CleanVcf4 { - input { - File rd_cn_revise - File normal_revise_vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([rd_cn_revise, normal_revise_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python3 < record_end: - break - num_gt_over_2 = 0 - for sid in record.samples: - s = record.samples[sid] - # Pick best GT - if s['PE_GT'] is None: - continue - elif s['SR_GT'] is None: - gt = s['PE_GT'] - elif s['PE_GT'] > 0 and s['SR_GT'] == 0: - gt = s['PE_GT'] - elif s['PE_GT'] == 0: - gt = s['SR_GT'] - elif s['PE_GQ'] >= s['SR_GQ']: - gt = s['PE_GT'] - else: - gt = s['SR_GT'] - if gt > 2: - num_gt_over_2 += 1 - if num_gt_over_2 > max_vf: - multi_geno_ids.add(record.id) - vcf.close() - - multi_geno_ids = sorted(list(multi_geno_ids)) - with open("~{prefix}.multi_geno_ids.txt", "w") as f: - for vid in multi_geno_ids: - f.write(vid + "\n") - CODE - - bgzip ~{prefix}.revise_vcf_lines.txt - gzip ~{prefix}.multi_geno_ids.txt - >>> - - output { - File out="~{prefix}.revise_vcf_lines.txt.gz" - File multi_ids="~{prefix}.multi_geno_ids.txt.gz" - } + input { + File vcf + String prefix + File rd_cn_revise + File? outlier_samples_list + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, rd_cn_revise], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: 50, + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_vcf = "~{prefix}.vcf.gz" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt4 \ + -V ~{vcf} \ + -O ~{output_vcf} \ + --revised-cn-list ~{rd_cn_revise} \ + ~{if defined(outlier_samples_list) then "--outliers-list ~{outlier_samples_list}" else "" } + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + + +task CleanVcf5 { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    Float input_size = size([vcf], "GB")
+    RuntimeAttr runtime_default = object {
+        mem_gb: 2.0,
+        disk_gb: 50,
+        cpu_cores: 1,
+        preemptible_tries: 3,
+        max_retries: 1,
+        boot_disk_gb: 10
+    }
+    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    runtime {
+        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
+        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        docker: gatk_docker
+        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    }
+
+    String output_vcf = "~{prefix}.vcf.gz"
+    Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7)
+
+    command <<<
+        set -euo pipefail
+
+        if [ ! -f "~{vcf}.tbi" ]; then
+            tabix -p vcf ~{vcf}
+        fi
+
+        gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt5 \
+            -V ~{vcf} \
+            -O ~{output_vcf}
+    >>>
+
+    output {
+        File out="~{output_vcf}"
+        File out_idx="~{output_vcf}.tbi"
+    }
}


task RescueMobileElementDeletions {
-  input {
-    File vcf
-    String prefix
-    File LINE1
-    File HERVK
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcf, "GiB")
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75 + input_size * 1.5,
-    disk_gb: ceil(100.0 + input_size * 3.0),
-    cpu_cores: 1,
-    preemptible_tries: 3,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-  runtime {
-    memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
-    disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-
-    python <<CODE
-    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
-    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
+    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
+    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv

-    python <<CODE
-          if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
-            record.alts = ('<DEL:ME:LINE1>',)
-          if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
-            record.alts = ('<DEL:ME:HERVK>',)
-          fo.write(record)
+        if record.id in hash_MEI_DEL_reset.keys():
+          del record.filter['UNRESOLVED']
+          record.info['SVTYPE'] = 'DEL'
+          record.info['SVLEN'] = record.info['END2'] - record.start
+          record.stop = record.info['END2']
+          record.info.pop("CHR2")
+          record.info.pop("END2")
+          record.info.pop("UNRESOLVED_TYPE")
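+          # Re-tag the rescued record with the mobile-element deletion ALT
+          # allele matching its overlap source (overlap_LINE1 or overlap_HERVK)
+          # recorded in the manual-revise table built by the bedtools step above.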
+          if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
+            record.alts = ('<DEL:ME:LINE1>',)
+          if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
+            record.alts = ('<DEL:ME:HERVK>',)
+          fo.write(record)
 fin.close()
 fo.close()
 CODE
-  >>>
+    >>>
 
-  output {
-    File out = "~{prefix}.vcf.gz"
-  }
+    output {
+        File out = "~{prefix}.vcf.gz"
+    }
 }
 
 # Remove CNVs that are redundant with CPX events or other CNVs
 task DropRedundantCnvs {
-  input {
-    File vcf
-    String prefix
-    String contig
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcf, "GiB")
-  # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor:
-  # in tests on large VCFs, memory usage is ~1.0 * input VCF size
-  # the biggest disk usage is at the end of the task, with input + output VCF on disk
-  Int cpu_cores = 2 # speed up compression / decompression of VCFs
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75 + input_size * 1.5,
-    disk_gb: ceil(100.0 + input_size * 2.0),
-    cpu_cores: cpu_cores,
-    preemptible_tries: 3,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-  runtime {
-    memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
-    disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \
-      ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp
-  >>>
-
-  output {
-    File out = "~{prefix}.vcf.gz"
-  }
+    input {
+        File vcf
+        String prefix
+        String contig
+        String sv_pipeline_docker
+        RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp + >>> + + output { + File out = "~{prefix}.vcf.gz" + } } # Stitch fragmented RD-only calls found in 100% of the same samples task StitchFragmentedCnvs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) - Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) - - runtime { - memory: "~{mem_gb} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - echo "First pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ - | bgzip \ - > tmp.vcf.gz - rm ~{vcf} - echo "Second pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - - output { - File stitched_vcf_shard = "~{prefix}.vcf.gz" - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" + } } # Final VCF cleanup task FinalCleanup { - input { - File vcf - String contig - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 - RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} stdout \ - | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File final_cleaned_shard = "~{prefix}.vcf.gz" - File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" - } + input { + File vcf + String contig + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed
+    # generally assume working memory is ~3 * inputs
+    Float input_size = size(vcf, "GB")
+    Float base_disk_gb = 10.0
+    Float base_mem_gb = 2.0
+    Float input_mem_scale = 3.0
+    Float input_disk_scale = 5.0
+    RuntimeAttr runtime_default = object {
+        mem_gb: base_mem_gb + input_size * input_mem_scale,
+        disk_gb: ceil(base_disk_gb + input_size * input_disk_scale),
+        cpu_cores: 1,
+        preemptible_tries: 3,
+        max_retries: 1,
+        boot_disk_gb: 10
+    }
+    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    runtime {
+        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
+        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        docker: sv_pipeline_docker
+        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    }
+
+    command <<<
+        set -eu -o pipefail
+
+        /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \
+            --chrom ~{contig} \
+            --prefix ~{prefix} \
+            ~{vcf} stdout \
+            | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz
+        tabix ~{prefix}.vcf.gz
+    >>>
+
+    output {
+        File final_cleaned_shard = "~{prefix}.vcf.gz"
+        File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi"
+    }
}
\ No newline at end of file

From 7eaa215ba3f6bb1ce1bea123be76e4d3ef239979 Mon Sep 17 00:00:00 2001
From: Karan Jaisingh 
Date: Thu, 31 Oct 2024 21:21:09 -0400
Subject: [PATCH 03/40] Further changes - reverted changes to 1b, removed 5

---
 wdl/CleanVcf1b.wdl | 353 ---------------------------------------------
 wdl/CleanVcf5.wdl  |   6 +-
 2 files changed, 5 insertions(+), 354 deletions(-)
 delete mode 100644 wdl/CleanVcf1b.wdl

diff --git a/wdl/CleanVcf1b.wdl b/wdl/CleanVcf1b.wdl
deleted file mode 100644
index 691d0591c..000000000
--- a/wdl/CleanVcf1b.wdl
+++ /dev/null
@@ -1,353 +0,0 @@
-version 1.0
-
-import "Structs.wdl"
-import "CleanVcf5.wdl" as CleanVcf5
-import "TasksMakeCohortVcf.wdl" as MiniTasks
-
-workflow CleanVcf1b {
-  input {
-    File intermediate_vcf
-    String prefix
-    Int records_per_shard
-
-    String sv_pipeline_docker
-    String sv_base_mini_docker
-
-    RuntimeAttr? runtime_attr_override_subset_large_cnvs
-    RuntimeAttr? runtime_attr_override_sort_bed
-    RuntimeAttr? runtime_attr_override_intersect_bed
-    RuntimeAttr? runtime_attr_override_build_dict
-    RuntimeAttr? runtime_attr_override_scatter
-    RuntimeAttr? runtime_attr_override_filter_vcf
-    RuntimeAttr? runtime_override_concat_vcfs
-    RuntimeAttr? 
runtime_override_cat_multi_cnvs - } - - call SubsetLargeCNVs { - input: - vcf=intermediate_vcf, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call Vcf2Bed { - input: - vcf=SubsetLargeCNVs.out, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call SortBed { - input: - bed=Vcf2Bed.out, - prefix="~{prefix}.subset_large_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_sort_bed - } - - call BedtoolsIntersect { - input: - bed=SortBed.out, - prefix="~{prefix}.bedtools_intersect", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_intersect_bed - } - - call BuildGenoNormalReviseDictionary { - input: - filtered_vcf=SubsetLargeCNVs.out, - intersect_bed=BedtoolsIntersect.out, - prefix="~{prefix}.geno_normal_revise", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_build_dict - } - - call MiniTasks.ScatterVcf { - input: - vcf=intermediate_vcf, - records_per_shard=records_per_shard, - prefix="~{prefix}.scatter_vcf", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_scatter - } - - scatter ( i in range(length(ScatterVcf.shards)) ) { - call FilterVcf { - input: - intermediate_vcf=ScatterVcf.shards[i], - dictionary_json_gz=BuildGenoNormalReviseDictionary.out, - prefix="~{prefix}.filter_vcf.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_filter_vcf - } - } - - call MiniTasks.ConcatVcfs as ConcatCleanVcf1bShards { - input: - vcfs=FilterVcf.out, - naive=true, - sort_vcf_list=true, - outfile_prefix="~{prefix}.concat_vcfs", - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_concat_vcfs - } - - call MiniTasks.CatUncompressedFiles as ConcatMultiCnvs { - input: - shards=FilterVcf.multi_cnvs, - outfile_name="~{prefix}.multi.cnvs.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_cat_multi_cnvs - } - - output { - File normal = ConcatCleanVcf1bShards.concat_vcf - File multi = ConcatMultiCnvs.outfile - } -} - -task SubsetLargeCNVs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --no-version \ - -i '(INFO/SVTYPE=="DEL" || INFO/SVTYPE=="DUP") && INFO/SVLEN>=5000' \ - ~{vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - output { - File out = "~{prefix}.vcf.gz" - } -} - -task Vcf2Bed { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - svtk vcf2bed --no-header ~{vcf} stdout \ - | awk -F'\t' -v OFS='\t' '{if ($6=="") $6="blanksample";print $0}' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task SortBed { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - mkdir tmp - zcat ~{bed} \ - | sort -T tmp -k1,1 -k2,2n \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BedtoolsIntersect { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bedtools intersect -sorted -wa -wb -a <(zcat ~{bed}) -b <(zcat ~{bed}) \ - | awk -F'\t' -v OFS='\t' '$4!=$10 && $5!=$11' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BuildGenoNormalReviseDictionary { - input { - File filtered_vcf - File intersect_bed - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([filtered_vcf, intersect_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py ~{filtered_vcf} ~{intersect_bed} \ - | gzip -1 \ - > ~{prefix}.json.gz - >>> - output { - File out = "~{prefix}.json.gz" - } -} - -task FilterVcf { - input { - File intermediate_vcf - File dictionary_json_gz - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([intermediate_vcf, dictionary_json_gz], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py ~{dictionary_json_gz} ~{intermediate_vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - mv multi.cnvs.txt ~{prefix}.multi.cnvs.txt - >>> - output { - File out = "~{prefix}.vcf.gz" - File multi_cnvs = "~{prefix}.multi.cnvs.txt" - } -} diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index f4396df52..085aaa5e5 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -8,6 +8,7 @@ workflow CleanVcf5 { File normal_revise_vcf File revise_vcf_lines File ped_file + File sex_chr_revise File multi_ids File? outlier_samples_list @@ -43,6 +44,7 @@ workflow CleanVcf5 { revise_vcf_lines=revise_vcf_lines, normal_revise_vcf=ScatterVcf.shards[i], ped_file=ped_file, + sex_chr_revise=sex_chr_revise, multi_ids=multi_ids, outlier_samples_list=outlier_samples_list, make_clean_gq_script=make_clean_gq_script, @@ -81,6 +83,7 @@ task MakeCleanGQ { File revise_vcf_lines File normal_revise_vcf File ped_file + File sex_chr_revise File multi_ids File? outlier_samples_list File? 
make_clean_gq_script @@ -93,7 +96,7 @@ task MakeCleanGQ { # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, multi_ids, outlier_samples_list]), + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), "GB") Float base_disk_gb = 10.0 @@ -130,6 +133,7 @@ task MakeCleanGQ { revise.vcf.lines.vcf.gz \ ~{normal_revise_vcf} \ ~{ped_file} \ + ~{sex_chr_revise} \ ~{multi_ids} \ outliers.txt \ ~{prefix} From 1ad3c1c9755d0c334f04e4db977b4289539197e7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 21:24:14 -0400 Subject: [PATCH 04/40] More files removed... --- .../scripts/clean_vcf_part1b_build_dict.py | 154 ---------- .../scripts/clean_vcf_part1b_filter.py | 82 ------ ..._vcf_part5_find_redundant_multiallelics.py | 60 ---- .../scripts/clean_vcf_part5_update_records.py | 191 ------------- wdl/CalcAF.wdl | 1 - wdl/CleanVcf5.wdl | 265 ------------------ 6 files changed, 753 deletions(-) delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py delete mode 100644 wdl/CleanVcf5.wdl diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py deleted file mode 100644 index b7da153cb..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import logging -import pybedtools -import pysam -import sys -import json - -from collections import defaultdict - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "blanksample" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -class VCFReviser: - def __init__(self): - self.rd_cn = {} - self.sample_indices_dict = {} - self.sample_list = [] - - def _update_rd_cn(self, variant, sample_indices): - self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} - - @staticmethod - def get_wider(f): - # f[1] : first interval start - # f[2] : first interval end - # f[7] : second interval start - # f[8] : second interval end - if int(f[2]) - int(f[1]) >= int(f[8]) - int(f[7]): - return f[0:6], f[6:12] - else: - return f[6:12], f[0:6] - - @staticmethod - def get_coverage(wider, narrower): - n_start = int(narrower[1]) - n_stop = int(narrower[2]) - w_start = int(wider[1]) - w_stop = int(wider[2]) - - coverage = 0 - if w_start <= n_stop and n_start <= w_stop: - intersection_size = min(n_stop, w_stop) - max(n_start, w_start) - coverage = intersection_size / (n_stop - n_start) - return coverage - - def get_geno_normal_revise(self, vcf_file, bed_file): - overlap_test_text = defaultdict(dict) - with pysam.VariantFile(vcf_file, "r") as f: - header = f.header - i = -1 - for sample in header.samples: - i += 1 - self.sample_indices_dict[sample] = i - 
self.sample_list.append(sample) - - logging.info("Filtering intersect results") - bed = pybedtools.BedTool(bed_file) - for interval in bed.intervals: - wider, narrower = self.get_wider(interval.fields) - # wider and narrower are lists/tuples with the following fields: - # [0] : contig - # [1] : start position - # [2] : end position - # [3] : variant ID - # [4] : SV type - # [5] : comma-delimited sample lists, or BLANK_SAMPLES if none - if wider[5] == BLANK_SAMPLES: - continue - - coverage = self.get_coverage(wider, narrower) - if coverage >= 0.5: - wider_samples = set(wider[5].split(",")) - narrower_samples = set(narrower[5].split(",")) - non_common_samples = [self.sample_indices_dict[s] for s in wider_samples - narrower_samples] - for x in non_common_samples: - vid = narrower[3] - overlap_test_text[vid][x] = (wider[3], wider[4]) - - # Determine for which vid/sample pairs we need RD_CN - # Substantially reduces memory - logging.info('Getting revised variant IDs') - revise_vids = defaultdict(set) - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - if v[1] == SVType.DUP or v[1] == SVType.DEL: - revise_vids[var_id].add(sample_index) - revise_vids[v[0]].add(sample_index) - - logging.info('Getting RD_CN/EV') - for variant in f: - if variant.id in revise_vids: - sample_indices = revise_vids[variant.id] - self._update_rd_cn(variant, sample_indices) - - logging.info('Generating geno_normal_revise_dict') - geno_normal_revise_dict = {} - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - new_val = None - if sample_index not in revise_vids[v[0]]: - sys.stderr.write("{} {}\n".format(sample_index, v[0])) - if v[1] == SVType.DUP and \ - self.rd_cn[var_id][sample_index] == 2 and \ - self.rd_cn[v[0]][sample_index] == 3: - new_val = 1 - elif v[1] == SVType.DEL and \ - self.rd_cn[var_id][sample_index] == 2 \ - and self.rd_cn[v[0]][sample_index] == 1: - new_val = 3 - - if new_val: - if var_id not in geno_normal_revise_dict: - geno_normal_revise_dict[var_id] = {} - sample_id = self.sample_list[sample_index] - geno_normal_revise_dict[var_id][sample_id] = new_val - - return geno_normal_revise_dict - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - reviser = VCFReviser() - filtered_vcf = args[1] - intersected_bed = args[2] - geno_normal_revise_dict = reviser.get_geno_normal_revise(filtered_vcf, intersected_bed) - logging.info('Dumping dictionary') - sys.stdout.write(json.dumps(geno_normal_revise_dict)) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py deleted file mode 100644 index e63b890cd..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import os -import logging -import pysam -import sys -from pathlib import Path -import json -import gzip - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "B" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -def 
modify_variants(dict_file_gz, vcf, multi_cnvs): - logging.info('Loading dictionary') - with gzip.open(dict_file_gz, 'rt') as f: - geno_normal_revise_dict = json.load(f) - - logging.info('Filtering variants') - with pysam.VariantFile(vcf, "r") as f_in: - header = f_in.header - sys.stdout.write(str(header)) - with open(multi_cnvs, "w") as multi_cnvs_f: - variants = f_in.fetch() - for variant in variants: - if variant.id in geno_normal_revise_dict: - for sample_id in geno_normal_revise_dict[variant.id]: - o = variant.samples[sample_id] - o.update({"GT": (0, 1)}) - o.update({"GQ": o["RD_GQ"]}) - - if variant.stop - variant.start >= 1000: - if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: - is_del = variant.info[SVTYPE] == SVType.DEL - for k, v in variant.samples.items(): - rd_cn = v[VariantFormatTypes.RD_CN] - if rd_cn is None: - continue - if (is_del and rd_cn > 3) or \ - (not is_del and (rd_cn < 1 or rd_cn > 4)): - multi_cnvs_f.write(variant.id + "\n") - break - - sys.stdout.write(str(variant)) - - -def ensure_file(filename): - filename = os.path.join(".", filename) - filename = Path(filename) - if filename.exists(): - os.remove(filename) - return filename.name - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - multi_cnvs_filename = ensure_file("multi.cnvs.txt") - dict_file_gz = args[1] - vcf_file = args[2] - modify_variants(dict_file_gz, vcf_file, multi_cnvs_filename) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py deleted file mode 100755 index ad2b744a5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -import argparse -import sys -import svtk.utils as svu - - -def process_features_for_size1(features_for_size1, redundant_multiallelics): - for intersection in sorted(features_for_size1, key=lambda x: int(x[9]) - int(x[8]), reverse=True): - b_len = int(intersection.fields[9]) - int(intersection.fields[8]) - overlap = int(intersection.fields[14]) - small_coverage = overlap / b_len - if small_coverage > 0.50: - if intersection.fields[3] not in redundant_multiallelics: - redundant_multiallelics.add(intersection.fields[10]) - - -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('multiallelic_filename') - parser.add_argument('fout') - args = parser.parse_args() - - print("finding redundant overlapping sites", file=sys.stderr) - multiallelic_bed = svu.vcf2bedtool(args.multiallelic_filename, include_filters=True) - - redundant_multiallelics = set() - # feature fields: - # [1] : first interval start - # [2] : first interval end - # [3] : first interval variant ID - # [8] : second interval start - # [9] : second interval end - # [10] : second interval variant ID - self_inter = multiallelic_bed.intersect(multiallelic_bed, wo=True)\ - .filter(lambda feature: feature[3] != feature[10]) \ - .filter(lambda feature: (int(feature[2]) - int(feature[1])) >= (int(feature[9]) - int(feature[8]))) \ - .sort(sizeD=True) - current_size1 = -1 - features_for_size1 = [] - for feature in self_inter: - size1 = int(feature[2]) - int(feature[1]) - if size1 != current_size1: - 
process_features_for_size1(features_for_size1, redundant_multiallelics)
-            features_for_size1 = []
-
-        current_size1 = size1
-        features_for_size1.append(feature)
-
-    process_features_for_size1(features_for_size1, redundant_multiallelics)
-    print("identified {} redundant multiallelic sites".format(len(redundant_multiallelics)), file=sys.stderr)
-    with open(args.fout, "w") as list_file:
-        for vid in redundant_multiallelics:
-            print(vid, file=list_file)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py
deleted file mode 100755
index 51675b5ab..000000000
--- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-from collections import Counter
-import gzip
-import pysam
-import sys
-import svtk.utils as svu
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('revise_vcf_lines', type=argparse.FileType('r'))
-    parser.add_argument('normal_revise_vcf')
-    parser.add_argument('famfile', type=argparse.FileType('r'))
-    parser.add_argument('sexchr_revise')
-    parser.add_argument('multi_geno_ids_txt')
-    parser.add_argument('outlier_samples_list', type=argparse.FileType('r'))
-    parser.add_argument('out_prefix')
-    parser.add_argument('--threads_per_file', required=False, default=2, type=int)
-    args = parser.parse_args()
-
-    # load the revised lines and index by ID
-    with pysam.VariantFile(args.revise_vcf_lines, threads=args.threads_per_file) as revise_vcf:
-        header2 = revise_vcf.header
-        revised_lines_by_id = {record.id: record for record in revise_vcf}
-    print("loaded {} revised lines".format(len(revised_lines_by_id)), file=sys.stderr)
-
-    outlier_samples = set([line.rstrip() for line in args.outlier_samples_list if not line.isspace()])
-    print("loaded {} outlier samples".format(len(outlier_samples)), file=sys.stderr)
-
-    male_samples = set()
-    for line in args.famfile:
-        if line.isspace():
-            continue
-        fields = line.rstrip().split("\t")
-        if fields[4] == '1':
-            male_samples.add(fields[1])
-    print("identified {} male samples".format(len(male_samples)), file=sys.stderr)
-
-    if args.sexchr_revise.endswith(".gz"):
-        sexchr_revise = {line.rstrip() for line in gzip.open(args.sexchr_revise, 'rt')}
-    else:
-        sexchr_revise = {line.rstrip() for line in open(args.sexchr_revise, 'rt')}
-    print("{} sites to revise on sex chromosomes".format(len(sexchr_revise)), file=sys.stderr)
-
-    if args.multi_geno_ids_txt.endswith(".gz"):
-        multi_geno_ids = {line.rstrip() for line in gzip.open(args.multi_geno_ids_txt, 'rt')}
-    else:
-        multi_geno_ids = {line.rstrip() for line in open(args.multi_geno_ids_txt, 'rt')}
-    print("{} multiallelic sites".format(len(multi_geno_ids)), file=sys.stderr)
-
-    NEW_HEADER_LINES = ['##ALT=<ID=CNV,Description="Copy Number Polymorphism">',
-                        '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Predicted copy state">',
-                        '##FORMAT=<ID=CNQ,Number=1,Type=Integer,Description="Read-depth genotype quality">',
-                        '##INFO=<ID=PESR_GT_OVERDISPERSION,Number=0,Type=Flag,Description="High PESR dispersion count">',
-                        '##FILTER=<ID=MULTIALLELIC,Description="Multiallelic site">']
-
-    with pysam.VariantFile(args.normal_revise_vcf) as normal_vcf:
-
-        # # Add metadata lines for annotations
-        header1 = normal_vcf.header
-
-        for f in NEW_HEADER_LINES:
-            header1.add_line(f)
-            header2.add_line(f)
-
-        non_outlier_samples = {s for s in header1.samples if s not in outlier_samples}
-        vf_1 = max(len(non_outlier_samples) * 0.01, 2)
-
-        biallelic_gts = {(1, 1), (0, 0), (0, 1), (None, None)}
-
-        print("reformatting records", file=sys.stderr)
-        cleangq_filename = args.out_prefix + 
".cleanGQ.vcf.gz" - multiallelic_filename = args.out_prefix + ".multiallelic.vcf.gz" - no_variant_samples_list_file = args.out_prefix + ".no_called_samples.list" - - with pysam.VariantFile(cleangq_filename, 'w', header=normal_vcf.header, threads=args.threads_per_file) as cleanqg_out, \ - pysam.VariantFile(multiallelic_filename, 'w', header=normal_vcf.header) as multiallelic_out, \ - open(no_variant_samples_list_file, 'w') as no_variant_samples_out: - for idx, record in enumerate(normal_vcf): - multi_del = False - multi_dup = False - gt4_copystate = False - gt5kb_dup = False - gt5kb_del = False - if (idx - 1) % 1000 == 0: - print("processed {} records".format(idx), file=sys.stderr) - if record.id in revised_lines_by_id: - record = revised_lines_by_id[record.id] - if record.info.get('SVTYPE', None) == 'DEL': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: - multi_del = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_del = True - if abs(record.stop - record.pos) >= 5000: - if not multi_del: - gt5kb_del = True - - if record.info.get('SVTYPE', None) == 'DUP': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: - multi_dup = True - if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: - gt4_copystate = True - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and - (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1: - multi_dup = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_dup = True - if abs(record.stop - record.pos) >= 5000: - if not multi_dup: - gt5kb_dup = True - - if gt5kb_del: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1): - sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN 0 DEL - - if gt5kb_dup: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3): - sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP - - if record.id in multi_geno_ids: - record.info['PESR_GT_OVERDISPERSION'] = True - - if multi_del or multi_dup: - record.filter.add('MULTIALLELIC') - for j, sample in enumerate(record.samples): - record.samples[sample]['GT'] = None - record.samples[sample]['GQ'] = None - record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] - record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] - - if len(record.filter) > 1 and 'PASS' in record.filter: - del record.filter['PASS'] - - if 'MULTIALLELIC' in 
record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts):
-                    record.alts = ('<CNV>',)
-                    record.info['SVTYPE'] = 'CNV'
-
-                if record.id in sexchr_revise:
-                    for sample in record.samples:
-                        if sample in male_samples:
-                            cn = record.samples[sample]['RD_CN']
-                            if cn is not None and int(cn) > 0:
-                                cn = int(cn)
-                                record.samples[sample]['RD_CN'] = cn - 1
-                                if 'CN' in record.samples[sample]:
-                                    record.samples[sample]['CN'] = cn - 1  # the old script didn't do this but I think it should
-
-                cleanqg_out.write(record)
-
-                if 'MULTIALLELIC' in record.filter:
-                    multiallelic_out.write(record)
-
-                if len(svu.get_called_samples(record)) == 0:
-                    print(record.id, file=no_variant_samples_out)
-
-    print("done", file=sys.stderr)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl
index cbc124e2a..064c3b28a 100644
--- a/wdl/CalcAF.wdl
+++ b/wdl/CalcAF.wdl
@@ -1,7 +1,6 @@
 version 1.0
 
 import "Structs.wdl"
-import "CleanVcf5.wdl" as cleanvcf5
 import "TasksMakeCohortVcf.wdl" as tmc
 
 workflow CalcAF {
diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl
deleted file mode 100644
index 085aaa5e5..000000000
--- a/wdl/CleanVcf5.wdl
+++ /dev/null
@@ -1,265 +0,0 @@
-version 1.0
-
-import "Structs.wdl"
-import "TasksMakeCohortVcf.wdl" as tasks
-
-workflow CleanVcf5 {
-  input {
-    File normal_revise_vcf
-    File revise_vcf_lines
-    File ped_file
-    File sex_chr_revise
-    File multi_ids
-    File? outlier_samples_list
-
-    String prefix
-    String contig
-    Int records_per_shard
-
-    File? make_clean_gq_script
-    File? find_redundant_sites_script
-
-    String sv_base_mini_docker
-    String sv_pipeline_docker
-
-    Int? threads_per_task
-    RuntimeAttr? runtime_attr_override_scatter
-    RuntimeAttr? runtime_attr_override_make_cleangq
-    RuntimeAttr? runtime_attr_override_find_redundant_multiallelics
-    RuntimeAttr? runtime_attr_override_polish
-  }
-
-  call tasks.ScatterVcf {
-    input:
-      vcf=normal_revise_vcf,
-      records_per_shard = records_per_shard,
-      prefix = "~{prefix}.scatter_vcf",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_scatter
-  }
-
-  scatter ( i in range(length(ScatterVcf.shards)) ) {
-    call MakeCleanGQ {
-      input:
-        revise_vcf_lines=revise_vcf_lines,
-        normal_revise_vcf=ScatterVcf.shards[i],
-        ped_file=ped_file,
-        sex_chr_revise=sex_chr_revise,
-        multi_ids=multi_ids,
-        outlier_samples_list=outlier_samples_list,
-        make_clean_gq_script=make_clean_gq_script,
-        prefix="~{prefix}.make_clean_gq.shard_~{i}",
-        sv_pipeline_docker=sv_pipeline_docker,
-        runtime_attr_override=runtime_attr_override_make_cleangq
-    }
-  }
-
-  call FindRedundantMultiallelics {
-    input:
-      multiallelic_vcfs=MakeCleanGQ.multiallelic_vcf,
-      find_redundant_sites_script=find_redundant_sites_script,
-      prefix="~{prefix}.find_redundant_multiallelics",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_find_redundant_multiallelics
-  }
-
-  call Polish {
-    input:
-      clean_gq_vcfs=MakeCleanGQ.clean_gq_vcf,
-      no_sample_lists=MakeCleanGQ.no_sample_list,
-      redundant_multiallelics_list=FindRedundantMultiallelics.redundant_multiallelics_list,
-      prefix="~{prefix}.polish",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_polish
-  }
-
-  output {
-    File polished=Polish.polished
-  }
-}
-
-task MakeCleanGQ {
-  input {
-    File revise_vcf_lines
-    File normal_revise_vcf
-    File ped_file
-    File sex_chr_revise
-    File multi_ids
-    File? outlier_samples_list
-    File? make_clean_gq_script
-    String prefix
-    Int? threads = 2
-    String sv_pipeline_docker
-    RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), - "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - - # put the revise lines into a normal VCF format - bcftools view -h ~{normal_revise_vcf} > header.txt - cat header.txt <(zcat ~{revise_vcf_lines} | grep . | tr " " "\t") | bgzip -c > revise.vcf.lines.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py" make_clean_gq_script} \ - --threads_per_file ~{threads} \ - revise.vcf.lines.vcf.gz \ - ~{normal_revise_vcf} \ - ~{ped_file} \ - ~{sex_chr_revise} \ - ~{multi_ids} \ - outliers.txt \ - ~{prefix} - - bcftools view -G -O z ~{prefix}.multiallelic.vcf.gz > ~{prefix}.multiallelic.sites.vcf.gz - tabix ~{prefix}.cleanGQ.vcf.gz - >>> - - output { - File clean_gq_vcf=prefix + ".cleanGQ.vcf.gz" - File clean_gq_vcf_idx=prefix + ".cleanGQ.vcf.gz.tbi" - File multiallelic_vcf=prefix + ".multiallelic.sites.vcf.gz" - File no_sample_list = prefix + ".no_called_samples.list" - } -} - -task FindRedundantMultiallelics { - input { - Array[File] multiallelic_vcfs - File? find_redundant_sites_script - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(multiallelic_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - VCFS="~{write_lines(multiallelic_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - bcftools concat --no-version --output-type z --file-list vcfs_sorted.list --output multiallelic.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py" find_redundant_sites_script} \ - multiallelic.vcf.gz \ - ~{prefix}.list - - >>> - - output { - File redundant_multiallelics_list="~{prefix}.list" - } -} - - -task Polish { - input { - Array[File] clean_gq_vcfs - Array[File] no_sample_lists - File redundant_multiallelics_list - String prefix - String sv_pipeline_docker - Int threads = 2 - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(clean_gq_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 4, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - VCFS="~{write_lines(clean_gq_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - cat ~{redundant_multiallelics_list} ~{sep=" " no_sample_lists} > ids_to_remove.list - bcftools concat --no-version --output-type u --file-list vcfs_sorted.list | \ - bcftools view --no-version \ - --exclude 'ID=@ids_to_remove.list' \ - --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} - - # do the last bit of header cleanup - bcftools view -h polished.need_reheader.vcf.gz > original_header.vcf - cat original_header.vcf | fgrep '##fileformat' > new_header.vcf - cat original_header.vcf \ - | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf - # Don't sort contigs lexicographically, which would result in incorrect chr1, chr10, chr11, ... ordering - cat original_header.vcf | fgrep '##contig' >> new_header.vcf - cat original_header.vcf | fgrep '#CHROM' >> new_header.vcf - bcftools reheader polished.need_reheader.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz - >>> - - output { - File polished="~{prefix}.vcf.gz" - } -} From db6b9b8c4146ec0bf850455ffaa261f0d8abb637 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 11:45:23 -0400 Subject: [PATCH 05/40] Minor changes to merge with latest changes --- wdl/CleanVcfChromosome.wdl | 78 +++++++++++++------------------------- 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 0edce82fb..8f34d3680 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -17,8 +17,6 @@ workflow CleanVcfChromosome { File bothsides_pass_list Int min_records_per_shard_step1 Int samples_per_step2_shard - Int clean_vcf5_records_per_shard - Int? clean_vcf5_threads_per_task File? outlier_samples_list Int? max_samples_per_shard_step3 @@ -49,29 +47,7 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_override_rescue_me_dels - # overrides for local tasks - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? 
runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish - RuntimeAttr? runtime_override_stitch_fragmented_cnvs - RuntimeAttr? runtime_override_final_cleanup - RuntimeAttr? runtime_override_rescue_me_dels - RuntimeAttr? runtime_attr_add_high_fp_rate_filters - - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b + RuntimeAttr? runtime_attr_add_high_fp_rate_filters RuntimeAttr? runtime_override_preconcat_step1 RuntimeAttr? runtime_override_hail_merge_step1 @@ -285,18 +261,18 @@ workflow CleanVcfChromosome { } call RescueMobileElementDeletions { - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } - call AddHighFDRFilters { - input: - vcf=RescueMobileElementDeletions.out, + call AddHighFDRFilters { + input: + vcf=RescueMobileElementDeletions.out, prefix="~{prefix}.high_fdr_filtered", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_add_high_fp_rate_filters @@ -305,22 +281,22 @@ workflow CleanVcfChromosome { call FinalCleanup { input: vcf=AddHighFDRFilters.out, - contig=contig, - prefix="~{prefix}.final_cleanup", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_final_cleanup - } + contig=contig, + prefix="~{prefix}.final_cleanup", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_final_cleanup + } - call fvcf.FormatVcf as FormatVcfToOutput { - input: - vcf=FinalCleanup.final_cleaned_shard, - ploidy_table=ploidy_table, - args="--scale-down-gq", - output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format - } + call fvcf.FormatVcf as FormatVcfToOutput { + input: + vcf=FinalCleanup.final_cleaned_shard, + ploidy_table=ploidy_table, + args="--scale-down-gq", + output_prefix="~{prefix}.final_format", + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } output { File out = FormatVcfToOutput.out From e70ec78a3a972493c2a956ff5c1a7ac355bf3e54 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 12:30:48 -0400 Subject: [PATCH 06/40] Modified java_mem_gb to use select_first --- wdl/CleanVcfChromosome.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8f34d3680..57acd9c64 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -339,7 +339,7 @@ task CleanVcf1a { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = 
ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" String output_samples_list = "~{prefix}.includelist.txt" @@ -396,7 +396,7 @@ task CleanVcf1b { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -448,7 +448,7 @@ task CleanVcf2 { } String output_revised_list = "~{prefix}.txt" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -542,7 +542,7 @@ task CleanVcf4 { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -594,7 +594,7 @@ task CleanVcf5 { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail From 2cb19cafd5282aad79142c49f2b32f73a95dc4b6 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 12:47:08 -0400 Subject: [PATCH 07/40] Clean up CleanVcf.wdl inputs --- wdl/CleanVcf.wdl | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index f58c7f4f1..07fb28b53 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -60,28 +60,16 @@ workflow CleanVcf { # overrides for CleanVcfContig RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish + RuntimeAttr? runtime_override_clean_vcf_5 RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format RuntimeAttr? runtime_override_rescue_me_dels - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - RuntimeAttr? runtime_override_preconcat_step1 RuntimeAttr? runtime_override_hail_merge_step1 RuntimeAttr? runtime_override_fix_header_step1 @@ -91,11 +79,8 @@ workflow CleanVcf { RuntimeAttr? runtime_override_fix_header_drc RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 RuntimeAttr? runtime_override_drop_redundant_cnvs RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? 
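On PATCH 06 above: runtime_override.mem_gb is an optional field, so the old `ceil(runtime_override.mem_gb * 1000 * 0.7)` presumably failed whenever a caller supplied an override struct that left mem_gb unset; wrapping it in select_first() restores the task default. The 0.7 factor leaves roughly 30% of container memory for non-heap JVM and OS overhead. A sketch of the guarded expression (parameter names are illustrative):

    import math

    def java_mem_mb(mem_gb_override=None, mem_gb_default=3.75, heap_fraction=0.7):
        # select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7
        mem_gb = mem_gb_default if mem_gb_override is None else mem_gb_override
        return math.ceil(mem_gb * 1000 * heap_fraction)

    assert java_mem_mb() == 2625        # 3.75 GB container -> 2625 MB heap
    assert java_mem_mb(16) == 11200
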
runtime_override_sort_drop_redundant_cnvs @@ -149,18 +134,12 @@ workflow CleanVcf { runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, - runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, + runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, - runtime_override_combine_revised_4=runtime_override_combine_revised_4, - runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, runtime_override_preconcat_step1=runtime_override_preconcat_step1, runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, runtime_override_fix_header_step1=runtime_override_fix_header_step1, @@ -168,6 +147,8 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } From 0956b922b3e779ce8d9a9912180e22533bf78f8b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 13:42:36 -0400 Subject: [PATCH 08/40] Forgot comma --- wdl/CleanVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 07fb28b53..032884704 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -147,7 +147,7 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels From 48d81a8a68a40962046c8dc00d24251430ad1652 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 13:42:54 -0400 Subject: [PATCH 09/40] Forgot comma --- wdl/CleanVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 032884704..1f39d6ee0 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -148,7 +148,7 @@ workflow CleanVcf { runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, 
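PATCH 08 and PATCH 09 both repair the same slip: WDL call inputs are comma-separated, and a missing trailing comma between two `name=value` lines only surfaces when the workflow is parsed. A rough, hypothetical lint for call blocks (heuristic only — a real WDL parser such as miniwdl is the proper check):

    import re

    def missing_commas(call_block_lines):
        assignment = re.compile(r"^\s*\w+\s*=\s*\S+[^,{(]$")
        flagged = []
        for lineno, line in enumerate(call_block_lines[:-1], start=1):
            nxt = call_block_lines[lineno].strip()
            if assignment.match(line.rstrip()) and nxt and nxt != "}":
                flagged.append(lineno)
        return flagged

    lines = [
        "    runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs",
        "    runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs",
        "}",
    ]
    print(missing_commas(lines))  # -> [1]
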
runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, - runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } From f4112a226601c732f7714f5af786b22919e6a044 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 16:05:59 -0400 Subject: [PATCH 10/40] Removed unnecessary params --- wdl/CleanVcf.wdl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 1f39d6ee0..30ebd9c42 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -21,8 +21,6 @@ workflow CleanVcf { Int min_records_per_shard_step1 Int samples_per_step2_shard Int? max_samples_per_shard_step3 - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard File HERVK_reference File LINE1_reference @@ -119,8 +117,6 @@ workflow CleanVcf { outlier_samples_list=outlier_samples_list, use_hail=use_hail, gcs_project=gcs_project, - clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, - clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, HERVK_reference=HERVK_reference, LINE1_reference=LINE1_reference, From ad93a923bca41b39c8c60ebb793a2dbcfeeed355 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 16:06:51 -0400 Subject: [PATCH 11/40] Added runtime_override_clean_vcf_5 --- wdl/CleanVcf.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 30ebd9c42..148f94717 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -127,6 +127,7 @@ workflow CleanVcf { sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, + runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, From 559a2ce213219dd60e0c61ed7661efeba7ace9e5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 4 Nov 2024 10:04:50 -0500 Subject: [PATCH 12/40] Minor changes --- wdl/CleanVcf.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 148f94717..42974547b 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -45,6 +45,7 @@ workflow CleanVcf { File? resolve_complex_merged_vcf File? 
genotype_complex_merged_vcf + String gatk_docker String linux_docker String sv_base_mini_docker String sv_pipeline_docker @@ -122,7 +123,7 @@ workflow CleanVcf { LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, - gatk_docker="docker.io/broadinstitute/gatk:3eb5c3d38d6c8c65e71f29abe9346c98bfbb1cbe", + gatk_docker=gatk_docker, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, From 43db008b66f566f31b23db87ed6797c2f4fd5636 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Sun, 24 Nov 2024 15:12:10 -0500 Subject: [PATCH 13/40] WIP --- .../04_variant_resolution/scripts/clean_vcf_postprocess.py | 6 ++++++ .../04_variant_resolution/scripts/clean_vcf_preprocess.py | 6 ++++++ 2 files changed, 12 insertions(+) create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py new file mode 100644 index 000000000..8e28c90dc --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -0,0 +1,6 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py new file mode 100644 index 000000000..8e28c90dc --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -0,0 +1,6 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + From e6e519d4b8f7415ca6f1fc9ff471745a30bcd0d0 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 09:05:38 -0500 Subject: [PATCH 14/40] Initial preprocess script --- .../scripts/clean_vcf_preprocess.py | 120 +++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index 8e28c90dc..6ac2ad81c 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -1,6 +1,122 @@ #!/bin/python import argparse -from collections import defaultdict -from os import mkdir, path +import pysam +# Constants +EV = 'EV' +VAR_GQ = 'VAR_GQ' +MULTIALLELIC = 'MULTIALLELIC' +UNRESOLVED = 'UNRESOLVED' +HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' +BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' +REVISED_EVENT = 'REVISED_EVENT' + +# List of possible EV values +EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] + +def read_last_column(file_path): + result_set = set() + with open(file_path, 'r') as f: + for line in f: + if line.strip(): + columns = line.strip().split() + result_set.add(columns[-1]) + return result_set + +def add_header_lines(header): + header.add_line('##FILTER=') + header.add_line('##INFO=') + header.add_line('##INFO=') + header.add_line('##INFO=') + +def process_record(record, fail_set, pass_set): + record = process_EV(record) + record = process_VarGQ(record) + record = process_Multiallelic(record) + record = process_Unresolved(record) + record = process_NoisyEvents(record, fail_set) + record = process_BothsidesSupportEvents(record, pass_set) + return record + +def process_EV(record): 
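Continuing below, process_EV rewrites the legacy integer EV code into its string form. The mapping, isolated as a runnable snippet — the indices come straight from EV_VALUES above; the sample values are made up:

    EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF']

    def decode_ev(ev):
        try:
            idx = int(ev)
        except (TypeError, ValueError):
            return ev                      # already a string such as 'SR,PE'
        return EV_VALUES[idx] if 0 <= idx < len(EV_VALUES) else ev

    assert decode_ev('3') == 'RD'
    assert decode_ev('SR') == 'SR'
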
+ for sample in record.samples: + genotype = record.samples[sample] + if EV in genotype and genotype[EV] is not None: + ev_attribute = genotype[EV] + try: + ev_index = int(ev_attribute) + if 0 <= ev_index < len(EV_VALUES): + genotype[EV] = EV_VALUES[ev_index] + except ValueError: + pass # If it's not an integer, do nothing + return record + +def process_VarGQ(record): + if VAR_GQ in record.info: + var_gq = record.info[VAR_GQ] + if isinstance(var_gq, list): + var_gq = var_gq[0] + del record.info[VAR_GQ] + record.qual = var_gq + return record + +def process_Multiallelic(record): + if MULTIALLELIC in record.info: + del record.info[MULTIALLELIC] + return record + +def process_Unresolved(record): + if UNRESOLVED in record.info: + del record.info[UNRESOLVED] + record.filter.add(UNRESOLVED) + return record + +def process_NoisyEvents(record, fail_set): + if record.id in fail_set: + record.info[HIGH_SR_BACKGROUND] = True + return record + +def process_BothsidesSupportEvents(record, pass_set): + if record.id in pass_set: + record.info[BOTHSIDES_SUPPORT] = True + return record + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Process VCF variants.') + parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') + parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') + parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') + parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') + parser.add_argument('--output-samples-list', required=True, help='Output file with samples') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') + parser.add_argument('input_vcf', help='Input VCF file') + args = parser.parse_args() + + # Read failList and passList into sets + fail_set = read_last_column(args.fail_list) + pass_set = read_last_column(args.pass_list) + + # Open input VCF + vcf_in = pysam.VariantFile(args.input_vcf) + + # Modify header + header = vcf_in.header.copy() + add_header_lines(header) + + # Open output VCF + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) + + # Write samples list + with open(args.output_samples_list, 'w') as samples_writer: + for sample in header.samples: + samples_writer.write(sample + '\n') + + # Process variants + for record in vcf_in: + record = process_record(record, fail_set, pass_set) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() From ccd5dde6f74abc1c9019b20d0a427ab41ee60137 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:27:48 -0500 Subject: [PATCH 15/40] Created postprocessing script --- .../scripts/clean_vcf_postprocess.py | 109 +++++++++++++++++- .../scripts/clean_vcf_preprocess.py | 34 +++--- wdl/ResolveComplexVariants.wdl | 5 - wdl/TasksMakeCohortVcf.wdl | 5 - 4 files changed, 120 insertions(+), 33 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py index 8e28c90dc..5a8af3763 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -1,6 +1,111 @@ #!/bin/python import argparse -from collections import defaultdict -from os import mkdir, path +import pysam +# Constants +EV = 'EV' +SVTYPE = 'SVTYPE' +ME = 'ME' +UNR = 'UNR' +FILTER_VCF_INFO_LINES = {'BND_DEPTH', 
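Backing up briefly to the preprocess entry point above: read_last_column() keys both lists off the last whitespace-delimited column, so plain one-column ID lists and multi-column pass/fail files both work. A toy demonstration (the file contents here are invented):

    lines = [
        "chr1  10500  10800  cohort_DEL_chr1_1\n",
        "cohort_BND_chr1_7\n",
        "\n",                              # blank lines are skipped
    ]
    ids = {l.strip().split()[-1] for l in lines if l.strip()}
    assert ids == {"cohort_DEL_chr1_1", "cohort_BND_chr1_7"}
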
'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} +FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} + +def modify_header(header): + new_header = pysam.VariantHeader() + + # Copy over header lines, excluding some + for line in header.records: + include_line = True + if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: + include_line = False + elif line.type == 'FORMAT' and line.get('ID') == EV: + include_line = False + elif line.type == 'ALT' and line.get('ID') == UNR: + include_line = False + elif any(fv_line in str(line) for fv_line in FILTER_VCF_LINES): + include_line = False + if include_line: + new_header.add_line(str(line)) + + # Add new header line for EV + new_header.add_line('##FORMAT=') + + # Add samples to header + for sample in header.samples: + new_header.add_sample(sample) + + return new_header + +def process_record(record): + record = cleanse_info_fields(record) + record = process_svtype(record) + return record + +def cleanse_info_fields(record): + for field in FILTER_VCF_INFO_LINES: + if field in record.info: + del record.info[field] + return record + +def process_svtype(record): + svtype = record.info.get(SVTYPE, None) + + # Check for mobile element in alleles + has_mobile_element = False + if record.alts: + for allele in record.alts: + if allele.startswith('<') and allele.endswith('>'): + symbol = allele[1:-1] + if symbol == ME: + has_mobile_element = True + break + + # If SVTYPE is missing or variant has mobile element, skip processing + if svtype is None or has_mobile_element: + return record + + # Update alleles + ref_allele = record.ref + alt_allele = f'<{svtype}>' + record.alleles = (ref_allele, alt_allele) + + # Update genotypes + for sample in record.samples: + genotype = record.samples[sample] + gt = genotype.get('GT', (None, None)) + + # Count number of alt alleles + alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) + + # Update GT accordingly + if alt_count == 1: + genotype['GT'] = (0, 1) + elif alt_count == 2: + genotype['GT'] = (1, 1) + + return record + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Clean VCF post-processing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') + parser.add_argument('input_vcf', help='Input VCF file') + args = parser.parse_args() + + # Open input VCF + vcf_in = pysam.VariantFile(args.input_vcf) + + # Modify header + new_header = modify_header(vcf_in.header) + + # Open output VCF + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) + + # Process and write variants + for record in vcf_in: + record = process_record(record) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index 6ac2ad81c..a91996914 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -11,8 +11,6 @@ HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' - -# List of possible EV values EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] def read_last_column(file_path): @@ -32,11 +30,11 @@ def add_header_lines(header): def process_record(record, fail_set, pass_set): 
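One step back to process_svtype in the postprocess script above: after the alleles are rewritten to a single symbolic ALT, genotypes are collapsed to allele counts — one alt allele becomes 0/1, two become 1/1, anything else passes through. The same logic on bare tuples:

    def normalize_gt(gt):
        alt_count = sum(1 for allele in gt if allele is not None and allele > 0)
        if alt_count == 1:
            return (0, 1)
        if alt_count == 2:
            return (1, 1)
        return gt                          # hom-ref and no-calls unchanged

    assert normalize_gt((0, 2)) == (0, 1)  # multiallelic index collapses to 0/1
    assert normalize_gt((1, 1)) == (1, 1)
    assert normalize_gt((None, None)) == (None, None)
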
record = process_EV(record) - record = process_VarGQ(record) - record = process_Multiallelic(record) - record = process_Unresolved(record) - record = process_NoisyEvents(record, fail_set) - record = process_BothsidesSupportEvents(record, pass_set) + record = process_varGQ(record) + record = process_multiallelic(record) + record = process_unresolved(record) + record = process_noisy(record, fail_set) + record = process_bothsides_support(record, pass_set) return record def process_EV(record): @@ -49,10 +47,10 @@ def process_EV(record): if 0 <= ev_index < len(EV_VALUES): genotype[EV] = EV_VALUES[ev_index] except ValueError: - pass # If it's not an integer, do nothing + pass return record -def process_VarGQ(record): +def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] if isinstance(var_gq, list): @@ -61,23 +59,23 @@ def process_VarGQ(record): record.qual = var_gq return record -def process_Multiallelic(record): +def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record -def process_Unresolved(record): +def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record -def process_NoisyEvents(record, fail_set): +def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record -def process_BothsidesSupportEvents(record, pass_set): +def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record @@ -88,12 +86,11 @@ def process_BothsidesSupportEvents(record, pass_set): parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') - parser.add_argument('--output-samples-list', required=True, help='Output file with samples') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') parser.add_argument('input_vcf', help='Input VCF file') args = parser.parse_args() - # Read failList and passList into sets + # Read noisy and bothsides support events into sets fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) @@ -107,12 +104,7 @@ def process_BothsidesSupportEvents(record, pass_set): # Open output VCF vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) - # Write samples list - with open(args.output_samples_list, 'w') as samples_writer: - for sample in header.samples: - samples_writer.write(sample + '\n') - - # Process variants + # Process and write variants for record in vcf_in: record = process_record(record, fail_set, pass_set) vcf_out.write(record) diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index e2d32128b..f712537b6 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -17,9 +17,6 @@ workflow ResolveComplexVariants { Array[File] disc_files Array[File] rf_cutoff_files - Array[String]? background_fail_columns - Array[String]? 
bothsides_pass_columns - File contig_list Int max_shard_size File cytobands @@ -197,7 +194,6 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", - header_columns=select_first([bothsides_pass_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -208,7 +204,6 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", - header_columns=select_first([background_fail_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index fef15e068..0b81a83fe 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -684,7 +684,6 @@ task UpdateSrList { File vcf File original_list String outfile - Array[String]? header_columns String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -712,10 +711,6 @@ task UpdateSrList { command <<< set -euxo pipefail - if [[ ! -z "~{sep=' ' header_columns}" ]]; then - echo -e "~{sep='\t' header_columns}" > ~{outfile} - fi - # append new ids to original list svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header From d60b01bc57605edd60b136bf4a5202ed31db4a07 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:28:45 -0500 Subject: [PATCH 16/40] Minor removal of > character --- wdl/TasksMakeCohortVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 0b81a83fe..d489831e8 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -724,7 +724,7 @@ task UpdateSrList { else print $0,$NF; \ }' int.bed ~{original_list} \ | sort -k1,1n \ - >> ~{outfile} + > ~{outfile} >>> output { From 38eefb2bfeca898b091db92948364040937d1acd Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:57:57 -0500 Subject: [PATCH 17/40] Python linting fixes --- .../scripts/clean_vcf_postprocess.py | 5 +++++ .../scripts/clean_vcf_preprocess.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py index 5a8af3763..452345a50 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -11,6 +11,7 @@ FILTER_VCF_INFO_LINES = {'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} + def modify_header(header): new_header = pysam.VariantHeader() @@ -37,17 +38,20 @@ def modify_header(header): return new_header + def process_record(record): record = cleanse_info_fields(record) record = process_svtype(record) return record + def cleanse_info_fields(record): for field in FILTER_VCF_INFO_LINES: if field in record.info: del record.info[field] return record + def process_svtype(record): svtype = record.info.get(SVTYPE, None) @@ -86,6 +90,7 @@ def process_svtype(record): return record + if __name__ == '__main__': parser = 
argparse.ArgumentParser(description='Clean VCF post-processing.') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index a91996914..7dbd685c2 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -11,7 +11,8 @@ HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' -EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] +EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] + def read_last_column(file_path): result_set = set() @@ -22,12 +23,14 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set + def add_header_lines(header): header.add_line('##FILTER=') header.add_line('##INFO=') header.add_line('##INFO=') header.add_line('##INFO=') + def process_record(record, fail_set, pass_set): record = process_EV(record) record = process_varGQ(record) @@ -37,6 +40,7 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record + def process_EV(record): for sample in record.samples: genotype = record.samples[sample] @@ -50,6 +54,7 @@ def process_EV(record): pass return record + def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -59,27 +64,32 @@ def process_varGQ(record): record.qual = var_gq return record + def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record + def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record + def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record + def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process VCF variants.') parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') From be7f3bf6e6bd9a7d000294f68884fa0b131eebc9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 4 Dec 2024 19:42:12 -0500 Subject: [PATCH 18/40] WIP - refactored WDLs to use new set of tools --- ...postprocess.py => cleanvcf_postprocess.py} | 6 +- ...f_preprocess.py => cleanvcf_preprocess.py} | 8 +- wdl/CleanVcfChromosome.wdl | 325 ++++++------------ 3 files changed, 113 insertions(+), 226 deletions(-) rename src/sv-pipeline/04_variant_resolution/scripts/{clean_vcf_postprocess.py => cleanvcf_postprocess.py} (93%) rename src/sv-pipeline/04_variant_resolution/scripts/{clean_vcf_preprocess.py => cleanvcf_preprocess.py} (91%) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py similarity index 93% rename from src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py rename to src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index 452345a50..ca4fabc35 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -92,9 +92,9 @@ def process_svtype(record): if __name__ == '__main__': - parser = 
argparse.ArgumentParser(description='Clean VCF post-processing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') - parser.add_argument('input_vcf', help='Input VCF file') + parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') args = parser.parse_args() # Open input VCF diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py similarity index 91% rename from src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py rename to src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 7dbd685c2..787bcfc5b 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -91,13 +91,11 @@ def process_bothsides_support(record, pass_set): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process VCF variants.') - parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') - parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') + parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') - parser.add_argument('input_vcf', help='Input VCF file') args = parser.parse_args() # Read noisy and bothsides support events into sets diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 57acd9c64..c89d574eb 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -10,16 +10,12 @@ workflow CleanVcfChromosome { File vcf String contig File background_list + File bothsides_pass_list + File? outlier_samples_list File ped_file File allosome_fai String prefix - Int max_shards_per_chrom_step1 - File bothsides_pass_list - Int min_records_per_shard_step1 - Int samples_per_step2_shard - File? outlier_samples_list - Int? max_samples_per_shard_step3 - + File HERVK_reference File LINE1_reference @@ -27,8 +23,6 @@ workflow CleanVcfChromosome { String chr_x String chr_y - File? svtk_to_gatk_script # For debugging - Boolean use_hail String? gcs_project @@ -38,6 +32,10 @@ workflow CleanVcfChromosome { String sv_pipeline_docker # overrides for local tasks + RuntimeAttr? runtime_attr_revise_overlapping_cnvs + RuntimeAttr? runtime_attr_revise_large_cnvs + RuntimeAttr? runtime_attr_revise_abnormal_allosomes + RuntimeAttr? runtime_attr_revise_multiallelics RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? 
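The heart of PATCH 18: the shard-scatter-merge machinery (SplitVcf, the CleanVcf1a scatter, HailMerge/ConcatVcfs) is replaced by one linear per-contig chain in which each task consumes its predecessor's output VCF. The shape of the dataflow, with stand-in functions (the step names mirror the new tasks; nothing else here is literal):

    def run_chain(vcf, steps):
        for step in steps:
            vcf = step(vcf)                # each task's output feeds the next
        return vcf

    steps = [lambda v, n=n: v + "." + n for n in (
        "preprocess", "revise_overlapping_cnvs", "revise_large_cnvs",
        "revise_abnormal_allosomes", "revise_multiallelic_cnvs", "postprocess")]
    print(run_chain("cohort.chr1", steps))
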
runtime_override_clean_vcf_2 @@ -76,152 +74,59 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_attr_format } - call MiniTasks.SplitVcf as SplitVcfToClean { + call CleanVcfPreprocess { input: vcf=FormatVcfToClean.out, - contig=contig, - prefix="~{prefix}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf_to_clean - } - - scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { - call CleanVcf1a { - input: - vcf=SplitVcfToClean.vcf_shards[i], - prefix="~{prefix}.clean_vcf_1a.shard_~{i}", - background_fail_list=background_list, - bothsides_pass_list=bothsides_pass_list, - ped_file=ped_file, - allosome_fai=allosome_fai, - chr_x=chr_x, - chr_y=chr_y, - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_1a - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep1VcfsHail { - input: - vcfs=CleanVcf1a.intermediate_vcf, - prefix="~{prefix}.combine_step_1_vcfs", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix="~{prefix}.combine_step_1_vcfs", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } + background_list=background_list, + bothsides_pass_list=bothsides_pass_list, + prefix="~{prefix}.preprocess", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_preprocess } - call CleanVcf1b { + call CleanVcfReviseOverlappingCnvs { input: - vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_1b", + vcf=CleanVcfPreprocess.out, + prefix="~{prefix}.revise_overlapping_cnvs", gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_1b + runtime_attr_override=runtime_attr_revise_overlapping_cnvs } - call MiniTasks.SplitUncompressed as SplitIncludeList { + call CleanVcfReviseLargeCnvs { input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - vcf=CleanVcf1b.out, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } + vcf=CleanVcfReviseOverlappingCnvs.out, + prefix="~{prefix}.revise_large_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_large_cnvs } - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { + call CleanVcfReviseAbnormalAllosomes { input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 + vcf=CleanVcfReviseLargeCnvs.out, + prefix="~{prefix}.revise_abnormal_allosomes", + gatk_docker=gatk_docker, + 
runtime_attr_override=runtime_attr_revise_abnormal_allosomes } - call CleanVcf3 { + call CleanVcfReviseMultiallelicCnvs { input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - vcf=CleanVcf1b.out, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - outlier_samples_list=outlier_samples_list, - rd_cn_revise=CleanVcf3.shards[i], - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep4VcfsHail { - input: - vcfs=CleanVcf4.out, - prefix="~{prefix}.combine_revised_4", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep4Vcfs { - input: - vcfs=CleanVcf4.out, - vcfs_idx=CleanVcf4.out_idx, - naive=true, - generate_index=true, - outfile_prefix="~{prefix}.combine_revised_4", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } + vcf=CleanVcfReviseAbnormalAllosomes.out, + prefix="~{prefix}.revise_multiallelic_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_multiallelics } - call CleanVcf5 { + call CleanVcfPostprocess { input: - vcf=select_first([CombineStep4Vcfs.concat_vcf, CombineStep4VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_5", + vcf=CleanVcfReviseMultiallelicCnvs.out, + prefix="~{prefix}.postprocess", gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_5 + runtime_attr_override=runtime_attr_revise_multiallelics } call DropRedundantCnvs { input: - vcf=CleanVcf5.out, + vcf=CleanVcfPostprocess.out, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -293,7 +198,6 @@ workflow CleanVcfChromosome { ploidy_table=ploidy_table, args="--scale-down-gq", output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_format } @@ -304,25 +208,19 @@ workflow CleanVcfChromosome { } } - -task CleanVcf1a { +task CleanVcfPreprocess { input { File vcf - String prefix - File background_fail_list + File background_list File bothsides_pass_list - File ped_file - File allosome_fai - String chr_x - String chr_y + String prefix String gatk_docker RuntimeAttr? 
runtime_attr_override } - Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -341,7 +239,6 @@ task CleanVcf1a { Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" - String output_samples_list = "~{prefix}.includelist.txt" command <<< set -euo pipefail @@ -350,24 +247,22 @@ task CleanVcf1a { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1a \ + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ -V ~{vcf} \ -O ~{output_vcf} \ - --fail-list ~{background_fail_list} \ - --pass-list ~{bothsides_pass_list} \ - --chr-X ~{chr_x} \ - --chr-Y ~{chr_y} \ - --output-samples-list ~{output_samples_list} + --fail-list ~{background_list} \ + --pass-list ~{bothsides_pass_list} + + tabix -p vcf ~{output_vcf} >>> output { - File include_list="~{output_samples_list}" - File intermediate_vcf="~{output_vcf}" - File intermediate_vcf_idx="~{output_vcf}.tbi" + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } -task CleanVcf1b { +task CleanVcfReviseOverlappingCnvs { input { File vcf String prefix @@ -375,15 +270,14 @@ task CleanVcf1b { RuntimeAttr? runtime_attr_override } - Float input_size = size([vcf], "GB") RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -395,8 +289,8 @@ task CleanVcf1b { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -405,7 +299,7 @@ task CleanVcf1b { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1b \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ -V ~{vcf} \ -O ~{output_vcf} >>> @@ -416,26 +310,22 @@ task CleanVcf1b { } } -task CleanVcf2 { +task CleanVcfReviseLargeCnvs { input { File vcf String prefix - File include_list String gatk_docker RuntimeAttr? 
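Each of the new tasks repeats the `if [ ! -f "~{vcf}.tbi" ]; then tabix -p vcf ...` guard. The same ensure-index idiom in Python, assuming pysam and a bgzip-compressed input (as in the WDL) — a helper sketch, not pipeline code:

    import os
    import pysam

    def ensure_vcf_index(vcf_path):
        # build the .tbi only when it is missing, like the shell guard above
        if not os.path.exists(vcf_path + ".tbi"):
            pysam.tabix_index(vcf_path, preset="vcf")
        return vcf_path + ".tbi"
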
runtime_attr_override } - Float input_size = size([vcf, include_list], "GB") - Float base_disk_gb = 10.0 - Float input_disk_scale = 3.0 RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -447,8 +337,8 @@ task CleanVcf2 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_revised_list = "~{prefix}.txt" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -457,36 +347,33 @@ task CleanVcf2 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt2 \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseLargeCnvs \ -V ~{vcf} \ - --sample-list ~{include_list} \ - --output-revised-list ~{output_revised_list} + -O ~{output_vcf} >>> output { - File out="~{output_revised_list}" + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } - -task CleanVcf3 { +task CleanVcfReviseAbnormalAllosomes { input { - File rd_cn_revise - Int? max_samples_shard - String sv_pipeline_docker + File vcf + String prefix + String gatk_docker RuntimeAttr? runtime_attr_override } - - Int max_samples_shard_ = select_first([max_samples_shard, 7000]) - Float input_size = size(rd_cn_revise, "GB") + RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -494,42 +381,47 @@ task CleanVcf3 { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker + docker: gatk_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + command <<< set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} - # Ensure there is at least one shard - touch shards/out.0_0.txt + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseAbnormalAllosomes \ + -V ~{vcf} \ + -O ~{output_vcf} >>> output { - Array[File] shards = glob("shards/*") + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } - -task CleanVcf4 { +task CleanVcfReviseMultiallelicCnvs { input { File vcf String prefix - File rd_cn_revise - File? outlier_samples_list String gatk_docker RuntimeAttr? 
runtime_attr_override } - Float input_size = size([vcf, rd_cn_revise], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -541,8 +433,8 @@ task CleanVcf4 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -551,11 +443,9 @@ task CleanVcf4 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt4 \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseMultiallelicCnvs \ -V ~{vcf} \ - -O ~{output_vcf} \ - --revised-cn-list ~{rd_cn_revise} \ - ~{if defined(outlier_samples_list) then "--outliers-list ~{outlier_samples_list}" else "" } + -O ~{output_vcf} >>> output { @@ -564,8 +454,7 @@ task CleanVcf4 { } } - -task CleanVcf5 { +task CleanVcfPostprocess { input { File vcf String prefix @@ -573,15 +462,14 @@ task CleanVcf5 { RuntimeAttr? runtime_attr_override } - Float input_size = size([vcf], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -593,8 +481,8 @@ task CleanVcf5 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -603,9 +491,11 @@ task CleanVcf5 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt5 \ + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ -V ~{vcf} \ -O ~{output_vcf} + + tabix -p vcf ~{output_vcf} >>> output { @@ -614,7 +504,6 @@ task CleanVcf5 { } } - task RescueMobileElementDeletions { input { File vcf From fec8d596b144912c5a1c930609acf78c2b2e34cf Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 6 Dec 2024 09:33:47 -0500 Subject: [PATCH 19/40] WIP --- wdl/CleanVcfChromosome.wdl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index c89d574eb..fe2564be9 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -32,10 +32,13 @@ workflow CleanVcfChromosome { String sv_pipeline_docker # overrides for local tasks + RuntimeAttr? runtime_attr_preprocess RuntimeAttr? runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics + RuntimeAttr? runtime_attr_postprocess + RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? 
runtime_override_clean_vcf_2 @@ -103,6 +106,7 @@ workflow CleanVcfChromosome { call CleanVcfReviseAbnormalAllosomes { input: vcf=CleanVcfReviseLargeCnvs.out, + outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_abnormal_allosomes", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_abnormal_allosomes @@ -121,7 +125,7 @@ workflow CleanVcfChromosome { vcf=CleanVcfReviseMultiallelicCnvs.out, prefix="~{prefix}.postprocess", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_multiallelics + runtime_attr_override=runtime_attr_postprocess } call DropRedundantCnvs { @@ -313,6 +317,7 @@ task CleanVcfReviseOverlappingCnvs { task CleanVcfReviseLargeCnvs { input { File vcf + File? outlier_samples_list String prefix String gatk_docker RuntimeAttr? runtime_attr_override @@ -349,7 +354,8 @@ task CleanVcfReviseLargeCnvs { gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseLargeCnvs \ -V ~{vcf} \ - -O ~{output_vcf} + -O ~{output_vcf} \ + ~{if defined(outlier_samples_list) then "--outlier-samples ~{outlier_samples_list}" else "" } >>> output { From e27026bff22524302f76cd7c34370b7cfaf09a11 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 11 Dec 2024 11:48:43 -0500 Subject: [PATCH 20/40] Updated pre/postprocess to use pipeline docker --- wdl/CleanVcfChromosome.wdl | 85 +++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index fe2564be9..26da02ed5 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -33,7 +33,8 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess - RuntimeAttr? runtime_attr_revise_overlapping_cnvs + RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts + RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? 
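PATCH 19 threads the optional outlier_samples_list down to SVReviseLargeCnvs with the `~{if defined(...) then ... else ""}` idiom — the flag is appended only when the input is set. An equivalent argument-building sketch (the flag and tool names are taken from the diff itself):

    def gatk_args(vcf, out, outlier_samples_list=None):
        args = ["gatk", "SVReviseLargeCnvs", "-V", vcf, "-O", out]
        if outlier_samples_list is not None:
            args += ["--outlier-samples", outlier_samples_list]
        return args

    print(gatk_args("in.vcf.gz", "out.vcf.gz"))
    print(gatk_args("in.vcf.gz", "out.vcf.gz", "outliers.txt"))
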
runtime_attr_revise_multiallelics @@ -83,21 +84,30 @@ workflow CleanVcfChromosome { background_list=background_list, bothsides_pass_list=bothsides_pass_list, prefix="~{prefix}.preprocess", - gatk_docker=gatk_docker, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_preprocess } - call CleanVcfReviseOverlappingCnvs { + call CleanVcfReviseOverlappingCnvGts { input: vcf=CleanVcfPreprocess.out, - prefix="~{prefix}.revise_overlapping_cnvs", + prefix="~{prefix}.revise_overlapping_cnv_gts", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_overlapping_cnv_gts + } + + call CleanVcfReviseOverlappingCnvCns { + input: + vcf=CleanVcfReviseOverlappingCnvGts.out, + prefix="~{prefix}.revise_overlapping_cnv_cns", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnvs + runtime_attr_override=runtime_attr_revise_overlapping_cnv_cns } call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvs.out, + vcf=CleanVcfReviseOverlappingCnvGts.out, + outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_large_cnvs @@ -106,7 +116,6 @@ workflow CleanVcfChromosome { call CleanVcfReviseAbnormalAllosomes { input: vcf=CleanVcfReviseLargeCnvs.out, - outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_abnormal_allosomes", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_abnormal_allosomes @@ -124,7 +133,7 @@ workflow CleanVcfChromosome { input: vcf=CleanVcfReviseMultiallelicCnvs.out, prefix="~{prefix}.postprocess", - gatk_docker=gatk_docker, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_postprocess } @@ -218,7 +227,7 @@ task CleanVcfPreprocess { File background_list File bothsides_pass_list String prefix - String gatk_docker + String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -237,7 +246,7 @@ task CleanVcfPreprocess { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker + docker: sv_pipeline_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } @@ -266,7 +275,55 @@ task CleanVcfPreprocess { } } -task CleanVcfReviseOverlappingCnvs { +task CleanVcfReviseOverlappingCnvGts { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvGts \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + +task CleanVcfReviseOverlappingCnvCns { input { File vcf String prefix @@ -303,7 +360,7 @@ task CleanVcfReviseOverlappingCnvs { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvCns \ -V ~{vcf} \ -O ~{output_vcf} >>> @@ -464,7 +521,7 @@ task CleanVcfPostprocess { input { File vcf String prefix - String gatk_docker + String sv_pipeline_docker RuntimeAttr? 
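The Java heap for each GATK call is sized as java_mem_mb = ceil(mem_gb * 1000 * 0.7), leaving roughly 30% of the container's memory for non-heap overhead (the JVM itself, htslib buffers). Checking the arithmetic for the 3.75 GB default used by these tasks:

import math

mem_gb = 3.75                                # runtime_default.mem_gb above
java_mem_mb = math.ceil(mem_gb * 1000 * 0.7)
print(java_mem_mb)                           # 2625, i.e. -Xmx2625m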
runtime_attr_override } @@ -483,7 +540,7 @@ task CleanVcfPostprocess { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker + docker: sv_pipeline_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } From a92904d1e3c90e36d183b810e97b3f7dd8bcbfc8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 16:26:24 -0500 Subject: [PATCH 21/40] Finished testing cleanvcfpreprocess --- header.txt | 0 inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_preprocess.py | 63 +++++-------------- .../replace_ev_numeric_code_with_string.py | 5 +- wdl/CleanVcfChromosome.wdl | 21 ++++++- 5 files changed, 44 insertions(+), 51 deletions(-) create mode 100644 header.txt diff --git a/header.txt b/header.txt new file mode 100644 index 000000000..e69de29bb diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index b68084b91..7fea73d36 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-11-15-v1.0-488d7cb0" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 787bcfc5b..0326d4fc1 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -2,9 +2,8 @@ import argparse import pysam +import gzip -# Constants -EV = 'EV' VAR_GQ = 'VAR_GQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' @@ -23,16 +22,7 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set - -def add_header_lines(header): - header.add_line('##FILTER=') - 
header.add_line('##INFO=') - header.add_line('##INFO=') - header.add_line('##INFO=') - - def process_record(record, fail_set, pass_set): - record = process_EV(record) record = process_varGQ(record) record = process_multiallelic(record) record = process_unresolved(record) @@ -40,21 +30,6 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record - -def process_EV(record): - for sample in record.samples: - genotype = record.samples[sample] - if EV in genotype and genotype[EV] is not None: - ev_attribute = genotype[EV] - try: - ev_index = int(ev_attribute) - if 0 <= ev_index < len(EV_VALUES): - genotype[EV] = EV_VALUES[ev_index] - except ValueError: - pass - return record - - def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -64,59 +39,55 @@ def process_varGQ(record): record.qual = var_gq return record - def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record - def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record - def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record - def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record - if __name__ == '__main__': + # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') args = parser.parse_args() - # Read noisy and bothsides support events into sets + # Read input files fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) - - # Open input VCF - vcf_in = pysam.VariantFile(args.input_vcf) - - # Modify header - header = vcf_in.header.copy() - add_header_lines(header) - - # Open output VCF - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) - - # Process and write variants + if args.input_vcf.endswith('.gz'): + vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=vcf_in.header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=vcf_in.header.copy()) + + # Process records for record in vcf_in: record = process_record(record, fail_set, pass_set) vcf_out.write(record) - + # Close files vcf_in.close() vcf_out.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py index b7d611d41..69c7d16b6 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py @@ -61,7 +61,10 @@ def main(): if args.fout in '- stdout'.split(): fout = sys.stdout else: - fout = open(args.fout, 'w') + if 
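The process_EV block deleted here is superseded by replace_ev_numeric_code_with_string.py (itself updated in this patch to handle gzipped output). For reference, the removed logic mapped a numeric EV code to an evidence string by index into EV_VALUES; a condensed, self-contained version of that behavior:

EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF']

def ev_code_to_string(ev):
    # '2' -> 'SR,PE'; non-numeric or out-of-range codes pass through unchanged.
    try:
        idx = int(ev)
    except (TypeError, ValueError):
        return ev
    return EV_VALUES[idx] if 0 <= idx < len(EV_VALUES) else ev

assert ev_code_to_string('2') == 'SR,PE'
assert ev_code_to_string('RD') == 'RD'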
args.fout.endswith(".gz"): + fout = gzip.open(args.fout, 'wt') + else: + fout = open(args.fout, 'w') while True: line = vcf.readline().rstrip() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 26da02ed5..fb173b146 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -259,9 +259,28 @@ task CleanVcfPreprocess { if [ ! -f "~{vcf}.tbi" ]; then tabix -p vcf ~{vcf} fi + + python /opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py \ + ~{vcf} \ + processed.vcf.gz + + zgrep '^##' processed.vcf.gz > header.txt + + cat <> header.txt + ##FILTER= + ##INFO= + ##INFO= + ##INFO= + EOF + + zgrep '^#CHROM' processed.vcf.gz >> header.txt + + bcftools view processed.vcf.gz | bcftools reheader -h header.txt | bgzip -c > processed.reheader.vcf.gz + + rm header.txt python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ - -V ~{vcf} \ + -V processed.reheader.vcf.gz \ -O ~{output_vcf} \ --fail-list ~{background_list} \ --pass-list ~{bothsides_pass_list} From 766acf63d47b59191cfd2362c5acb8311202a904 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 16:26:54 -0500 Subject: [PATCH 22/40] Undo header.txt addition --- header.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 header.txt diff --git a/header.txt b/header.txt deleted file mode 100644 index e69de29bb..000000000 From 05a17c2e28a41353ab119342a3462099dc5a19c5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 17:44:47 -0500 Subject: [PATCH 23/40] Modified postprocess function, works now --- header.txt | 92 +++++++++++++++++++ inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_postprocess.py | 58 ++++++------ .../scripts/cleanvcf_preprocess.py | 8 ++ 4 files changed, 132 insertions(+), 32 deletions(-) create mode 100644 header.txt diff --git a/header.txt b/header.txt new file mode 100644 index 000000000..4923d8e8d --- /dev/null +++ b/header.txt @@ -0,0 +1,92 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##CPX_TYPE_INS_iDEL="Insertion with deletion at insertion site." +##CPX_TYPE_INVdel="Complex inversion with 3' flanking deletion." +##CPX_TYPE_INVdup="Complex inversion with 3' flanking duplication." +##CPX_TYPE_dDUP="Dispersed duplication." +##CPX_TYPE_dDUP_iDEL="Dispersed duplication with deletion at insertion site." +##CPX_TYPE_delINV="Complex inversion with 5' flanking deletion." +##CPX_TYPE_delINVdel="Complex inversion with 5' and 3' flanking deletions." +##CPX_TYPE_delINVdup="Complex inversion with 5' flanking deletion and 3' flanking duplication." +##CPX_TYPE_dupINV="Complex inversion with 5' flanking duplication." +##CPX_TYPE_dupINVdel="Complex inversion with 5' flanking duplication and 3' flanking deletion." +##CPX_TYPE_dupINVdup="Complex inversion with 5' and 3' flanking duplications." +##CPX_TYPE_piDUP_FR="Palindromic inverted tandem duplication, forward-reverse orientation." +##CPX_TYPE_piDUP_RF="Palindromic inverted tandem duplication, reverse-forward orientation." 
+##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_viewCommand=view --header-only /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/231aacd9-418b-40b6-a3b4-b32e697697bf/CombineBatches/bb6039d1-7dfd-4710-95df-c121caa22646/call-ClusterDepth/shard-0/VcfClusterSingleChrom/7038f60f-5b99-4e6c-b6de-12c9d5bb0bd6/call-ClusterSingleChrom/ClusterSingleChrom/54e9cb83-bb56-4c6c-aeee-a6291e3d4a09/call-ShardedCluster/shard-0/ShardedCluster/7b5e9498-d3b7-4fe7-ac47-5727b462dfc6/call-ConcatVcfs/brainvar_all_samples.chr1.depth.DEL.clustered.vcf.gz; Date=Wed Sep 18 21:08:50 2024 +##bcftools_viewCommand=view -S ^/cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/ee2e1fee-dc5d-4183-9897-0c7e64b3be56/FilterBatchSamples/0766fc10-de05-4b84-9870-caa80fdc9bdd/call-CatOutliers/brainvar_all_samples.outliers.samples.list --force-samples --no-update /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/d5daf9ab-92fe-401b-97b3-48519a3312f9/FilterBatchSites/e26397de-b866-4f86-bced-1bbb4ed1852b/call-FilterAnnotateVcf/shard-4/brainvar_all_samples.depth.with_evidence.vcf.gz; Date=Tue Sep 17 15:58:20 2024 +##bcftools_viewCommand=view -e 'SVTYPE!="CNV" && COUNT(GT="alt")==0' -O z -o brainvar_all_samples.depth.outliers_removed.vcf.gz; Date=Tue Sep 17 15:58:20 2024 +##bcftools_viewCommand=view -i %ID!=@excluded_vids.list -Oz -o brainvar_all_samples.cluster_batch.depth.chr1.exclude_intervals.vcf.gz /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/b59992f9-22ce-4f42-a75e-ff60defa10a5/ClusterBatch/5988ced3-7983-4868-bb99-99d55a2446b8/call-ClusterDepth/ClusterDepth/27ed208e-9b5a-47b5-849a-5f6e854c312b/call-SVCluster/shard-0/brainvar_all_samples.cluster_batch.depth.chr1.clustered.vcf.gz; Date=Mon Sep 16 17:47:23 2024 +##bcftools_viewVersion=1.15.1+htslib-1.15.1 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##source=depth +##bcftools_viewVersion=1.21+htslib-1.21 +##bcftools_viewCommand=view -h /Users/kjaising/Desktop/Work/CleanVcf/Postprocess/brainvar_all_samples_gatk.vcf.gz; Date=Thu Dec 12 17:05:30 2024 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HSB092 HSB100 HSB102 HSB103 HSB105 HSB107 HSB112 HSB113 HSB114 HSB115 HSB116 HSB118 HSB119 HSB121 HSB122 HSB127 HSB130 HSB131 HSB132 HSB136 HSB139 HSB142 HSB143 HSB148 HSB149 HSB150 HSB152 HSB153 HSB154 HSB155 HSB159 HSB171 HSB172 HSB173 HSB174 HSB175 HSB178 HSB194 HSB195 HSB221 HSB222 HSB223 HSB238 HSB239 HSB248 HSB260 HSB261 HSB265 HSB267 HSB268 HSB270 HSB271 HSB272 HSB274 HSB275 HSB278 HSB279 HSB282 HSB286 HSB289 HSB292 HSB313 HSB316 HSB321 HSB322 HSB332 HSB337 HSB338 HSB339 HSB340 HSB341 HSB342 HSB343 HSB344 HSB345 HSB388 HSB389 HSB394 HSB395 HSB396 HSB398 HSB411 HSB412 HSB413 HSB414 HSB415 HSB416 HSB417 HSB418 HSB420 HSB421 HSB422 HSB425 HSB427 HSB428 HSB429 HSB430 HSB431 HSB432 HSB433 HSB439 HSB440 HSB442 HSB443 HSB444 HSB445 HSB452 HSB453 HSB454 HSB455 HSB456 HSB457 HSB459 HSB460 HSB461 HSB462 HSB463 HSB464 HSB465 HSB466 HSB467 HSB468 HSB469 HSB470 HSB471 HSB472 HSB473 HSB474 HSB475 HSB476 HSB478 HSB479 HSB480 HSB481 
HSB482 HSB483 HSB484 HSB485 HSB486 HSB487 HSB488 HSB489 HSB490 HSB492 HSB493 HSB494 HSB495 HSB496 HSB497 HSB498 HSB499 HSB500 HSB501 HSB502 HSB503 HSB504 HSB505 HSB506 HSB507 HSB508 HSB509 HSB510 HSB511 HSB513 HSB514 HSB515 HSB516 HSB536 HSB543 HSB544 HSB545 HSB546 HSB547 HSB561 HSB562 HSB563 HSB564 HSB565 HSB566 HSB568 HSB569 HSB571 HSB572 HSB573 HSB577 HSB578 HSB579 HSB583 HSB587 HSB589 HSB590 HSB591 HSB593 HSB594 HSB595 HSB596 HSB597 HSB598 HSB608 HSB615 HSB616 HSB618 HSB619 HSB622 HSB623 HSB624 HSB625 HSB626 HSB627 HSB629 HSB630 HSB631 HSB633 HSB634 HSB637 HSB638 HSB643 HSB644 HSB645 HSB646 HSB649 HSB650 HSB651 HSB652 HSB653 HSB654 HSB657 HSB666 HSB669 HSB670 HSB671 HSB672 HSB674 HSB676 HSB679 diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 7fea73d36..8f7b64fd5 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index ca4fabc35..16cfeaf04 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -2,37 +2,37 @@ import argparse import pysam +import gzip + -# Constants EV = 'EV' SVTYPE = 'SVTYPE' ME = 'ME' UNR = 'UNR' -FILTER_VCF_INFO_LINES = {'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} -FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} - - -def modify_header(header): +FILTER_VCF_INFO_LINES = { + 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', + 
'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED' +} +FILTER_VCF_TEXT_LINES = { + 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 'GATKCommandLine', 'fileformat' +} + +def cleanse_header(header): new_header = pysam.VariantHeader() - # Copy over header lines, excluding some for line in header.records: include_line = True if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: include_line = False + elif any(fv_line in str(line) for fv_line in FILTER_VCF_TEXT_LINES): + include_line = False elif line.type == 'FORMAT' and line.get('ID') == EV: include_line = False elif line.type == 'ALT' and line.get('ID') == UNR: include_line = False - elif any(fv_line in str(line) for fv_line in FILTER_VCF_LINES): - include_line = False if include_line: new_header.add_line(str(line)) - - # Add new header line for EV - new_header.add_line('##FORMAT=') - - # Add samples to header + for sample in header.samples: new_header.add_sample(sample) @@ -55,7 +55,7 @@ def cleanse_info_fields(record): def process_svtype(record): svtype = record.info.get(SVTYPE, None) - # Check for mobile element in alleles + # Skip if variant has mobile element has_mobile_element = False if record.alts: for allele in record.alts: @@ -64,8 +64,6 @@ def process_svtype(record): if symbol == ME: has_mobile_element = True break - - # If SVTYPE is missing or variant has mobile element, skip processing if svtype is None or has_mobile_element: return record @@ -79,10 +77,7 @@ def process_svtype(record): genotype = record.samples[sample] gt = genotype.get('GT', (None, None)) - # Count number of alt alleles alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) - - # Update GT accordingly if alt_count == 1: genotype['GT'] = (0, 1) elif alt_count == 2: @@ -92,21 +87,26 @@ def process_svtype(record): if __name__ == '__main__': + # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') args = parser.parse_args() # Open input VCF - vcf_in = pysam.VariantFile(args.input_vcf) - - # Modify header - new_header = modify_header(vcf_in.header) - - # Open output VCF - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) - - # Process and write variants + if args.input_vcf.endswith('.gz'): + vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + new_header = cleanse_header(vcf_in.header) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=new_header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) + + # Process records for record in vcf_in: record = process_record(record) vcf_out.write(record) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 0326d4fc1..32477a24d 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -4,6 +4,7 @@ import pysam import gzip + VAR_GQ = 'VAR_GQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' @@ -22,6 +23,7 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set + def process_record(record, fail_set, pass_set): record = process_varGQ(record) 
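cleanse_header vets each header record twice: a structured check on line.type and ID, plus a raw substring check over str(line). The substring test is broad; any header line whose text contains 'source' or 'bcftools' is dropped, not only the ##source and ##bcftools_* lines themselves. A small usage sketch, assuming only the function above and a hypothetical input file:

import pysam

vcf_in = pysam.VariantFile('postprocess_input.vcf.gz')   # hypothetical path
new_header = cleanse_header(vcf_in.header)

old_lines = {str(l).strip() for l in vcf_in.header.records}
new_lines = {str(l).strip() for l in new_header.records}
for line in sorted(old_lines - new_lines):
    print('dropped:', line)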
record = process_multiallelic(record) @@ -30,6 +32,7 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record + def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -39,27 +42,32 @@ def process_varGQ(record): record.qual = var_gq return record + def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record + def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record + def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record + def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record + if __name__ == '__main__': # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') From d4033870e659b59c96980ba834241a3e68a5865e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 17:45:04 -0500 Subject: [PATCH 24/40] Removed header.txt again --- header.txt | 92 ------------------------------------------------------ 1 file changed, 92 deletions(-) delete mode 100644 header.txt diff --git a/header.txt b/header.txt deleted file mode 100644 index 4923d8e8d..000000000 --- a/header.txt +++ /dev/null @@ -1,92 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##CPX_TYPE_INS_iDEL="Insertion with deletion at insertion site." -##CPX_TYPE_INVdel="Complex inversion with 3' flanking deletion." -##CPX_TYPE_INVdup="Complex inversion with 3' flanking duplication." -##CPX_TYPE_dDUP="Dispersed duplication." -##CPX_TYPE_dDUP_iDEL="Dispersed duplication with deletion at insertion site." -##CPX_TYPE_delINV="Complex inversion with 5' flanking deletion." -##CPX_TYPE_delINVdel="Complex inversion with 5' and 3' flanking deletions." -##CPX_TYPE_delINVdup="Complex inversion with 5' flanking deletion and 3' flanking duplication." -##CPX_TYPE_dupINV="Complex inversion with 5' flanking duplication." -##CPX_TYPE_dupINVdel="Complex inversion with 5' flanking duplication and 3' flanking deletion." -##CPX_TYPE_dupINVdup="Complex inversion with 5' and 3' flanking duplications." -##CPX_TYPE_piDUP_FR="Palindromic inverted tandem duplication, forward-reverse orientation." -##CPX_TYPE_piDUP_RF="Palindromic inverted tandem duplication, reverse-forward orientation." 
-##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##bcftools_viewCommand=view --header-only /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/231aacd9-418b-40b6-a3b4-b32e697697bf/CombineBatches/bb6039d1-7dfd-4710-95df-c121caa22646/call-ClusterDepth/shard-0/VcfClusterSingleChrom/7038f60f-5b99-4e6c-b6de-12c9d5bb0bd6/call-ClusterSingleChrom/ClusterSingleChrom/54e9cb83-bb56-4c6c-aeee-a6291e3d4a09/call-ShardedCluster/shard-0/ShardedCluster/7b5e9498-d3b7-4fe7-ac47-5727b462dfc6/call-ConcatVcfs/brainvar_all_samples.chr1.depth.DEL.clustered.vcf.gz; Date=Wed Sep 18 21:08:50 2024 -##bcftools_viewCommand=view -S ^/cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/ee2e1fee-dc5d-4183-9897-0c7e64b3be56/FilterBatchSamples/0766fc10-de05-4b84-9870-caa80fdc9bdd/call-CatOutliers/brainvar_all_samples.outliers.samples.list --force-samples --no-update /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/d5daf9ab-92fe-401b-97b3-48519a3312f9/FilterBatchSites/e26397de-b866-4f86-bced-1bbb4ed1852b/call-FilterAnnotateVcf/shard-4/brainvar_all_samples.depth.with_evidence.vcf.gz; Date=Tue Sep 17 15:58:20 2024 -##bcftools_viewCommand=view -e 'SVTYPE!="CNV" && COUNT(GT="alt")==0' -O z -o brainvar_all_samples.depth.outliers_removed.vcf.gz; Date=Tue Sep 17 15:58:20 2024 -##bcftools_viewCommand=view -i %ID!=@excluded_vids.list -Oz -o brainvar_all_samples.cluster_batch.depth.chr1.exclude_intervals.vcf.gz /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/b59992f9-22ce-4f42-a75e-ff60defa10a5/ClusterBatch/5988ced3-7983-4868-bb99-99d55a2446b8/call-ClusterDepth/ClusterDepth/27ed208e-9b5a-47b5-849a-5f6e854c312b/call-SVCluster/shard-0/brainvar_all_samples.cluster_batch.depth.chr1.clustered.vcf.gz; Date=Mon Sep 16 17:47:23 2024 -##bcftools_viewVersion=1.15.1+htslib-1.15.1 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##source=depth -##bcftools_viewVersion=1.21+htslib-1.21 -##bcftools_viewCommand=view -h /Users/kjaising/Desktop/Work/CleanVcf/Postprocess/brainvar_all_samples_gatk.vcf.gz; Date=Thu Dec 12 17:05:30 2024 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HSB092 HSB100 HSB102 HSB103 HSB105 HSB107 HSB112 HSB113 HSB114 HSB115 HSB116 HSB118 HSB119 HSB121 HSB122 HSB127 HSB130 HSB131 HSB132 HSB136 HSB139 HSB142 HSB143 HSB148 HSB149 HSB150 HSB152 HSB153 HSB154 HSB155 HSB159 HSB171 HSB172 HSB173 HSB174 HSB175 HSB178 HSB194 HSB195 HSB221 HSB222 HSB223 HSB238 HSB239 HSB248 HSB260 HSB261 HSB265 HSB267 HSB268 HSB270 HSB271 HSB272 HSB274 HSB275 HSB278 HSB279 HSB282 HSB286 HSB289 HSB292 HSB313 HSB316 HSB321 HSB322 HSB332 HSB337 HSB338 HSB339 HSB340 HSB341 HSB342 HSB343 HSB344 HSB345 HSB388 HSB389 HSB394 HSB395 HSB396 HSB398 HSB411 HSB412 HSB413 HSB414 HSB415 HSB416 HSB417 HSB418 HSB420 HSB421 HSB422 HSB425 HSB427 HSB428 HSB429 HSB430 HSB431 HSB432 HSB433 HSB439 HSB440 HSB442 HSB443 HSB444 HSB445 HSB452 HSB453 HSB454 HSB455 HSB456 HSB457 HSB459 HSB460 HSB461 HSB462 HSB463 HSB464 HSB465 HSB466 HSB467 HSB468 HSB469 HSB470 HSB471 HSB472 HSB473 HSB474 HSB475 HSB476 HSB478 HSB479 HSB480 HSB481 
HSB482 HSB483 HSB484 HSB485 HSB486 HSB487 HSB488 HSB489 HSB490 HSB492 HSB493 HSB494 HSB495 HSB496 HSB497 HSB498 HSB499 HSB500 HSB501 HSB502 HSB503 HSB504 HSB505 HSB506 HSB507 HSB508 HSB509 HSB510 HSB511 HSB513 HSB514 HSB515 HSB516 HSB536 HSB543 HSB544 HSB545 HSB546 HSB547 HSB561 HSB562 HSB563 HSB564 HSB565 HSB566 HSB568 HSB569 HSB571 HSB572 HSB573 HSB577 HSB578 HSB579 HSB583 HSB587 HSB589 HSB590 HSB591 HSB593 HSB594 HSB595 HSB596 HSB597 HSB598 HSB608 HSB615 HSB616 HSB618 HSB619 HSB622 HSB623 HSB624 HSB625 HSB626 HSB627 HSB629 HSB630 HSB631 HSB633 HSB634 HSB637 HSB638 HSB643 HSB644 HSB645 HSB646 HSB649 HSB650 HSB651 HSB652 HSB653 HSB654 HSB657 HSB666 HSB669 HSB670 HSB671 HSB672 HSB674 HSB676 HSB679 From 170ec7c38b004f02639b5f82270113b74162fe79 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 10:44:54 -0500 Subject: [PATCH 25/40] Updated header writing --- inputs/values/dockers.json | 6 +++--- .../04_variant_resolution/scripts/cleanvcf_postprocess.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 8f7b64fd5..724abee14 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index 16cfeaf04..7efe4d330 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -11,12 +11,15 @@ UNR = 'UNR' FILTER_VCF_INFO_LINES = { 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', - 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED' 
+ 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED', 'VARGQ', + 'EVENT', 'REVISED_EVENT', 'MULTI_CNV' } FILTER_VCF_TEXT_LINES = { 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 'GATKCommandLine', 'fileformat' } +# TODO: Remove INFO fields in advance of script: 'MULTI_CNV', 'VARGQ', 'REVISED_EVENT' + def cleanse_header(header): new_header = pysam.VariantHeader() From eed4c575fd8bb26753bb94cf4c398513fb51f099 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 13:08:52 -0500 Subject: [PATCH 26/40] Decommissioned post-process script --- inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_postprocess.py | 119 ------------------ wdl/CleanVcfChromosome.wdl | 12 +- 3 files changed, 10 insertions(+), 127 deletions(-) delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 724abee14..0b92b966f 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py deleted file mode 100644 index 7efe4d330..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/python - -import argparse -import pysam -import gzip - - -EV = 'EV' -SVTYPE = 'SVTYPE' -ME = 'ME' -UNR = 'UNR' -FILTER_VCF_INFO_LINES = { - 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', - 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED', 'VARGQ', - 'EVENT', 'REVISED_EVENT', 'MULTI_CNV' -} -FILTER_VCF_TEXT_LINES = { - 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 
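The TODO above matters for output validity: reheadering away an INFO definition while records still carry that field leaves a malformed VCF, so the fields must be stripped from the records first. Note the tag's actual case is varGQ, as patch 27 fixes in cleanvcf_preprocess.py and as the INFO/varGQ in the bcftools command below uses. A minimal pysam sketch of that record-level cleanup over hypothetical paths:

import pysam

FIELDS = ('MULTI_CNV', 'varGQ', 'REVISED_EVENT')   # from the TODO above

def strip_info(record):
    for field in FIELDS:
        if field in record.info:
            del record.info[field]
    return record

with pysam.VariantFile('in.vcf.gz') as vcf_in, \
     pysam.VariantFile('out.vcf', 'w', header=vcf_in.header) as vcf_out:
    for rec in vcf_in:
        vcf_out.write(strip_info(rec))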
'GATKCommandLine', 'fileformat' -} - -# TODO: Remove INFO fields in advance of script: 'MULTI_CNV', 'VARGQ', 'REVISED_EVENT' - -def cleanse_header(header): - new_header = pysam.VariantHeader() - - for line in header.records: - include_line = True - if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: - include_line = False - elif any(fv_line in str(line) for fv_line in FILTER_VCF_TEXT_LINES): - include_line = False - elif line.type == 'FORMAT' and line.get('ID') == EV: - include_line = False - elif line.type == 'ALT' and line.get('ID') == UNR: - include_line = False - if include_line: - new_header.add_line(str(line)) - - for sample in header.samples: - new_header.add_sample(sample) - - return new_header - - -def process_record(record): - record = cleanse_info_fields(record) - record = process_svtype(record) - return record - - -def cleanse_info_fields(record): - for field in FILTER_VCF_INFO_LINES: - if field in record.info: - del record.info[field] - return record - - -def process_svtype(record): - svtype = record.info.get(SVTYPE, None) - - # Skip if variant has mobile element - has_mobile_element = False - if record.alts: - for allele in record.alts: - if allele.startswith('<') and allele.endswith('>'): - symbol = allele[1:-1] - if symbol == ME: - has_mobile_element = True - break - if svtype is None or has_mobile_element: - return record - - # Update alleles - ref_allele = record.ref - alt_allele = f'<{svtype}>' - record.alleles = (ref_allele, alt_allele) - - # Update genotypes - for sample in record.samples: - genotype = record.samples[sample] - gt = genotype.get('GT', (None, None)) - - alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) - if alt_count == 1: - genotype['GT'] = (0, 1) - elif alt_count == 2: - genotype['GT'] = (1, 1) - - return record - - -if __name__ == '__main__': - # Parse arguments - parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') - parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') - args = parser.parse_args() - - # Open input VCF - if args.input_vcf.endswith('.gz'): - vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) - else: - vcf_in = pysam.VariantFile(args.input_vcf) - new_header = cleanse_header(vcf_in.header) - - # Open output file - if args.output_vcf.endswith('.gz'): - vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=new_header) - else: - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) - - # Process records - for record in vcf_in: - record = process_record(record) - vcf_out.write(record) - - # Close files - vcf_in.close() - vcf_out.close() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index fb173b146..69a1e068e 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -268,7 +268,7 @@ task CleanVcfPreprocess { cat <> header.txt ##FILTER= - ##INFO= + ##INFO= ##INFO= ##INFO= EOF @@ -572,10 +572,12 @@ task CleanVcfPostprocess { if [ ! 
-f "~{vcf}.tbi" ]; then tabix -p vcf ~{vcf} fi - - python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ - -V ~{vcf} \ - -O ~{output_vcf} + + bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ ~{vcf} -o processed.vcf.gz -O z + + bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + + bcftools reheader -h header.txt processed.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} >>> From 078155c3a3f9b502e8b245dec98b1a01809745d6 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 14:22:33 -0500 Subject: [PATCH 27/40] Updated vargq values --- .../04_variant_resolution/scripts/cleanvcf_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 32477a24d..d04e0a781 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -5,7 +5,7 @@ import gzip -VAR_GQ = 'VAR_GQ' +VAR_GQ = 'varGQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' From ffa4439e95ca4809a01862ae5fc3f59d070a5133 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 14:21:19 -0500 Subject: [PATCH 28/40] Minor update to pass correct VCF downstream --- wdl/CleanVcfChromosome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 69a1e068e..441049c32 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -106,7 +106,7 @@ workflow CleanVcfChromosome { call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvGts.out, + vcf=CleanVcfReviseOverlappingCnvCns.out, outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, From c75c86a03011a1a29cd1b1445c6ec8df91aeee11 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 15:15:05 -0500 Subject: [PATCH 29/40] Created postprocessing file --- .../scripts/cleanvcf_postprocess.py | 46 +++++++++++++++++++ wdl/CleanVcfChromosome.wdl | 6 ++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py new file mode 100644 index 000000000..c6bbcbc9d --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -0,0 +1,46 @@ +#!/bin/python + +import argparse +import pysam +import gzip + + +def process_record(record): + record = process_svtype(record) + return record + + +def process_svtype(record): + if record.info.get('SVTYPE') == 'DUP': + if not any(':ME' in alt for alt in record.alts): + record.alts = ('',) + return record + + +if __name__ == '__main__': + # Parse arguments + parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + args = parser.parse_args() + + # Read input files + if args.input_vcf.endswith('.gz'): + vcf_in = 
pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=vcf_in.header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=vcf_in.header.copy()) + + # Process records + for record in vcf_in: + record = process_record(record) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 441049c32..ee2824f65 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -577,8 +577,12 @@ task CleanVcfPostprocess { bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt - bcftools reheader -h header.txt processed.vcf.gz -o ~{output_vcf} + bcftools reheader -h header.txt processed.vcf.gz -o processed.reheader.vcf.gz + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ + -V processed.reheader.vcf.gz \ + -O ~{output_vcf} + tabix -p vcf ~{output_vcf} >>> From 0edef35fe4b3020cab020dd59fdd23d2620f4be9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 15:38:19 -0500 Subject: [PATCH 30/40] Updated postprocessing order --- inputs/values/dockers.json | 6 +++--- wdl/CleanVcfChromosome.wdl | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 0b92b966f..947cbc84f 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11" } \ No newline at end of file diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 
ee2824f65..8e174714b 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -573,15 +573,15 @@ task CleanVcfPostprocess { tabix -p vcf ~{vcf} fi - bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ ~{vcf} -o processed.vcf.gz -O z + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ + -V ~{vcf} \ + -O processed.vcf.gz - bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ processed.vcf.gz -o processed.annotated.vcf.gz -O z - bcftools reheader -h header.txt processed.vcf.gz -o processed.reheader.vcf.gz + bcftools view -h processed.annotated.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt - python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ - -V processed.reheader.vcf.gz \ - -O ~{output_vcf} + bcftools reheader -h header.txt processed.annotated.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} >>> From a2e7381c9f4822c8b171f99d679b394cc4bd60ac Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 16:20:59 -0500 Subject: [PATCH 31/40] Updated header of output VCF to include all required fields --- inputs/values/dockers.json | 6 +++--- .../scripts/cleanvcf_postprocess.py | 5 ++--- wdl/CleanVcfChromosome.wdl | 9 ++++++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 947cbc84f..bc8f1a448 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11" + "denovo": 
"us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index c6bbcbc9d..46a31aa00 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -11,9 +11,8 @@ def process_record(record): def process_svtype(record): - if record.info.get('SVTYPE') == 'DUP': - if not any(':ME' in alt for alt in record.alts): - record.alts = ('',) + if not any(':ME' in alt for alt in record.alts): + record.alts = ('<' + record.info.get('SVTYPE') + '>',) return record diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8e174714b..01d0cea83 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -579,8 +579,15 @@ task CleanVcfPostprocess { bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ processed.vcf.gz -o processed.annotated.vcf.gz -O z - bcftools view -h processed.annotated.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + bcftools view -h processed.annotated.vcf.gz | grep "^##" | \ + grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= temp_header.txt + echo '##INFO=' >> temp_header.txt + echo '##ALT=' >> temp_header.txt + bcftools view -h processed.annotated.vcf.gz | grep "^#CHROM" > chrom_header.txt + + cat temp_header.txt chrom_header.txt > header.txt + bcftools reheader -h header.txt processed.annotated.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} From 3aec8d05147587b67924787e3252b32c078fa18c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 7 Jan 2025 09:54:29 -0500 Subject: [PATCH 32/40] Updated changes pre-testing on gnomad --- inputs/values/dockers.json | 6 ++--- wdl/CleanVcf.wdl | 49 ++++++++++++++++++++------------------ wdl/CleanVcfChromosome.wdl | 17 ++++--------- 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index bc8f1a448..32ac70f24 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": 
"us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac" } \ No newline at end of file diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 42974547b..bb64b930e 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -58,12 +58,13 @@ workflow CleanVcf { RuntimeAttr? runtime_attr_create_ploidy # overrides for CleanVcfContig - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_attr_preprocess + RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts + RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns + RuntimeAttr? runtime_attr_revise_large_cnvs + RuntimeAttr? runtime_attr_revise_abnormal_allosomes + RuntimeAttr? runtime_attr_revise_multiallelics + RuntimeAttr? runtime_attr_postprocess RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format @@ -106,33 +107,35 @@ workflow CleanVcf { input: vcf=complex_genotype_vcfs[i], contig=contig, + chr_x=chr_x, + chr_y=chr_y, + prefix="~{cohort_name}.~{contig}", + background_list=complex_resolve_background_fail_list, - ped_file=ped_file, bothsides_pass_list=complex_resolve_bothside_pass_list, - allosome_fai=allosome_fai, - prefix="~{cohort_name}.~{contig}", - max_shards_per_chrom_step1=max_shards_per_chrom_step1, - min_records_per_shard_step1=min_records_per_shard_step1, - samples_per_step2_shard=samples_per_step2_shard, - max_samples_per_shard_step3=max_samples_per_shard_step3, outlier_samples_list=outlier_samples_list, + ped_file=ped_file, + allosome_fai=allosome_fai, + + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, + use_hail=use_hail, gcs_project=gcs_project, ploidy_table=CreatePloidyTableFromPed.out, - HERVK_reference=HERVK_reference, - LINE1_reference=LINE1_reference, - chr_x=chr_x, - chr_y=chr_y, + gatk_docker=gatk_docker, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, - runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, - runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, - runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + + runtime_attr_preprocess=runtime_attr_preprocess, + runtime_attr_revise_overlapping_cnv_gts=runtime_attr_revise_overlapping_cnv_gts, + runtime_attr_revise_overlapping_cnv_cns=runtime_attr_revise_overlapping_cnv_cns, + runtime_attr_revise_large_cnvs=runtime_attr_revise_large_cnvs, + runtime_attr_revise_abnormal_allosomes=runtime_attr_revise_abnormal_allosomes, + runtime_attr_revise_multiallelics=runtime_attr_revise_multiallelics, + 
runtime_attr_postprocess=runtime_attr_postprocess, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 01d0cea83..3d8c6dfac 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -9,20 +9,20 @@ workflow CleanVcfChromosome { input { File vcf String contig + String chr_x + String chr_y + String prefix + File background_list File bothsides_pass_list File? outlier_samples_list File ped_file + File ploidy_table File allosome_fai - String prefix File HERVK_reference File LINE1_reference - File ploidy_table - String chr_x - String chr_y - Boolean use_hail String? gcs_project @@ -39,13 +39,6 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics RuntimeAttr? runtime_attr_postprocess - - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_override_rescue_me_dels From 581c5c14b9d61edcdf2ab0c9797a783c7c8dfc7b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 9 Jan 2025 15:41:53 -0500 Subject: [PATCH 33/40] Removed unnecessary runtime attributes --- wdl/CleanVcfChromosome.wdl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 3d8c6dfac..8ace1eea6 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -44,20 +44,11 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_rescue_me_dels RuntimeAttr? runtime_attr_add_high_fp_rate_filters - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - RuntimeAttr? runtime_override_preconcat_drc RuntimeAttr? runtime_override_hail_merge_drc RuntimeAttr? runtime_override_fix_header_drc - # overrides for MiniTasks - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_sort_drop_redundant_cnvs RuntimeAttr? 
runtime_attr_format } From be3a8014b614e3beb9b3ab8cd4dc43bfbd0a56de Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 13 Jan 2025 10:54:16 -0500 Subject: [PATCH 34/40] Added sex revisions for male GT --- .../scripts/cleanvcf_preprocess.py | 94 ++++++++++++++++++- wdl/CleanVcfChromosome.wdl | 6 ++ 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index d04e0a781..d0a144711 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -12,6 +12,7 @@ BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] +MIN_ALLOSOME_EVENT_SIZE = 50 def read_last_column(file_path): @@ -24,12 +25,13 @@ def read_last_column(file_path): return result_set -def process_record(record, fail_set, pass_set): +def process_record(record, chrX, chrY, fail_set, pass_set): record = process_varGQ(record) record = process_multiallelic(record) record = process_unresolved(record) record = process_noisy(record, fail_set) record = process_bothsides_support(record, pass_set) + record = process_allosomes(record, chrX, chrY) return record @@ -68,16 +70,104 @@ def process_bothsides_support(record, pass_set): return record +def process_allosomes(record, chrX, chrY): + chromosome = record.chrom + if chromosome not in (chrX, chrY): + return record + + updated_samples = [] + sv_type = record.info.get('SVTYPE', '') + sv_len = record.info.get('SVLEN', 0) + + if sv_type in ('DEL', 'DUP') and sv_len >= MIN_ALLOSOME_EVENT_SIZE: + is_y = (chromosome == chrY) + + for sample in record.samples: + genotype = record.samples[sample] + sex = genotype.get('EXPECTED_COPY_NUMBER_FORMAT', None) + + if sex == 1: # Male + if is_revisable_event(record, is_y, sex): + record.info[REVISED_EVENT] = True + adjust_male_genotype(genotype, sv_type) + elif sex == 2 and is_y: # Female + genotype['GT'] = (None, None) # NO_CALL for females on chrY + elif sex == 0: # Unknown + genotype['GT'] = (None, None) # NO_CALL for unknown sex + + updated_samples.append(sample) + + return record + + +def is_revisable_event(record, is_y, sex): + genotypes = record.samples.values() + male_counts = [0, 0, 0, 0] + female_counts = [0, 0, 0, 0] + + for genotype in genotypes: + rd_cn = genotype.get('RD_CN', -1) + rd_cn_val = min(rd_cn, 3) if rd_cn != -1 else -1 + if rd_cn_val == -1: + continue + + if sex == 1: # Male + male_counts[rd_cn_val] += 1 + elif sex == 2: # Female + female_counts[rd_cn_val] += 1 + + male_median = calc_median_distribution(male_counts) + female_median = calc_median_distribution(female_counts) + + return male_median == 2 and (is_y and female_median == 0 or not is_y and female_median == 4) + + +def adjust_male_genotype(genotype, sv_type): + rd_cn = genotype.get('RD_CN', 0) + genotype['RD_CN'] = rd_cn + 1 + ref_allele, alt_allele = genotype['alleles'] + + if sv_type == 'DEL': + if rd_cn >= 1: + genotype['GT'] = (ref_allele, ref_allele) + elif rd_cn == 0: + genotype['GT'] = (ref_allele, alt_allele) + elif sv_type == 'DUP': + if rd_cn <= 1: + genotype['GT'] = (ref_allele, ref_allele) + elif rd_cn == 2: + genotype['GT'] = (ref_allele, alt_allele) + else: + genotype['GT'] = (alt_allele, alt_allele) + + +def calc_median_distribution(counts): + total = sum(counts) + if total == 0: + return -1 + + target = total // 2 + running_total = 0 + for i, 
count in enumerate(counts): + running_total += count + if running_total >= target: + return i * 2 if running_total > target else i * 2 + 1 + + if __name__ == '__main__': # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('--chrX', required=True, help='Chromosome X representation in VCF') + parser.add_argument('--chrY', required=True, help='Chromosome Y representation in VCF') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') args = parser.parse_args() # Read input files + chrX = args.chrX + chrY = args.chrY fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) if args.input_vcf.endswith('.gz'): @@ -93,7 +183,7 @@ def process_bothsides_support(record, pass_set): # Process records for record in vcf_in: - record = process_record(record, fail_set, pass_set) + record = process_record(record, chrX, chrY, fail_set, pass_set) vcf_out.write(record) # Close files diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8ace1eea6..0b83a7487 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -65,6 +65,8 @@ workflow CleanVcfChromosome { call CleanVcfPreprocess { input: vcf=FormatVcfToClean.out, + chr_x=chr_x, + chr_y=chr_y, background_list=background_list, bothsides_pass_list=bothsides_pass_list, prefix="~{prefix}.preprocess", @@ -208,6 +210,8 @@ workflow CleanVcfChromosome { task CleanVcfPreprocess { input { File vcf + String chr_x + String chr_y File background_list File bothsides_pass_list String prefix @@ -266,6 +270,8 @@ task CleanVcfPreprocess { python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ -V processed.reheader.vcf.gz \ -O ~{output_vcf} \ + --chrX ~{chr_x} \ + --chrY ~{chr_y} \ --fail-list ~{background_list} \ --pass-list ~{bothsides_pass_list} From 9a6da9d9c8bdb6f5e17cc97806ea803ce3e909d2 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 17 Jan 2025 17:01:22 -0500 Subject: [PATCH 35/40] Coalesced overlapping cnv tools into one --- .../scripts/cleanvcf_preprocess.py | 2 +- wdl/CleanVcfChromosome.wdl | 265 ++++++++++-------- 2 files changed, 155 insertions(+), 112 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index d0a144711..f8f1a90cc 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -119,7 +119,7 @@ def is_revisable_event(record, is_y, sex): male_median = calc_median_distribution(male_counts) female_median = calc_median_distribution(female_counts) - return male_median == 2 and (is_y and female_median == 0 or not is_y and female_median == 4) + return male_median == 1 and (female_median == 0 if is_y else female_median == 2) def adjust_male_genotype(genotype, sv_type): diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 0b83a7487..dddc27f72 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -33,6 +33,7 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess + RuntimeAttr? 
runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs @@ -74,25 +75,17 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_attr_preprocess } - call CleanVcfReviseOverlappingCnvGts { + call CleanVcfReviseOverlappingCnvs { input: vcf=CleanVcfPreprocess.out, - prefix="~{prefix}.revise_overlapping_cnv_gts", + prefix="~{prefix}.revise_overlapping_cnvs", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnv_gts - } - - call CleanVcfReviseOverlappingCnvCns { - input: - vcf=CleanVcfReviseOverlappingCnvGts.out, - prefix="~{prefix}.revise_overlapping_cnv_cns", - gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnv_cns + runtime_attr_override=runtime_attr_revise_overlapping_cnvs } call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvCns.out, + vcf=CleanVcfReviseOverlappingCnvs.out, outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, @@ -220,13 +213,13 @@ task CleanVcfPreprocess { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -284,6 +277,54 @@ task CleanVcfPreprocess { } } +task CleanVcfReviseOverlappingCnvs { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! 
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + task CleanVcfReviseOverlappingCnvGts { input { File vcf @@ -293,13 +334,13 @@ task CleanVcfReviseOverlappingCnvGts { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -341,13 +382,13 @@ task CleanVcfReviseOverlappingCnvCns { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -390,13 +431,15 @@ task CleanVcfReviseLargeCnvs { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -439,13 +482,13 @@ task CleanVcfReviseAbnormalAllosomes { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -487,13 +530,13 @@ task CleanVcfReviseMultiallelicCnvs { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -535,13 +578,13 @@ task CleanVcfPostprocess { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, 
runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -740,13 +783,13 @@ task StitchFragmentedCnvs { Float input_size = size(vcf, "GB") RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) @@ -781,53 +824,53 @@ task StitchFragmentedCnvs { # Add FILTER status for pockets of variants with high FP rate: wham-only DELs and Scramble-only SVAs with HIGH_SR_BACKGROUND task AddHighFDRFilters { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <") - with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: - for record in fin: - if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ - (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): - record.filter.add('HIGH_ALGORITHM_FDR') - fo.write(record) -CODE - >>> + python <") + with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: + for record in fin: + if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ + 
(record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): + record.filter.add('HIGH_ALGORITHM_FDR') + fo.write(record) + CODE + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } From db4dfca3cfa739952c8cb590c301c8e12e967e4b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 21 Jan 2025 15:11:50 -0500 Subject: [PATCH 36/40] Updated workflow inputs to reflect one-tool approach to overlapping CNVs --- wdl/CleanVcf.wdl | 21 +- wdl/CleanVcfChromosome.wdl | 570 +++++++++++++++---------------------- 2 files changed, 238 insertions(+), 353 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index bb64b930e..ad166b18a 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -59,8 +59,7 @@ workflow CleanVcf { # overrides for CleanVcfContig RuntimeAttr? runtime_attr_preprocess - RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts - RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns + RuntimeAttr? runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics @@ -70,19 +69,11 @@ workflow CleanVcf { RuntimeAttr? runtime_attr_format RuntimeAttr? runtime_override_rescue_me_dels - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - RuntimeAttr? runtime_override_preconcat_drc RuntimeAttr? runtime_override_hail_merge_drc RuntimeAttr? runtime_override_fix_header_drc - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs } @@ -130,25 +121,17 @@ workflow CleanVcf { sv_pipeline_docker=sv_pipeline_docker, runtime_attr_preprocess=runtime_attr_preprocess, - runtime_attr_revise_overlapping_cnv_gts=runtime_attr_revise_overlapping_cnv_gts, - runtime_attr_revise_overlapping_cnv_cns=runtime_attr_revise_overlapping_cnv_cns, + runtime_attr_revise_overlapping_cnvs=runtime_attr_revise_overlapping_cnvs, runtime_attr_revise_large_cnvs=runtime_attr_revise_large_cnvs, runtime_attr_revise_abnormal_allosomes=runtime_attr_revise_abnormal_allosomes, runtime_attr_revise_multiallelics=runtime_attr_revise_multiallelics, runtime_attr_postprocess=runtime_attr_postprocess, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, - runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_split_include_list=runtime_override_split_include_list, - runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, - runtime_override_preconcat_step1=runtime_override_preconcat_step1, - runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, - runtime_override_fix_header_step1=runtime_override_fix_header_step1, runtime_override_preconcat_drc=runtime_override_preconcat_drc, runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index dddc27f72..72b499154 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -34,8 +34,6 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess RuntimeAttr? runtime_attr_revise_overlapping_cnvs - RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts - RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics @@ -325,102 +323,6 @@ task CleanVcfReviseOverlappingCnvs { } } -task CleanVcfReviseOverlappingCnvGts { - input { - File vcf - String prefix - String gatk_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) - String output_vcf = "~{prefix}.vcf.gz" - - command <<< - set -euo pipefail - - if [ ! -f "~{vcf}.tbi" ]; then - tabix -p vcf ~{vcf} - fi - - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvGts \ - -V ~{vcf} \ - -O ~{output_vcf} - >>> - - output { - File out="~{output_vcf}" - File out_idx="~{output_vcf}.tbi" - } -} - -task CleanVcfReviseOverlappingCnvCns { - input { - File vcf - String prefix - String gatk_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) - String output_vcf = "~{prefix}.vcf.gz" - - command <<< - set -euo pipefail - - if [ ! -f "~{vcf}.tbi" ]; then - tabix -p vcf ~{vcf} - fi - - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvCns \ - -V ~{vcf} \ - -O ~{output_vcf} - >>> - - output { - File out="~{output_vcf}" - File out_idx="~{output_vcf}.tbi" - } -} - task CleanVcfReviseLargeCnvs { input { File vcf @@ -633,297 +535,297 @@ task CleanVcfPostprocess { } task RescueMobileElementDeletions { - input { - File vcf - String prefix - File LINE1 - File HERVK - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + File LINE1 + File HERVK + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - python <',) - if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': - record.alts = ('',) - fo.write(record) + if record.id in hash_MEI_DEL_reset.keys(): + del record.filter['UNRESOLVED'] + record.info['SVTYPE'] = 'DEL' + record.info['SVLEN'] = record.info['END2'] - record.start + record.stop = record.info['END2'] + record.info.pop("CHR2") + record.info.pop("END2") + record.info.pop("UNRESOLVED_TYPE") + if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1': + record.alts = ('',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) + fo.write(record) fin.close() fo.close() CODE - >>> + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { - input { - File vcf - String prefix - String contig - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String contig + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - Float input_size = size(vcf, "GiB") - # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: - # in tests on large VCFs, memory usage is ~1.0 * input VCF size - # the biggest disk usage is at the end of the task, with input + output VCF on disk - Int cpu_cores = 2 # speed up compression / decompression of VCFs - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 2.0), - cpu_cores: cpu_cores, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ - ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp - >>> + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Stitch fragmented RD-only calls found in 100% of the same samples task StitchFragmentedCnvs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) - Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) - - runtime { - memory: "~{mem_gb} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - command <<< - set -euo pipefail - echo "First pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ - | bgzip \ - > tmp.vcf.gz - rm ~{vcf} - echo "Second pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File stitched_vcf_shard = "~{prefix}.vcf.gz" - } + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" + } } # Add FILTER status for pockets of variants with high FP rate: wham-only DELs and Scramble-only SVAs with HIGH_SR_BACKGROUND task AddHighFDRFilters { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <") - with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: - for record in fin: - if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ - (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): - record.filter.add('HIGH_ALGORITHM_FDR') - fo.write(record) - CODE - >>> + python <") + with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: + for record in fin: + if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ + (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): + record.filter.add('HIGH_ALGORITHM_FDR') + fo.write(record) +CODE + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Final VCF cleanup task FinalCleanup { - input { - File vcf - String contig - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 - RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + input { + File vcf + String contig + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} stdout \ - | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + Float base_mem_gb = 2.0 + Float input_mem_scale = 3.0 + Float input_disk_scale = 5.0 + RuntimeAttr runtime_default = object { + mem_gb: base_mem_gb + input_size * input_mem_scale, + disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File final_cleaned_shard = "~{prefix}.vcf.gz" - File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" - } + command <<< + set -eu -o pipefail + + /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ + --chrom ~{contig} \ + --prefix ~{prefix} \ + ~{vcf} stdout \ + | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File final_cleaned_shard = "~{prefix}.vcf.gz" + File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" + } } \ No newline at end of file From 24fc04e4cc75fe6edf56147952008abe7025ce03 Mon Sep 
17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 22 Jan 2025 09:58:58 -0500 Subject: [PATCH 37/40] Readded calcAF.wdl --- wdl/CalcAF.wdl | 179 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 wdl/CalcAF.wdl diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl new file mode 100644 index 000000000..064c3b28a --- /dev/null +++ b/wdl/CalcAF.wdl @@ -0,0 +1,179 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as tmc + +workflow CalcAF { + input { + File vcf + File vcf_idx + Int sv_per_shard + String prefix + String sv_pipeline_docker + File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample + File? famfile #Used for M/F AF calculations + File? par_bed #Used for marking hemizygous males on X & Y + File? allosomes_list #allosomes .fai used to override default sex chromosome assignments + String? drop_empty_records + + RuntimeAttr? runtime_attr_compute_shard_af + RuntimeAttr? runtime_attr_scatter_vcf + RuntimeAttr? runtime_attr_combine_sharded_vcfs + } + + + # Tabix to chromosome of interest, and shard input VCF for stats collection + call tmc.ScatterVcf { + input: + vcf=vcf, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker, + records_per_shard=sv_per_shard, + runtime_attr_override = runtime_attr_scatter_vcf + } + + # Scatter over VCF shards + scatter ( shard in ScatterVcf.shards ) { + # Collect AF summary stats + call ComputeShardAFs { + input: + vcf=shard, + sv_pipeline_docker=sv_pipeline_docker, + prefix=prefix, + sample_pop_assignments=sample_pop_assignments, + famfile=famfile, + par_bed=par_bed, + allosomes_list=allosomes_list, + runtime_attr_override = runtime_attr_compute_shard_af + } + } + + # Merge shards into single VCF + call CombineShardedVcfs { + input: + vcfs=ComputeShardAFs.shard_wAFs, + sv_pipeline_docker=sv_pipeline_docker, + prefix=prefix, + drop_empty_records=drop_empty_records, + runtime_attr_override = runtime_attr_combine_sharded_vcfs + } + + # Final output + output { + File vcf_wAFs = CombineShardedVcfs.vcf_out + File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx + } +} + +# Subset a vcf to a single chromosome, and add global AF information (no subpop) +task ComputeShardAFs { + input { + File vcf + String prefix + String sv_pipeline_docker + File? sample_pop_assignments + File? famfile + File? par_bed + File? allosomes_list + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1.5, + disk_gb: ceil(20 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + optionals=" " + if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" + fi + if [ ~{default="SKIP" famfile} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -f ~{famfile}" + fi + if [ ~{default="SKIP" par_bed} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --par ~{par_bed}" + fi + if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" + fi + echo -e "OPTIONALS INTERPRETED AS: $optionals" + echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" + #Tabix chromosome of interest & compute AN, AC, and AF + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + >>> + + output { + File shard_wAFs = "~{prefix}.wAFs.vcf.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Merge VCF shards & drop records with zero remaining non-ref alleles +task CombineShardedVcfs { + input { + Array[File] vcfs + String prefix + String sv_pipeline_docker + String? drop_empty_records + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command { + set -euo pipefail + vcf-concat -f ~{write_lines(vcfs)} \ + | vcf-sort \ + > merged.vcf + if [ ~{default="TRUE" drop_empty_records} == "TRUE" ]; then + /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ + merged.vcf stdout \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + else + cat merged.vcf | bgzip -c > "~{prefix}.wAFs.vcf.gz" + fi + tabix -p vcf "~{prefix}.wAFs.vcf.gz" + } + + + output { + File vcf_out = "~{prefix}.wAFs.vcf.gz" + File vcf_out_idx = "~{prefix}.wAFs.vcf.gz.tbi" + } + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + From c588ab1f2d7ede1f3aef339c9639797ecb58e203 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 22 Jan 2025 10:01:38 -0500 Subject: [PATCH 38/40] Removed calcaf.wdl --- wdl/CalcAF.wdl | 179 ------------------------------------------------- 1 file changed, 179 deletions(-) delete mode 100644 wdl/CalcAF.wdl diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl deleted file mode 100644 index 064c3b28a..000000000 --- a/wdl/CalcAF.wdl +++ /dev/null @@ -1,179 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as tmc - -workflow CalcAF { - input { - File vcf - File vcf_idx - Int sv_per_shard - String prefix - String sv_pipeline_docker - File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? famfile #Used for M/F AF calculations - File? par_bed #Used for marking hemizygous males on X & Y - File? allosomes_list #allosomes .fai used to override default sex chromosome assignments - String? drop_empty_records - - RuntimeAttr? runtime_attr_compute_shard_af - RuntimeAttr? runtime_attr_scatter_vcf - RuntimeAttr? 
runtime_attr_combine_sharded_vcfs - } - - - # Tabix to chromosome of interest, and shard input VCF for stats collection - call tmc.ScatterVcf { - input: - vcf=vcf, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker, - records_per_shard=sv_per_shard, - runtime_attr_override = runtime_attr_scatter_vcf - } - - # Scatter over VCF shards - scatter ( shard in ScatterVcf.shards ) { - # Collect AF summary stats - call ComputeShardAFs { - input: - vcf=shard, - sv_pipeline_docker=sv_pipeline_docker, - prefix=prefix, - sample_pop_assignments=sample_pop_assignments, - famfile=famfile, - par_bed=par_bed, - allosomes_list=allosomes_list, - runtime_attr_override = runtime_attr_compute_shard_af - } - } - - # Merge shards into single VCF - call CombineShardedVcfs { - input: - vcfs=ComputeShardAFs.shard_wAFs, - sv_pipeline_docker=sv_pipeline_docker, - prefix=prefix, - drop_empty_records=drop_empty_records, - runtime_attr_override = runtime_attr_combine_sharded_vcfs - } - - # Final output - output { - File vcf_wAFs = CombineShardedVcfs.vcf_out - File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx - } -} - -# Subset a vcf to a single chromosome, and add global AF information (no subpop) -task ComputeShardAFs { - input { - File vcf - String prefix - String sv_pipeline_docker - File? sample_pop_assignments - File? famfile - File? par_bed - File? allosomes_list - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 1.5, - disk_gb: ceil(20 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - set -euo pipefail - optionals=" " - if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" - fi - if [ ~{default="SKIP" famfile} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -f ~{famfile}" - fi - if [ ~{default="SKIP" par_bed} != "SKIP" ]; then - optionals="$( echo "$optionals" ) --par ~{par_bed}" - fi - if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then - optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" - fi - echo -e "OPTIONALS INTERPRETED AS: $optionals" - echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" - #Tabix chromosome of interest & compute AN, AC, and AF - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - >>> - - output { - File shard_wAFs = "~{prefix}.wAFs.vcf.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Merge VCF shards & drop records with zero remaining non-ref alleles -task CombineShardedVcfs { - input { - Array[File] vcfs - String prefix - String sv_pipeline_docker - String? drop_empty_records - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command { - set -euo pipefail - vcf-concat -f ~{write_lines(vcfs)} \ - | vcf-sort \ - > merged.vcf - if [ ~{default="TRUE" drop_empty_records} == "TRUE" ]; then - /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ - merged.vcf stdout \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - else - cat merged.vcf | bgzip -c > "~{prefix}.wAFs.vcf.gz" - fi - tabix -p vcf "~{prefix}.wAFs.vcf.gz" - } - - - output { - File vcf_out = "~{prefix}.wAFs.vcf.gz" - File vcf_out_idx = "~{prefix}.wAFs.vcf.gz.tbi" - } - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - From a6588c664bf64b8c9b10a55b6b78f840c6a81eaf Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 29 Jan 2025 16:26:33 -0500 Subject: [PATCH 39/40] Minor changes to make wdl more readable --- wdl/CleanVcfChromosome.wdl | 185 ++----------------------------------- 1 file changed, 7 insertions(+), 178 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 6b13dbceb..cdbae0c2b 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -26,15 +26,12 @@ workflow CleanVcfChromosome { Boolean use_hail String? gcs_project -<<<<<<< HEAD String gatk_docker String linux_docker String sv_base_mini_docker String sv_pipeline_docker -======= File? svtk_to_gatk_script # For debugging File? make_clean_gq_script ->>>>>>> main # overrides for local tasks RuntimeAttr? 
runtime_attr_preprocess @@ -160,144 +157,14 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_stitch_fragmented_cnvs } -<<<<<<< HEAD call RescueMobileElementDeletions { -======= - call MiniTasks.SplitUncompressed as SplitIncludeList { - input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - multi_cnvs=CleanVcf1b.multi, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } - } - - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { - input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 - } - - call CleanVcf3 { - input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - rd_cn_revise=CleanVcf3.shards[i], - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - call MiniTasks.CatUncompressedFiles as CombineRevised4 { - input: - shards=CleanVcf4.out, - outfile_name="~{prefix}.combine_revised_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_revised_4 - } - - call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { - input: - shards=CleanVcf4.multi_ids, - outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_multi_ids_4 - } - - call c5.CleanVcf5 { - input: - revise_vcf_lines=CombineRevised4.outfile, - normal_revise_vcf=CleanVcf1b.normal, - ped_file=ped_file, - sex_chr_revise=CombineStep1SexChrRevisions.outfile, - multi_ids=CombineMultiIds4.outfile, - outlier_samples_list=outlier_samples_list, - contig=contig, - prefix="~{prefix}.clean_vcf_5", - records_per_shard=clean_vcf5_records_per_shard, - threads_per_task=clean_vcf5_threads_per_task, - make_clean_gq_script=make_clean_gq_script, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, - runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_attr_override_polish=runtime_override_clean_vcf_5_polish - } - - call DropRedundantCnvs { - input: - vcf=CleanVcf5.polished, - prefix="~{prefix}.drop_redundant_cnvs", - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_drop_redundant_cnvs - } - - if (use_hail) { - call HailMerge.HailMerge as SortDropRedundantCnvsHail { - input: - vcfs=[DropRedundantCnvs.out], - prefix="~{prefix}.drop_redundant_cnvs.sorted", - gcs_project=gcs_project, - reset_cnv_gts=true, - sv_base_mini_docker=sv_base_mini_docker, - 
sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_drc, - runtime_override_hail_merge=runtime_override_hail_merge_drc, - runtime_override_fix_header=runtime_override_fix_header_drc - } - } - if (!use_hail) { - call MiniTasks.SortVcf as SortDropRedundantCnvs { - input: - vcf=DropRedundantCnvs.out, - outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_sort_drop_redundant_cnvs - } - } - - call StitchFragmentedCnvs { - input: - vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), - prefix="~{prefix}.stitch_fragmented_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_stitch_fragmented_cnvs - } - - call RescueMobileElementDeletions { ->>>>>>> main - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels } call AddHighFDRFilters { @@ -585,46 +452,8 @@ task CleanVcfReviseMultiallelicCnvs { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } -<<<<<<< HEAD Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" -======= - vcf = pysam.VariantFile(VCF_FILE) - # Max sample count with PE or SR GT over 3 - max_vf = max(len(vcf.header.samples) * 0.01, 2) - record_start = (batch_num - 1) * segments - record_end = batch_num * segments - record_idx = 0 - print("{} {} {}".format(max_vf, record_start, record_end)) - multi_geno_ids = set([]) - for record in vcf: - record_idx += 1 - if record_idx < record_start: - continue - elif record_idx > record_end: - break - num_gt_over_2 = 0 - for sid in record.samples: - s = record.samples[sid] - # Pick best GT - if s.get('PE_GT') is None: - continue - elif s.get('SR_GT') is None: - gt = s.get('PE_GT') - elif s.get('PE_GT') > 0 and s.get('SR_GT') == 0: - gt = s.get('PE_GT') - elif s.get('PE_GT') == 0: - gt = s.get('SR_GT') - elif s.get('PE_GQ') >= s.get('SR_GQ'): - gt = s.get('PE_GT') - else: - gt = s.get('SR_GT') - if gt > 2: - num_gt_over_2 += 1 - if num_gt_over_2 > max_vf: - multi_geno_ids.add(record.id) - vcf.close() ->>>>>>> main command <<< set -euo pipefail From 835f5b023da8ee84de58abb771c85c8b58b063f7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 29 Jan 2025 16:40:30 -0500 Subject: [PATCH 40/40] Removed head from merge conflict --- inputs/values/dockers.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 1719a4602..b4e41f574 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -25,9 +25,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2025-01-06-v1.0.1-e902bf4e", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", -<<<<<<< HEAD - "denovo": 
"us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac" -======= "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2025-01-14-v1.0.1-88dbd052" ->>>>>>> main } \ No newline at end of file