From be512149ca2a4739f4ad6706d04d1fda54ac0067 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 17 Oct 2024 12:32:21 -0400 Subject: [PATCH 01/40] Initial commit --- wdl/ResolveComplexVariants.wdl | 5 +++++ wdl/TasksMakeCohortVcf.wdl | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index f712537b6..e2d32128b 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -17,6 +17,9 @@ workflow ResolveComplexVariants { Array[File] disc_files Array[File] rf_cutoff_files + Array[String]? background_fail_columns + Array[String]? bothsides_pass_columns + File contig_list Int max_shard_size File cytobands @@ -194,6 +197,7 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", + header_columns=select_first([bothsides_pass_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -204,6 +208,7 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", + header_columns=select_first([background_fail_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index d489831e8..fef15e068 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -684,6 +684,7 @@ task UpdateSrList { File vcf File original_list String outfile + Array[String]? header_columns String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -711,6 +712,10 @@ task UpdateSrList { command <<< set -euxo pipefail + if [[ ! 
-z "~{sep=' ' header_columns}" ]]; then + echo -e "~{sep='\t' header_columns}" > ~{outfile} + fi + # append new ids to original list svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header @@ -724,7 +729,7 @@ task UpdateSrList { else print $0,$NF; \ }' int.bed ~{original_list} \ | sort -k1,1n \ - > ~{outfile} + >> ~{outfile} >>> output { From d20d5b87e46ac7e259abd4f95c456f80f7545fbd Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 15:34:37 -0400 Subject: [PATCH 02/40] E2E working in local integration tests --- .github/.dockstore.yml | 1 + dockerfiles/sv-pipeline/Dockerfile | 12 - .../scripts/clean_vcf_part2.sh | 171 -- .../svpipeline/CleanVCFPart1.java | 316 ---- .../svpipeline/CleanVCFPart1UnitTest.java | 40 - wdl/CleanVcf.wdl | 9 +- wdl/CleanVcf5.wdl | 6 +- wdl/CleanVcfChromosome.wdl | 1577 ++++++++--------- 8 files changed, 791 insertions(+), 1341 deletions(-) delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh delete mode 100644 src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java delete mode 100644 src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 974fdeee6..938508af3 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -141,6 +141,7 @@ workflows: filters: branches: - main + - kj_sv_cleanvcf tags: - /.*/ diff --git a/dockerfiles/sv-pipeline/Dockerfile b/dockerfiles/sv-pipeline/Dockerfile index d4f9aa687..5d9c759c7 100644 --- a/dockerfiles/sv-pipeline/Dockerfile +++ b/dockerfiles/sv-pipeline/Dockerfile @@ -70,13 +70,9 @@ RUN plink2 || true # -Compile StitchFragmentedCNVs Java program # -Compile StitchFragmentedCNVs unit tests # -Compile VCFParser unit tests -# -Compile and test CleanVCFPart1 Java program -# -Compile and test CleanVCFPart1 unit tests ENV STITCH_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVs.jar" ARG STITCH_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVsUnitTest.jar" ARG VCF_PARSER_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/VCFParserUnitTest.jar" -ENV CLEAN_VCF_PART_1_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1.jar" -ARG CLEAN_VCF_PART_1_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1UnitTest.jar" ARG BUILD_DEPS="openjdk-8-jdk" ARG DEBIAN_FRONTEND=noninteractive RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED_PACKAGES) && \ @@ -97,14 +93,6 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED echo "Running VCFParserUnitTest..." && \ java -enableassertions -jar $VCF_PARSER_UNIT_TEST_JAR && \ rm -rf build/classes/* $VCF_PARSER_UNIT_TEST_JAR && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1.jar "org.broadinstitute.svpipeline.CleanVCFPart1" -C build/classes . && \ - rm -rf build/classes/* && \ - javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ - jar cfe build/CleanVCFPart1UnitTest.jar "org.broadinstitute.svpipeline.CleanVCFPart1UnitTest" -C build/classes . && \ - echo "Running CleanVCFPart1UnitTest..." 
&& \
-    java -enableassertions -jar $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \
-    rm -rf build/classes/* $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \
     apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \
     apt-get -qqy autoremove --purge && \
     apt-get -qqy clean && \
diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh
deleted file mode 100755
index 5d4827fa5..000000000
--- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh
+++ /dev/null
@@ -1,171 +0,0 @@
-#!/bin/bash
-#
-# clean_vcf_part2.sh
-#
-
-set -euo pipefail
-
-##bgzipped combined vcf from clean vcf part1b.sh##
-normal_revise_vcf_gz=$1
-##whitelist of split ids for parallelization##
-whitelist=$2
-##list of multiallelic CNVs from 1b##
-multi_cnv=$3
-##output filename##
-outputfile=$4
-
-export LC_ALL=C
-
-##subset vcf to whitelist samples##
-bcftools view $normal_revise_vcf_gz -S $whitelist --no-update \
-  | gzip \
-  > subset.vcf.gz
-
-
-##create new bed with updated genotypes###
-zcat subset.vcf.gz \
-  | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#" || $5=="<DEL>" || $5=="<DUP>") print}' \
-  | svtk vcf2bed stdin stdout \
-  | sed 1d \
-  | sort -k4,4 \
-  | gzip \
-  > int.afternormalfix.bed.gz
-
-##Find overlapping depth based variants and reassign depth based; note this is necessary because depth call >5kb genotypes are 100% driven by depth ##
-##generate a sample list based on depth for depth overlap check below. Necessary because genotype is capped at 1/1 and by direction (i.e no dels in dups)##
-##grab all samples per variant with a non normal copy state##
-zcat subset.vcf.gz \
-  | awk 'BEGIN{FS=OFS="\t"}{if (substr($1,1,1)=="#") print; else if ($5=="<DEL>" || $5=="<DUP>") {$1=$3; print}}' \
-  | vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \
-  | awk 'BEGIN{FS=OFS="\t"} NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1,header[j],$j }' \
-  | sort -k1,1 \
-  | awk 'BEGIN{FS=OFS="\t"} \
-    {if (($1~"DEL" && $3<2 && $3!=".") || ($1~"DUP" && $3>2 && $3!=".")) a[$1]=a[$1]?a[$1]","$2:$2} \
-    END{for (i in a) print i,a[i]}' \
-  | sort -k1,1 \
-  > afternormal.combined.RD_CN.list.txt
-
-##get a list of samples for actual variants not just those with aberrant copy states##
-zcat int.afternormalfix.bed.gz \
-  | awk 'BEGIN{FS=OFS="\t"} $6!="" {split($6,samples,","); for (i in samples) print $4"@"samples[i]}' \
-  | sort -k1,1 \
-  > fullvar.afternormal.list.txt
-
-##create bed with anything that has abnormal copy state##
-##do not compress all.bed because of bedtools bug: https://github.com/arq5x/bedtools2/issues/643##
-zcat int.afternormalfix.bed.gz \
-  | cut -f1-5 \
-  | awk 'BEGIN{FS=OFS="\t"}{if($3-$2>=5000)print $0}' \
-  | join -1 4 -2 1 -t ' ' - afternormal.combined.RD_CN.list.txt \
-  | awk 'BEGIN {FS=OFS="\t"} \
-    $6!=""{split($6,samples,","); \
-      for (i in samples) {s = samples[i]; print $2"_"s,$3,$4,$1,$5,s,$1"@"s}}' \
-  | sort -k7,7 \
-  | join -t ' ' -a 1 -1 7 -2 1 -e "NA" -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.1 - fullvar.afternormal.list.txt \
-  > all.bed
-
-##intersect variants and always set larger to left##
-bedtools intersect -wa -wb -a all.bed -b all.bed \
-  | awk 'BEGIN{FS=OFS="\t"} \
-    {if ($8=="NA") { if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; } \
-     else if ($4!=$12) {
-       if ($3-$2>=$11-$10) print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15; \
-       else if ($16!="NA") print $9,$10,$11,$12,$13,$14,$15,$1,$2,$3,$4,$5,$6,$7; \
-       else print $1,$2,$3,$4,$5,$6,$7,$9,$10,$11,$12,$13,$14,$15;}}' \
-  | sort -k7,7 
\ - | uniq \ - | gzip \ - > bed.overlap.txt.gz - -zcat subset.vcf.gz \ - | grep '^#' \ - > vcf_header -zcat bed.overlap.txt.gz \ - | awk '{print $4; print $11}' \ - | sort -u \ - > overlap.events -echo '~~~' >> overlap.events -zcat subset.vcf.gz \ - | grep -v '^#' \ - | sort -k3,3 \ - | join -t ' ' -1 1 -2 3 overlap.events - \ - | awk 'BEGIN{FS=OFS="\t"}{tmp=$2;$2=$3;$3=tmp;print}' \ - | sort -k2n,2 \ - > overlap.events.vcf - -##get info for each variant## -zcat bed.overlap.txt.gz \ - | awk 'BEGIN{FS=OFS="\t"}{print $7; print $14;}' \ - | sort -u \ - > combined.bed -for var in EV RD_CN GT -do - echo '~~~' >> combined.bed - cat vcf_header overlap.events.vcf \ - | vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - | awk 'BEGIN{FS=OFS="\t"} \ - NR==1{for (i=3;i<=NF;i++) header[i]=$i} \ - NR>1 {for (j=3;j<=NF;j++) print $1"@"header[j],$j}' \ - | sort -k1,1 \ - | join -t ' ' -1 1 -2 1 combined.bed - \ - > combined.bed.tmp - mv combined.bed.tmp combined.bed -done - -zcat bed.overlap.txt.gz \ - | join -t ' ' -1 7 -2 1 - combined.bed \ - | cut -f2- \ - | sort -k13,13 \ - | join -t ' ' -1 13 -2 1 - combined.bed \ - | cut -f2- \ - | awk 'BEGIN{FS=OFS="\t"}{print $3-$2,$9-$8,$0}' \ - | sort -k1nr,1 -k2nr,2 \ - | cut -f3- \ - | gzip \ - > all.combined.bed.gz - -zcat all.combined.bed.gz \ - | awk -v multiCNVFile=$multi_cnv ' \ - function makeRevision( id, val ) { reviseCN[id] = val; if ( val == 2 ) wasRevisedToNormal[id] = 1; } \ - BEGIN \ - {FS=OFS="\t"; \ - while ( getline < multiCNVFile ) multiCNV[$0] = 1; \ - close(multiCNVFile)} \ - {chr_sample1 = $1; start1 = $2; stop1 = $3; ev1 = $4; svtype1 = $5; sample1 = $6; \ - chr_sample2 = $7; start2 = $8; stop2 = $9; ev2 = $10; svtype2 = $11; sample2 = $12; \ - support1 = $13; RD_CN1 = $14; GT1 = $15; support2 = $16; RD_CN2 = $17; GT2 = $18; \ - id1 = ev1"@"sample1; id2 = ev2"@"sample2; \ - length1 = stop1 - start1; length2 = stop2 - start2; \ - if ( id1 in wasRevisedToNormal ) next; \ - if ( id1 in reviseCN ) RD_CN1 = reviseCN[id1]; \ - if ( id2 in reviseCN ) RD_CN2 = reviseCN[id2]; \ - overlap = (stop1 < stop2 ? stop1 : stop2) - (start1 > start2 ? 
start1 : start2); \ - smallOverlap50 = overlap/length2 > .5; \ - largeOverlap50 = overlap/length1 > .5; \ - ##Call where smaller depth call is being driven by larger## \ - if ( support1 ~ /RD/ && support1 != "RD" && support2 == "RD" && smallOverlap50 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } \ - ##Smaller CNV driving larger CNV genotype## \ - else if ( support1 == "RD" && support2 ~ /RD/ && support2 != "RD" && smallOverlap50 && !(ev2 in multiCNV) && GT2 != "0/0" && largeOverlap50 ) { \ - if ( RD_CN2 == 0 ) makeRevision(id1, RD_CN1 + 2); \ - else if ( RD_CN2 == 1 ) makeRevision(id1, RD_CN1 + RD_CN2); \ - else if ( RD_CN2 > 1 ) { newCN = RD_CN1 - RD_CN2 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id1, newCN); } } \ - ##Depth only calls where smaller call is being driven by larger## \ - else if ( support1 == "RD" && support2 == "RD" && smallOverlap50 && svtype1 == svtype2 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 && RD_CN1 != RD_CN2 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 && RD_CN1 > RD_CN2 ) makeRevision(id2, 1); \ - else if ( RD_CN1 > 1 && RD_CN1 < RD_CN2 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } \ - else makeRevision(id2, 2); } \ - ##Any other time a larger call is driving a smaller call## \ - else if ( support1 ~ /RD/ && smallOverlap50 && length2 > 5000 && !(ev1 in multiCNV) ) { \ - if ( RD_CN1 == 0 ) makeRevision(id2, RD_CN2 + 2); \ - else if ( RD_CN1 == 1 ) makeRevision(id2, RD_CN2 + RD_CN1); \ - else if ( RD_CN1 > 1 ) { newCN = RD_CN2 - RD_CN1 + 2; if ( newCN < 0 ) newCN = 0; makeRevision(id2, newCN); } } } \ - END \ - {for ( id in reviseCN ) print id,reviseCN[id]; }' \ - | sed 's/@/ /' \ - | sort \ - | awk 'BEGIN{FS=OFS="\t"}{if ($3<0) $3=0; print $0}' \ - > $outputfile diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java deleted file mode 100644 index 5637154bb..000000000 --- a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java +++ /dev/null @@ -1,316 +0,0 @@ -package org.broadinstitute.svpipeline; - -import java.io.*; -import java.nio.charset.StandardCharsets; -import java.util.*; -import java.util.regex.Pattern; -import org.broadinstitute.svpipeline.VCFParser.*; - -public class CleanVCFPart1 { - private static final ByteSequence[] EV_VALS = { - null, - new ByteSequence("RD"), - new ByteSequence("PE"), - new ByteSequence("RD,PE"), - new ByteSequence("SR"), - new ByteSequence("RD,SR"), - new ByteSequence("PE,SR"), - new ByteSequence("RD,PE,SR") - }; - private static final ByteSequence FORMAT_LINE = new ByteSequence("FORMAT"); - private static final ByteSequence ID_KEY = new ByteSequence("ID"); - private static final ByteSequence EV_VALUE = new ByteSequence("EV"); - private static final ByteSequence TYPE_KEY = new ByteSequence("Type"); - private static final ByteSequence STRING_VALUE = new ByteSequence("String"); - private static final ByteSequence NUMBER_KEY = new ByteSequence("Number"); - private static final ByteSequence SVTYPE_KEY = new ByteSequence("SVTYPE"); - private static final ByteSequence ME_VALUE = new ByteSequence(":ME"); - private static final ByteSequence LT_VALUE = new ByteSequence("<"); - private static final ByteSequence GT_VALUE = new ByteSequence(">"); - 
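Stepping back to the overlap-revision logic in the deleted clean_vcf_part2.sh above: the `makeRevision` arithmetic is easy to misread in awk. Here is a hedged Python sketch of just the copy-number re-centering rule (a smaller RD-supported call overlapped more than 50% by a larger call has its RD_CN re-centered around the larger call's RD_CN), assuming the same semantics as the awk branches:

```python
def revise_rd_cn(rd_cn_large: int, rd_cn_small: int) -> int:
    """Re-center the smaller call's copy state around the larger overlapping call.

    Mirrors the awk branches: a large-call RD_CN of 0 or 1 shifts the small call up,
    while RD_CN > 1 subtracts the excess, flooring at 0.
    """
    if rd_cn_large == 0:
        return rd_cn_small + 2
    if rd_cn_large == 1:
        return rd_cn_small + rd_cn_large  # i.e. rd_cn_small + 1
    return max(rd_cn_small - rd_cn_large + 2, 0)

# Worked example: a larger call with RD_CN=1 over a smaller call with RD_CN=2
assert revise_rd_cn(1, 2) == 3
# A larger call with RD_CN=4 over a smaller call with RD_CN=2 floors at 0
assert revise_rd_cn(4, 2) == 0
```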
-    private static final ByteSequence N_VALUE = new ByteSequence("N");
-    private static final ByteSequence END_KEY = new ByteSequence("END");
-    private static final ByteSequence VARGQ_KEY = new ByteSequence("varGQ");
-    private static final ByteSequence MULTIALLELIC_KEY = new ByteSequence("MULTIALLELIC");
-    private static final ByteSequence UNRESOLVED_KEY = new ByteSequence("UNRESOLVED");
-    private static final ByteSequence HIGH_SR_BACKGROUND_KEY = new ByteSequence("HIGH_SR_BACKGROUND");
-    private static final ByteSequence BOTHSIDES_SUPPORT_KEY = new ByteSequence("BOTHSIDES_SUPPORT");
-    private static final ByteSequence DEL_VALUE = new ByteSequence("DEL");
-    private static final ByteSequence DUP_VALUE = new ByteSequence("DUP");
-    private static final ByteSequence RDCN_VALUE = new ByteSequence("RD_CN");
-    private static final ByteSequence MISSING_VALUE = new ByteSequence(".");
-    private static final ByteSequence MISSING_GENOTYPE = new ByteSequence("./.");
-    private static final ByteSequence GT_REF_REF = new ByteSequence("0/0");
-    private static final ByteSequence GT_REF_ALT = new ByteSequence("0/1");
-    private static final ByteSequence GT_ALT_ALT = new ByteSequence("1/1");
-
-    private static final int MIN_ALLOSOME_EVENT_SIZE = 5000;
-
-    public static void main( final String[] args ) {
-        if ( args.length != 8 ) {
-            System.err.println("Usage: java org.broadinstitute.svpipeline.CleanVCFPart1 " +
-                    "INPUTVCFFILE PEDIGREES XCHR YCHR NOISYEVENTS BOTHSIDES SAMPLESOUT REVISEDEVENTSOUT");
-            System.exit(1);
-        }
-        final VCFParser parser = new VCFParser(args[0]);
-        final ByteSequence xChrName = new ByteSequence(args[2]);
-        final ByteSequence yChrName = new ByteSequence(args[3]);
-        final Set<ByteSequence> noisyEvents = readLastColumn(args[4]);
-        final Set<ByteSequence> bothsidesSupportEvents = readLastColumn(args[5]);
-        try ( final OutputStream os
-                      = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out));
-              final OutputStream osSamples = new BufferedOutputStream(new FileOutputStream(args[6]));
-              final OutputStream osRevEvents = new BufferedOutputStream(new FileOutputStream(args[7])) ) {
-            int[] sexForSample = null;
-            while ( parser.hasMetadata() ) {
-                final Metadata metadata = parser.nextMetaData();
-                if ( metadata instanceof ColumnHeaderMetadata ) {
-                    final ColumnHeaderMetadata cols = ((ColumnHeaderMetadata)metadata);
-                    final List<ByteSequence> colNames = cols.getValue();
-                    final int nCols = colNames.size();
-                    for ( int idx = 9; idx < nCols; ++idx ) {
-                        colNames.get(idx).write(osSamples);
-                        osSamples.write('\n');
-                    }
-                    sexForSample = readPedFile(args[1], cols.getValue());
-                    os.write(("##INFO=<ID=HIGH_SR_BACKGROUND,Number=0,Type=Flag,Description=\"High number of SR splits in background samples indicating messy region\">\n")
-                            .getBytes(StandardCharsets.UTF_8));
-                    os.write("##FILTER=<ID=UNRESOLVED,Description=\"Variant is unresolved\">\n"
-                            .getBytes(StandardCharsets.UTF_8));
-                    os.write(("##INFO=<ID=BOTHSIDES_SUPPORT,Number=0,Type=Flag,Description=\"Variant has read-level support for both sides of breakpoint\">\n")
-                            .getBytes(StandardCharsets.UTF_8));
-                } else if ( metadata instanceof KeyAttributesMetadata ) {
-                    final KeyAttributesMetadata keyAttrs = (KeyAttributesMetadata)metadata;
-                    if ( keyAttrs.getKey().equals(FORMAT_LINE) ) {
-                        final List<KeyValue> kvs = keyAttrs.getValue();
-                        final int nKVs = kvs.size();
-                        if ( nKVs > 2 ) {
-                            final KeyValue kv0 = kvs.get(0);
-                            final KeyValue kv1 = kvs.get(1);
-                            final KeyValue kv2 = kvs.get(2);
-                            if ( kv0.getKey().equals(ID_KEY) && kv0.getValue().equals(EV_VALUE) ) {
-                                if ( kv1.getKey().equals(NUMBER_KEY) ) {
-                                    kvs.set(1, new KeyValue(NUMBER_KEY, MISSING_VALUE));
-                                }
-                                if ( kv2.getKey().equals(TYPE_KEY) ) {
-                                    kvs.set(2, new KeyValue(TYPE_KEY, STRING_VALUE));
-                                }
-                            }
-                        }
-                    }
-                }
-                metadata.write(os);
-            }
-            if ( sexForSample == null ) {
-                throw new RuntimeException("header line with sample names is missing.");
missing."); - } - while ( parser.hasRecord() ) { - final Record record = parser.nextRecord(); - - // replace the numeric EV value with a text value - final int evIdx = record.getFormat().indexOf(EV_VALUE); - if ( evIdx >= 0 ) { - for ( final CompoundField genotypeVals : record.getGenotypes() ) { - genotypeVals.set(evIdx, EV_VALS[genotypeVals.get(evIdx).asInt()]); - } - } - - // move the SVTYPE to the ALT field (except for MEs) - final InfoField info = record.getInfo(); - final ByteSequence svType = info.get(SVTYPE_KEY); - if ( !record.getAlt().contains(ME_VALUE) ) { - if ( svType != null ) { - record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); - } - } - record.setRef(N_VALUE); - - // move varGQ info field to quality column - final ByteSequence varGQ = info.get(VARGQ_KEY); - if ( varGQ != null ) { - record.setQuality(varGQ); - info.remove(VARGQ_KEY); - } - - // remove MULTIALLELIC flag, if present - info.remove(MULTIALLELIC_KEY); - - // remove UNRESOLVED flag and add it as a filter - if ( info.containsKey(UNRESOLVED_KEY) ) { - record.getFilter().add(UNRESOLVED_KEY); - info.remove(UNRESOLVED_KEY); - } - - // mark noisy events - if ( noisyEvents.contains(record.getID()) ) { - record.getInfo().put(HIGH_SR_BACKGROUND_KEY, null); - } - - // mark bothsides support - if ( bothsidesSupportEvents.contains(record.getID()) ) { - record.getInfo().put(BOTHSIDES_SUPPORT_KEY, null); - } - - // fix genotypes on allosomes - final boolean isY; - if ( (isY = yChrName.equals(record.getChromosome())) || - xChrName.equals(record.getChromosome())) { - final List genotypes = record.getGenotypes(); - final int rdCNIndex = record.getFormat().indexOf(RDCN_VALUE); - final ByteSequence end = info.get(END_KEY); - boolean adjustMale = false; - final boolean isDel; - if ( ((isDel = DEL_VALUE.equals(svType)) || DUP_VALUE.equals(svType)) && rdCNIndex >= 0 && end != null && - end.asInt() + 1 - record.getPosition() > MIN_ALLOSOME_EVENT_SIZE ) { - adjustMale = isRevisableEvent(genotypes, rdCNIndex, sexForSample, isY); - if ( adjustMale ) { - record.getID().write(osRevEvents); - osRevEvents.write('\n'); - } - } - CompoundField emptyGenotype = null; - final int nSamples = genotypes.size(); - for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { - final int sampleSex = sexForSample[sampleIdx]; - final CompoundField genotype = genotypes.get(sampleIdx); - if ( sampleSex == 1 ) { - if ( adjustMale ) { - final ByteSequence rdCN = genotype.get(rdCNIndex); - if ( rdCN.equals(MISSING_VALUE) ) { - continue; - } - final int rdCNVal = rdCN.asInt(); - genotype.set(rdCNIndex, new ByteSequence(Integer.toString(rdCNVal + 1))); - if ( isDel ) { - if ( rdCNVal >= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 0 ) genotype.set(0, GT_REF_ALT); - } else { - if ( rdCNVal <= 1 ) genotype.set(0, GT_REF_REF); - else if ( rdCNVal == 2 ) genotype.set(0, GT_REF_ALT); - else genotype.set(0, GT_ALT_ALT); - } - } - } else if ( sampleSex == 2 ) { - if ( isY ) { - if ( emptyGenotype == null ) { - emptyGenotype = new CompoundField(MISSING_GENOTYPE, ':'); - int nFields = genotype.size(); - while ( --nFields > 0 ) { - emptyGenotype.add(MISSING_VALUE); - } - emptyGenotype.getValue(); // performance hack to put the pieces together - } - genotypes.set(sampleIdx, emptyGenotype); - } - } else { - genotype.set(0, MISSING_GENOTYPE); - } - } - } - - record.write(os); - } - } catch ( final IOException ioe ) { - throw new RuntimeException("Can't write to stdout", ioe); - } - } - - private static boolean isRevisableEvent( final List genotypes, - 
-                                             final int rdCNIndex,
-                                             final int[] sexForColumn,
-                                             final boolean isY ) {
-        // We're going to calculate the median rdCN values for males and females.
-        // We only care if the median is 0, 1, 2, or something larger, so we'll use 4 bins to
-        // sum up the counts: all values >2 go into the last bucket.
-        final int[] maleCounts = new int[4];
-        final int[] femaleCounts = new int[4];
-        final int nSamples = genotypes.size();
-        for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) {
-            final ByteSequence rdCN = genotypes.get(sampleIdx).get(rdCNIndex);
-            if ( MISSING_VALUE.equals(rdCN) ) {
-                continue;
-            }
-            int rdCNVal = rdCN.asInt();
-            if ( rdCNVal > 2 ) {
-                rdCNVal = 3;
-            }
-            final int sampleSex = sexForColumn[sampleIdx];
-            if ( sampleSex == 1 ) {
-                maleCounts[rdCNVal] += 1;
-            } else if ( sampleSex == 2 ) {
-                femaleCounts[rdCNVal] += 1;
-            }
-        }
-        final double maleMedian = calcMedian(maleCounts);
-        double femaleMedian = calcMedian(femaleCounts);
-        return maleMedian == 1. && (isY ? femaleMedian == 0. : femaleMedian == 2.);
-    }
-
-    // visible for testing
-    static double calcMedian( final int[] counts ) {
-        final double target = (counts[0] + counts[1] + counts[2] + counts[3]) / 2.;
-        if ( target == 0. ) {
-            return Double.NaN;
-        }
-        int total = 0;
-        for ( int iii = 0; iii < 4; ++iii ) {
-            total += counts[iii];
-            if ( total == target ) {
-                return iii + .5;
-            } else if ( total > target ) {
-                return (double)iii;
-            }
-        }
-        throw new IllegalStateException("we should never reach this statement");
-    }
-
-    private static Set<ByteSequence> readLastColumn( final String filename ) {
-        final Set<ByteSequence> values = new HashSet<>();
-        try {
-            final BufferedReader neRdr =
-                    new BufferedReader(new InputStreamReader(new FileInputStream(filename)));
-            String line;
-            while ( (line = neRdr.readLine()) != null ) {
-                final String lastCol = line.substring(line.lastIndexOf('\t') + 1);
-                values.add(new ByteSequence(lastCol));
-            }
-        } catch ( final IOException ioe ) {
-            throw new RuntimeException("can't read table file " + filename);
-        }
-        return values;
-    }
-
-    private static int[] readPedFile( final String pedFilename, List<ByteSequence> sampleNames ) {
-        final int nCols = sampleNames.size() - 9;
-        final Map<ByteSequence, Integer> sexForSampleMap = new HashMap<>(2*nCols);
-        final int[] sexForSample = new int[nCols];
-        try {
-            final BufferedReader pedRdr =
-                    new BufferedReader(new InputStreamReader(new FileInputStream(pedFilename)));
-            final Pattern tabPattern = Pattern.compile("\\t");
-            String line;
-            while ( (line = pedRdr.readLine()) != null ) {
-                if ( line.startsWith("#") ) continue;
-                final Scanner scanner = new Scanner(line).useDelimiter(tabPattern);
-                scanner.next(); // family ignored
-                final String sampleName = scanner.next();
-                scanner.next(); // mom ignored
-                scanner.next(); // pop ignored
-                final int sex = scanner.nextInt();
-                sexForSampleMap.put(new ByteSequence(sampleName), sex);
-            }
-        } catch ( final IOException ioe ) {
-            throw new RuntimeException("can't read " + pedFilename, ioe);
-        }
-        for ( int col = 0; col < nCols; ++col ) {
-            final ByteSequence sampleName = sampleNames.get(col + 9);
-            final Integer sex = sexForSampleMap.get(sampleName);
-            if ( sex == null ) {
-                throw new RuntimeException("can't determine sex for sample " + sampleName);
-            }
-            sexForSample[col] = sex;
-        }
-        return sexForSample;
-    }
-}
diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java
deleted file mode 100644
index 77a6b5658..000000000
--- 
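The binned-median trick in CleanVCFPart1.isRevisableEvent above (exercised by the unit test deleted just below) is compact enough to misread. A hedged Python transcription, assuming the same four-bin convention (counts of RD_CN values 0, 1, 2, and >2):

```python
import math

def calc_median(counts: list) -> float:
    """Median over four bins holding counts of RD_CN values 0, 1, 2, and >2."""
    target = sum(counts) / 2.0
    if target == 0.0:
        return math.nan
    total = 0
    for i, c in enumerate(counts):
        total += c
        if total == target:
            return i + 0.5  # median straddles two bins
        if total > target:
            return float(i)
    raise AssertionError("unreachable for non-empty counts")

# An X-chromosome DEL/DUP is revisable when the male median is 1 and the female
# median is 2 (on Y, the female median must be 0 instead).
assert calc_median([0, 10, 0, 0]) == 1.0  # typical males on chrX
assert calc_median([0, 0, 10, 0]) == 2.0  # typical females on chrX
assert calc_median([5, 5, 0, 0]) == 0.5   # even split straddles bins 0 and 1
```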
a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java +++ /dev/null @@ -1,40 +0,0 @@ -package org.broadinstitute.svpipeline; - -public class CleanVCFPart1UnitTest { - public static void main( final String[] args ) { - testAsserts(); - testMedianCalculation(); - System.out.println("OK"); - } - - public static void testAsserts() { - boolean caughtIt = false; - try { - assert(false); - } catch ( final AssertionError ae ) { - caughtIt = true; - } - if ( !caughtIt ) { - throw new AssertionError("assertions aren't turned on, so you're not testing anything."); - } - } - - public static void testMedianCalculation() { - final int[] counts = new int[4]; - assert(Double.isNaN(CleanVCFPart1.calcMedian(counts))); - counts[0] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.0); - counts[1] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 0.5); - counts[2] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.0); - counts[3] = 1; - assert(CleanVCFPart1.calcMedian(counts) == 1.5); - counts[2] = 2; - assert(CleanVCFPart1.calcMedian(counts) == 2.0); - counts[3] = 4; - assert(CleanVCFPart1.calcMedian(counts) == 2.5); - counts[3] = 5; - assert(CleanVCFPart1.calcMedian(counts) == 3.0); - } -} diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index ab228ccf7..f58c7f4f1 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -141,6 +141,7 @@ workflow CleanVcf { LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, + gatk_docker="docker.io/broadinstitute/gatk:3eb5c3d38d6c8c65e71f29abe9346c98bfbb1cbe", linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, @@ -167,14 +168,6 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index 085aaa5e5..f4396df52 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -8,7 +8,6 @@ workflow CleanVcf5 { File normal_revise_vcf File revise_vcf_lines File ped_file - File sex_chr_revise File multi_ids File? outlier_samples_list @@ -44,7 +43,6 @@ workflow CleanVcf5 { revise_vcf_lines=revise_vcf_lines, normal_revise_vcf=ScatterVcf.shards[i], ped_file=ped_file, - sex_chr_revise=sex_chr_revise, multi_ids=multi_ids, outlier_samples_list=outlier_samples_list, make_clean_gq_script=make_clean_gq_script, @@ -83,7 +81,6 @@ task MakeCleanGQ { File revise_vcf_lines File normal_revise_vcf File ped_file - File sex_chr_revise File multi_ids File? outlier_samples_list File? 
make_clean_gq_script @@ -96,7 +93,7 @@ task MakeCleanGQ { # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, multi_ids, outlier_samples_list]), "GB") Float base_disk_gb = 10.0 @@ -133,7 +130,6 @@ task MakeCleanGQ { revise.vcf.lines.vcf.gz \ ~{normal_revise_vcf} \ ~{ped_file} \ - ~{sex_chr_revise} \ ~{multi_ids} \ outliers.txt \ ~{prefix} diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index a14ffa8c4..3e5332ad2 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -3,852 +3,851 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "FormatVcfForGatk.wdl" as fvcf -import "CleanVcf1b.wdl" as c1b -import "CleanVcf5.wdl" as c5 import "HailMerge.wdl" as HailMerge workflow CleanVcfChromosome { - input { - File vcf - String contig - File background_list - File ped_file - File allosome_fai - String prefix - Int max_shards_per_chrom_step1 - File bothsides_pass_list - Int min_records_per_shard_step1 - Int samples_per_step2_shard - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard - Int? clean_vcf5_threads_per_task - File? outlier_samples_list - Int? max_samples_per_shard_step3 - - File HERVK_reference - File LINE1_reference - - File ploidy_table - String chr_x - String chr_y - - File? svtk_to_gatk_script # For debugging - - Boolean use_hail - String? gcs_project - - String linux_docker - String sv_base_mini_docker - String sv_pipeline_docker - - # overrides for local tasks - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish - RuntimeAttr? runtime_override_stitch_fragmented_cnvs - RuntimeAttr? runtime_override_final_cleanup - RuntimeAttr? runtime_override_rescue_me_dels - - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - - RuntimeAttr? runtime_override_preconcat_drc - RuntimeAttr? runtime_override_hail_merge_drc - RuntimeAttr? runtime_override_fix_header_drc - - # overrides for MiniTasks - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 - RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs - RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs - RuntimeAttr? runtime_attr_format - - } - - call MiniTasks.SplitVcf as SplitVcfToClean { - input: - vcf=vcf, - contig=contig, - prefix="~{prefix}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf_to_clean - } - - scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { - call CleanVcf1a { - input: - vcf=SplitVcfToClean.vcf_shards[i], - prefix="~{prefix}.clean_vcf_1.shard_~{i}", - background_fail_list=background_list, - bothsides_pass_list=bothsides_pass_list, - ped_file=ped_file, - allosome_fai=allosome_fai, - chr_x=chr_x, - chr_y=chr_y, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_1a - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep1VcfsHail { - input: - vcfs=CleanVcf1a.intermediate_vcf, - prefix="~{prefix}.combine_step_1_vcfs", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix="~{prefix}.combine_step_1_vcfs", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } - } - - call MiniTasks.CatUncompressedFiles as CombineStep1SexChrRevisions { - input: - shards=CleanVcf1a.sex, - outfile_name="~{prefix}.combine_step_1_sex_chr_revisions.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_sex_chr_revisions - } - - call c1b.CleanVcf1b { - input: - intermediate_vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_1b", - records_per_shard=clean_vcf1b_records_per_shard, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_subset_large_cnvs=runtime_attr_override_subset_large_cnvs_1b, - runtime_attr_override_sort_bed=runtime_attr_override_sort_bed_1b, - runtime_attr_override_intersect_bed=runtime_attr_override_intersect_bed_1b, - runtime_attr_override_build_dict=runtime_attr_override_build_dict_1b, - runtime_attr_override_scatter=runtime_attr_override_scatter_1b, - runtime_attr_override_filter_vcf=runtime_attr_override_filter_vcf_1b, - runtime_override_concat_vcfs=runtime_override_concat_vcfs_1b, - runtime_override_cat_multi_cnvs=runtime_override_cat_multi_cnvs_1b - } - - call MiniTasks.SplitUncompressed as SplitIncludeList { - input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - multi_cnvs=CleanVcf1b.multi, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } - } - - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { 
- input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 - } - - call CleanVcf3 { - input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - rd_cn_revise=CleanVcf3.shards[i], - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - call MiniTasks.CatUncompressedFiles as CombineRevised4 { - input: - shards=CleanVcf4.out, - outfile_name="~{prefix}.combine_revised_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_revised_4 - } - - call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { - input: - shards=CleanVcf4.multi_ids, - outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_multi_ids_4 - } - - call c5.CleanVcf5 { - input: - revise_vcf_lines=CombineRevised4.outfile, - normal_revise_vcf=CleanVcf1b.normal, - ped_file=ped_file, - sex_chr_revise=CombineStep1SexChrRevisions.outfile, - multi_ids=CombineMultiIds4.outfile, - outlier_samples_list=outlier_samples_list, - contig=contig, - prefix="~{prefix}.clean_vcf_5", - records_per_shard=clean_vcf5_records_per_shard, - threads_per_task=clean_vcf5_threads_per_task, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, - runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_attr_override_polish=runtime_override_clean_vcf_5_polish - } - - call DropRedundantCnvs { - input: - vcf=CleanVcf5.polished, - prefix="~{prefix}.drop_redundant_cnvs", - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_drop_redundant_cnvs - } - - if (use_hail) { - call HailMerge.HailMerge as SortDropRedundantCnvsHail { - input: - vcfs=[DropRedundantCnvs.out], - prefix="~{prefix}.drop_redundant_cnvs.sorted", - gcs_project=gcs_project, - reset_cnv_gts=true, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_drc, - runtime_override_hail_merge=runtime_override_hail_merge_drc, - runtime_override_fix_header=runtime_override_fix_header_drc - } - } - if (!use_hail) { - call MiniTasks.SortVcf as SortDropRedundantCnvs { - input: - vcf=DropRedundantCnvs.out, - outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_sort_drop_redundant_cnvs - } - } - - call StitchFragmentedCnvs { - input: - vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), - prefix="~{prefix}.stitch_fragmented_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_stitch_fragmented_cnvs - } - - call RescueMobileElementDeletions { - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, 
- sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } - - call FinalCleanup { - input: - vcf=RescueMobileElementDeletions.out, - contig=contig, - prefix="~{prefix}.final_cleanup", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_final_cleanup - } - - call fvcf.FormatVcf { - input: - vcf=FinalCleanup.final_cleaned_shard, - ploidy_table=ploidy_table, - args="--scale-down-gq", - output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format - } - - output { - File out = FormatVcf.out - File out_idx = FormatVcf.out_index - } + input { + File vcf + String contig + File background_list + File ped_file + File allosome_fai + String prefix + Int max_shards_per_chrom_step1 + File bothsides_pass_list + Int min_records_per_shard_step1 + Int samples_per_step2_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task + File? outlier_samples_list + Int? max_samples_per_shard_step3 + + File HERVK_reference + File LINE1_reference + + File ploidy_table + String chr_x + String chr_y + + File? svtk_to_gatk_script # For debugging + + Boolean use_hail + String? gcs_project + + String gatk_docker + String linux_docker + String sv_base_mini_docker + String sv_pipeline_docker + + # overrides for local tasks + RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b + RuntimeAttr? runtime_override_clean_vcf_2 + RuntimeAttr? runtime_override_clean_vcf_3 + RuntimeAttr? runtime_override_clean_vcf_4 + RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_override_stitch_fragmented_cnvs + RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_override_rescue_me_dels + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + + # overrides for MiniTasks + RuntimeAttr? runtime_override_split_vcf_to_clean + RuntimeAttr? runtime_override_split_include_list + RuntimeAttr? runtime_override_combine_clean_vcf_2 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs + RuntimeAttr? 
runtime_attr_format + } + + call fvcf.FormatVcf as FormatVcfToClean { + input: + vcf=vcf, + ploidy_table=ploidy_table, + output_prefix="~{prefix}.formatted", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + + call MiniTasks.SplitVcf as SplitVcfToClean { + input: + vcf=FormatVcfToClean.out, + contig=contig, + prefix="~{prefix}.shard_", + n_shards=max_shards_per_chrom_step1, + min_vars_per_shard=min_records_per_shard_step1, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_split_vcf_to_clean + } + + scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { + call CleanVcf1a { + input: + vcf=SplitVcfToClean.vcf_shards[i], + prefix="~{prefix}.clean_vcf_1a.shard_~{i}", + background_fail_list=background_list, + bothsides_pass_list=bothsides_pass_list, + ped_file=ped_file, + allosome_fai=allosome_fai, + chr_x=chr_x, + chr_y=chr_y, + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_1a + } + } + + if (use_hail) { + call HailMerge.HailMerge as CombineStep1VcfsHail { + input: + vcfs=CleanVcf1a.intermediate_vcf, + prefix="~{prefix}.combine_step_1_vcfs", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep1Vcfs { + input: + vcfs=CleanVcf1a.intermediate_vcf, + vcfs_idx=CleanVcf1a.intermediate_vcf_idx, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.combine_step_1_vcfs", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } + } + + call CleanVcf1b { + input: + vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_1b", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_1b + } + + call MiniTasks.SplitUncompressed as SplitIncludeList { + input: + whole_file=CleanVcf1a.include_list[0], + lines_per_shard=samples_per_step2_shard, + shard_prefix="~{prefix}.split_include_list.", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_split_include_list + } + + scatter ( i in range(length(SplitIncludeList.shards)) ){ + call CleanVcf2 { + input: + vcf=CleanVcf1b.out, + prefix="~{prefix}.clean_vcf_2.shard_~{i}", + include_list=SplitIncludeList.shards[i], + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_2 + } + } + + call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { + input: + shards=CleanVcf2.out, + outfile_name="~{prefix}.combine_clean_vcf_2.txt", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_clean_vcf_2 + } + + call CleanVcf3 { + input: + rd_cn_revise=CombineCleanVcf2.outfile, + max_samples_shard = max_samples_per_shard_step3, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_clean_vcf_3 + } + + scatter ( i in range(length(CleanVcf3.shards)) ){ + call CleanVcf4 { + input: + vcf=CleanVcf1b.out, + prefix="~{prefix}.clean_vcf_4.shard_~{i}", + outlier_samples_list=outlier_samples_list, + rd_cn_revise=CleanVcf3.shards[i], + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_4 + } + } + + if (use_hail) { + call HailMerge.HailMerge as CombineStep4VcfsHail { + input: + 
vcfs=CleanVcf4.out, + prefix="~{prefix}.combine_revised_4", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep4Vcfs { + input: + vcfs=CleanVcf4.out, + vcfs_idx=CleanVcf4.out_idx, + naive=true, + generate_index=true, + outfile_prefix="~{prefix}.combine_revised_4", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } + } + + call CleanVcf5 { + input: + vcf=select_first([CombineStep4Vcfs.concat_vcf, CombineStep4VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_5", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_override_clean_vcf_5 + } + + call DropRedundantCnvs { + input: + vcf=CleanVcf5.out, + prefix="~{prefix}.drop_redundant_cnvs", + contig=contig, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_drop_redundant_cnvs + } + + if (use_hail) { + call HailMerge.HailMerge as SortDropRedundantCnvsHail { + input: + vcfs=[DropRedundantCnvs.out], + prefix="~{prefix}.drop_redundant_cnvs.sorted", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_override_preconcat=runtime_override_preconcat_drc, + runtime_override_hail_merge=runtime_override_hail_merge_drc, + runtime_override_fix_header=runtime_override_fix_header_drc + } + } + if (!use_hail) { + call MiniTasks.SortVcf as SortDropRedundantCnvs { + input: + vcf=DropRedundantCnvs.out, + outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_sort_drop_redundant_cnvs + } + } + + call StitchFragmentedCnvs { + input: + vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), + prefix="~{prefix}.stitch_fragmented_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_stitch_fragmented_cnvs + } + + call RescueMobileElementDeletions { + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } + + call FinalCleanup { + input: + vcf=RescueMobileElementDeletions.out, + contig=contig, + prefix="~{prefix}.final_cleanup", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_final_cleanup + } + + call fvcf.FormatVcf as FormatVcfToOutput { + input: + vcf=FinalCleanup.final_cleaned_shard, + ploidy_table=ploidy_table, + args="--scale-down-gq", + output_prefix="~{prefix}.final_format", + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + + output { + File out = FormatVcfToOutput.out + File out_idx = FormatVcfToOutput.out_index + } } task CleanVcf1a { - input { - File vcf - String prefix - File background_fail_list - File bothsides_pass_list - File ped_file - File allosome_fai - String chr_x - String chr_y - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - touch ~{prefix}.includelist.txt - touch ~{prefix}.sexchr.revise.txt - - # outputs - # includelist.txt: the names of all the samples in the input vcf - # sexchr.revise.txt: the names of the events where genotypes got tweaked on allosomes - # stdout: a revised vcf - java -jar $CLEAN_VCF_PART_1_JAR \ - ~{vcf} \ - ~{ped_file} \ - ~{chr_x} \ - ~{chr_y} \ - ~{background_fail_list} \ - ~{bothsides_pass_list} \ - ~{prefix}.includelist.txt \ - ~{prefix}.sexchr.revise.txt \ - | bgzip \ - > ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File include_list="~{prefix}.includelist.txt" - File sex="~{prefix}.sexchr.revise.txt" - File intermediate_vcf="~{prefix}.vcf.gz" - File intermediate_vcf_idx="~{prefix}.vcf.gz.tbi" - } + input { + File vcf + String prefix + File background_fail_list + File bothsides_pass_list + File ped_file + File allosome_fai + String chr_x + String chr_y + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + String output_samples_list = "~{prefix}.includelist.txt" + + command <<< + set -euo pipefail + + if [ ! 
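One recurring detail in the new GATK-based tasks introduced here (CleanVcf1a through CleanVcf5): the JVM heap passed via `-Xmx` is sized as `ceil(mem_gb * 1000 * 0.7)`, presumably to leave the remaining ~30% of task memory as headroom for off-heap and native use. A sketch of that arithmetic (the 0.7 factor is this patch's convention, not a GATK requirement):

```python
import math

def java_mem_mb(task_mem_gb: float, heap_fraction: float = 0.7) -> int:
    """Heap size for -Xmx, leaving (1 - heap_fraction) of task memory unreserved."""
    return math.ceil(task_mem_gb * 1000 * heap_fraction)

# The default 3.75 GB task memory yields -Xmx2625m
assert java_mem_mb(3.75) == 2625
```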
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1a \ + -V ~{vcf} \ + -O ~{output_vcf} \ + --fail-list ~{background_fail_list} \ + --pass-list ~{bothsides_pass_list} \ + --chr-X ~{chr_x} \ + --chr-Y ~{chr_y} \ + --output-samples-list ~{output_samples_list} + >>> + + output { + File include_list="~{output_samples_list}" + File intermediate_vcf="~{output_vcf}" + File intermediate_vcf_idx="~{output_vcf}.tbi" + } +} + +task CleanVcf1b { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_vcf = "~{prefix}.vcf.gz" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1b \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } } task CleanVcf2 { - input { - File normal_revise_vcf - String prefix - File include_list - File multi_cnvs - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size([normal_revise_vcf, include_list, multi_cnvs], "GB") - Float base_disk_gb = 10.0 - Float input_disk_scale = 3.0 - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - bcftools index ~{normal_revise_vcf} - /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh \ - ~{normal_revise_vcf} \ - ~{include_list} \ - ~{multi_cnvs} \ - "~{prefix}.txt" - >>> - - output { - File out="~{prefix}.txt" - } + input { + File vcf + String prefix + File include_list + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, include_list], "GB") + Float base_disk_gb = 10.0 + Float input_disk_scale = 3.0 + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_revised_list = "~{prefix}.txt" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt2 \ + -V ~{vcf} \ + --sample-list ~{include_list} \ + --output-revised-list ~{output_revised_list} + >>> + + output { + File out="~{output_revised_list}" + } } task CleanVcf3 { - input { - File rd_cn_revise - Int? max_samples_shard - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - Int max_samples_shard_ = select_first([max_samples_shard, 7000]) - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(rd_cn_revise, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} - # Ensure there is at least one shard - touch shards/out.0_0.txt - >>> - - output { - Array[File] shards = glob("shards/*") - } + input { + File rd_cn_revise + Int? max_samples_shard + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Int max_samples_shard_ = select_first([max_samples_shard, 7000]) + Float input_size = size(rd_cn_revise, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} + # Ensure there is at least one shard + touch shards/out.0_0.txt + >>> + + output { + Array[File] shards = glob("shards/*") + } } task CleanVcf4 { - input { - File rd_cn_revise - File normal_revise_vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([rd_cn_revise, normal_revise_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python3 < record_end: - break - num_gt_over_2 = 0 - for sid in record.samples: - s = record.samples[sid] - # Pick best GT - if s['PE_GT'] is None: - continue - elif s['SR_GT'] is None: - gt = s['PE_GT'] - elif s['PE_GT'] > 0 and s['SR_GT'] == 0: - gt = s['PE_GT'] - elif s['PE_GT'] == 0: - gt = s['SR_GT'] - elif s['PE_GQ'] >= s['SR_GQ']: - gt = s['PE_GT'] - else: - gt = s['SR_GT'] - if gt > 2: - num_gt_over_2 += 1 - if num_gt_over_2 > max_vf: - multi_geno_ids.add(record.id) - vcf.close() - - multi_geno_ids = sorted(list(multi_geno_ids)) - with open("~{prefix}.multi_geno_ids.txt", "w") as f: - for vid in multi_geno_ids: - f.write(vid + "\n") - CODE - - bgzip ~{prefix}.revise_vcf_lines.txt - gzip ~{prefix}.multi_geno_ids.txt - >>> - - output { - File out="~{prefix}.revise_vcf_lines.txt.gz" - File multi_ids="~{prefix}.multi_geno_ids.txt.gz" - } + input { + File vcf + String prefix + File rd_cn_revise + File? outlier_samples_list + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, rd_cn_revise], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: 50, + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + String output_vcf = "~{prefix}.vcf.gz" + Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt4 \ + -V ~{vcf} \ + -O ~{output_vcf} \ + --revised-cn-list ~{rd_cn_revise} \ + ~{if defined(outlier_samples_list) then "--outliers-list ~{outlier_samples_list}" else "" } + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + + +task CleanVcf5 { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    Float input_size = size([vcf], "GB")
+    RuntimeAttr runtime_default = object {
+        mem_gb: 2.0,
+        disk_gb: 50,
+        cpu_cores: 1,
+        preemptible_tries: 3,
+        max_retries: 1,
+        boot_disk_gb: 10
+    }
+    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    runtime {
+        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
+        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        docker: gatk_docker
+        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    }
+
+    String output_vcf = "~{prefix}.vcf.gz"
+    Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7)
+
+    command <<<
+        set -euo pipefail
+
+        if [ ! -f "~{vcf}.tbi" ]; then
+            tabix -p vcf ~{vcf}
+        fi
+
+        gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt5 \
+            -V ~{vcf} \
+            -O ~{output_vcf}
+    >>>
+
+    output {
+        File out="~{output_vcf}"
+        File out_idx="~{output_vcf}.tbi"
+    }
}


task RescueMobileElementDeletions {
-  input {
-    File vcf
-    String prefix
-    File LINE1
-    File HERVK
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcf, "GiB")
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75 + input_size * 1.5,
-    disk_gb: ceil(100.0 + input_size * 3.0),
-    cpu_cores: 1,
-    preemptible_tries: 3,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-  runtime {
-    memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
-    disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-
-    python <<CODE
-    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
-    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
+    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv
+    bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv

-    python <<CODE
-          if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
-            record.alts = ('<DEL:ME:LINE1>',)
-          if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
-            record.alts = ('<DEL:ME:HERVK>',)
-          fo.write(record)
+        if record.id in hash_MEI_DEL_reset.keys():
+          del record.filter['UNRESOLVED']
+          record.info['SVTYPE'] = 'DEL'
+          record.info['SVLEN'] = record.info['END2'] - record.start
+          record.stop = record.info['END2']
+          record.info.pop("CHR2")
+          record.info.pop("END2")
+          record.info.pop("UNRESOLVED_TYPE")
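+          # Re-tag the rescued record with the mobile-element deletion ALT
+          # allele matching its overlap source (overlap_LINE1 or overlap_HERVK)
+          # recorded in the manual-revise table built by the bedtools step above.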
+          if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1':
+            record.alts = ('<DEL:ME:LINE1>',)
+          if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK':
+            record.alts = ('<DEL:ME:HERVK>',)
+          fo.write(record)
 fin.close()
 fo.close()
 CODE
-  >>>
+    >>>
 
-  output {
-    File out = "~{prefix}.vcf.gz"
-  }
+    output {
+        File out = "~{prefix}.vcf.gz"
+    }
 }
 
 # Remove CNVs that are redundant with CPX events or other CNVs
 task DropRedundantCnvs {
-  input {
-    File vcf
-    String prefix
-    String contig
-    String sv_pipeline_docker
-    RuntimeAttr? runtime_attr_override
-  }
-
-  Float input_size = size(vcf, "GiB")
-  # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor:
-  # in tests on large VCFs, memory usage is ~1.0 * input VCF size
-  # the biggest disk usage is at the end of the task, with input + output VCF on disk
-  Int cpu_cores = 2 # speed up compression / decompression of VCFs
-  RuntimeAttr runtime_default = object {
-    mem_gb: 3.75 + input_size * 1.5,
-    disk_gb: ceil(100.0 + input_size * 2.0),
-    cpu_cores: cpu_cores,
-    preemptible_tries: 3,
-    max_retries: 1,
-    boot_disk_gb: 10
-  }
-  RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
-  runtime {
-    memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
-    disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
-    cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
-    preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
-    maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
-    docker: sv_pipeline_docker
-    bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
-  }
-
-  command <<<
-    set -euo pipefail
-    /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \
-      ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp
-  >>>
-
-  output {
-    File out = "~{prefix}.vcf.gz"
-  }
+    input {
+        File vcf
+        String prefix
+        String contig
+        String sv_pipeline_docker
+        RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp + >>> + + output { + File out = "~{prefix}.vcf.gz" + } } # Stitch fragmented RD-only calls found in 100% of the same samples task StitchFragmentedCnvs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) - Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) - - runtime { - memory: "~{mem_gb} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - echo "First pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ - | bgzip \ - > tmp.vcf.gz - rm ~{vcf} - echo "Second pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - - output { - File stitched_vcf_shard = "~{prefix}.vcf.gz" - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" + } } # Final VCF cleanup task FinalCleanup { - input { - File vcf - String contig - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 - RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} stdout \ - | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> - - output { - File final_cleaned_shard = "~{prefix}.vcf.gz" - File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" - } + input { + File vcf + String contig + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override
+    }
+
+    # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed
+    # generally assume working memory is ~3 * inputs
+    Float input_size = size(vcf, "GB")
+    Float base_disk_gb = 10.0
+    Float base_mem_gb = 2.0
+    Float input_mem_scale = 3.0
+    Float input_disk_scale = 5.0
+    RuntimeAttr runtime_default = object {
+        mem_gb: base_mem_gb + input_size * input_mem_scale,
+        disk_gb: ceil(base_disk_gb + input_size * input_disk_scale),
+        cpu_cores: 1,
+        preemptible_tries: 3,
+        max_retries: 1,
+        boot_disk_gb: 10
+    }
+    RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default])
+    runtime {
+        memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB"
+        disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD"
+        cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores])
+        preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries])
+        maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries])
+        docker: sv_pipeline_docker
+        bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb])
+    }
+
+    command <<<
+        set -eu -o pipefail
+
+        /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \
+            --chrom ~{contig} \
+            --prefix ~{prefix} \
+            ~{vcf} stdout \
+            | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz
+        tabix ~{prefix}.vcf.gz
+    >>>
+
+    output {
+        File final_cleaned_shard = "~{prefix}.vcf.gz"
+        File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi"
+    }
}
\ No newline at end of file

From 7eaa215ba3f6bb1ce1bea123be76e4d3ef239979 Mon Sep 17 00:00:00 2001
From: Karan Jaisingh 
Date: Thu, 31 Oct 2024 21:21:09 -0400
Subject: [PATCH 03/40] Further changes - reverted changes to 1b, removed 5

---
 wdl/CleanVcf1b.wdl | 353 ---------------------------------------------
 wdl/CleanVcf5.wdl  |   6 +-
 2 files changed, 5 insertions(+), 354 deletions(-)
 delete mode 100644 wdl/CleanVcf1b.wdl

diff --git a/wdl/CleanVcf1b.wdl b/wdl/CleanVcf1b.wdl
deleted file mode 100644
index 691d0591c..000000000
--- a/wdl/CleanVcf1b.wdl
+++ /dev/null
@@ -1,353 +0,0 @@
-version 1.0
-
-import "Structs.wdl"
-import "CleanVcf5.wdl" as CleanVcf5
-import "TasksMakeCohortVcf.wdl" as MiniTasks
-
-workflow CleanVcf1b {
-  input {
-    File intermediate_vcf
-    String prefix
-    Int records_per_shard
-
-    String sv_pipeline_docker
-    String sv_base_mini_docker
-
-    RuntimeAttr? runtime_attr_override_subset_large_cnvs
-    RuntimeAttr? runtime_attr_override_sort_bed
-    RuntimeAttr? runtime_attr_override_intersect_bed
-    RuntimeAttr? runtime_attr_override_build_dict
-    RuntimeAttr? runtime_attr_override_scatter
-    RuntimeAttr? runtime_attr_override_filter_vcf
-    RuntimeAttr? runtime_override_concat_vcfs
-    RuntimeAttr? 
runtime_override_cat_multi_cnvs - } - - call SubsetLargeCNVs { - input: - vcf=intermediate_vcf, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call Vcf2Bed { - input: - vcf=SubsetLargeCNVs.out, - prefix="~{prefix}.subset_large_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_subset_large_cnvs - } - - call SortBed { - input: - bed=Vcf2Bed.out, - prefix="~{prefix}.subset_large_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_sort_bed - } - - call BedtoolsIntersect { - input: - bed=SortBed.out, - prefix="~{prefix}.bedtools_intersect", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_attr_override_intersect_bed - } - - call BuildGenoNormalReviseDictionary { - input: - filtered_vcf=SubsetLargeCNVs.out, - intersect_bed=BedtoolsIntersect.out, - prefix="~{prefix}.geno_normal_revise", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_build_dict - } - - call MiniTasks.ScatterVcf { - input: - vcf=intermediate_vcf, - records_per_shard=records_per_shard, - prefix="~{prefix}.scatter_vcf", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_scatter - } - - scatter ( i in range(length(ScatterVcf.shards)) ) { - call FilterVcf { - input: - intermediate_vcf=ScatterVcf.shards[i], - dictionary_json_gz=BuildGenoNormalReviseDictionary.out, - prefix="~{prefix}.filter_vcf.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_override_filter_vcf - } - } - - call MiniTasks.ConcatVcfs as ConcatCleanVcf1bShards { - input: - vcfs=FilterVcf.out, - naive=true, - sort_vcf_list=true, - outfile_prefix="~{prefix}.concat_vcfs", - sv_base_mini_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_concat_vcfs - } - - call MiniTasks.CatUncompressedFiles as ConcatMultiCnvs { - input: - shards=FilterVcf.multi_cnvs, - outfile_name="~{prefix}.multi.cnvs.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_cat_multi_cnvs - } - - output { - File normal = ConcatCleanVcf1bShards.concat_vcf - File multi = ConcatMultiCnvs.outfile - } -} - -task SubsetLargeCNVs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --no-version \ - -i '(INFO/SVTYPE=="DEL" || INFO/SVTYPE=="DUP") && INFO/SVLEN>=5000' \ - ~{vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> - output { - File out = "~{prefix}.vcf.gz" - } -} - -task Vcf2Bed { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - svtk vcf2bed --no-header ~{vcf} stdout \ - | awk -F'\t' -v OFS='\t' '{if ($6=="") $6="blanksample";print $0}' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task SortBed { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - mkdir tmp - zcat ~{bed} \ - | sort -T tmp -k1,1 -k2,2n \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BedtoolsIntersect { - input { - File bed - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bedtools intersect -sorted -wa -wb -a <(zcat ~{bed}) -b <(zcat ~{bed}) \ - | awk -F'\t' -v OFS='\t' '$4!=$10 && $5!=$11' \ - | gzip -1 \ - > ~{prefix}.bed.gz - >>> - output { - File out = "~{prefix}.bed.gz" - } -} - -task BuildGenoNormalReviseDictionary { - input { - File filtered_vcf - File intersect_bed - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([filtered_vcf, intersect_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py ~{filtered_vcf} ~{intersect_bed} \ - | gzip -1 \ - > ~{prefix}.json.gz - >>> - output { - File out = "~{prefix}.json.gz" - } -} - -task FilterVcf { - input { - File intermediate_vcf - File dictionary_json_gz - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([intermediate_vcf, dictionary_json_gz], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py ~{dictionary_json_gz} ~{intermediate_vcf} \ - | bgzip \ - > ~{prefix}.vcf.gz - mv multi.cnvs.txt ~{prefix}.multi.cnvs.txt - >>> - output { - File out = "~{prefix}.vcf.gz" - File multi_cnvs = "~{prefix}.multi.cnvs.txt" - } -} diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index f4396df52..085aaa5e5 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -8,6 +8,7 @@ workflow CleanVcf5 { File normal_revise_vcf File revise_vcf_lines File ped_file + File sex_chr_revise File multi_ids File? outlier_samples_list @@ -43,6 +44,7 @@ workflow CleanVcf5 { revise_vcf_lines=revise_vcf_lines, normal_revise_vcf=ScatterVcf.shards[i], ped_file=ped_file, + sex_chr_revise=sex_chr_revise, multi_ids=multi_ids, outlier_samples_list=outlier_samples_list, make_clean_gq_script=make_clean_gq_script, @@ -81,6 +83,7 @@ task MakeCleanGQ { File revise_vcf_lines File normal_revise_vcf File ped_file + File sex_chr_revise File multi_ids File? outlier_samples_list File? 
make_clean_gq_script @@ -93,7 +96,7 @@ task MakeCleanGQ { # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, multi_ids, outlier_samples_list]), + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), "GB") Float base_disk_gb = 10.0 @@ -130,6 +133,7 @@ task MakeCleanGQ { revise.vcf.lines.vcf.gz \ ~{normal_revise_vcf} \ ~{ped_file} \ + ~{sex_chr_revise} \ ~{multi_ids} \ outliers.txt \ ~{prefix} From 1ad3c1c9755d0c334f04e4db977b4289539197e7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 31 Oct 2024 21:24:14 -0400 Subject: [PATCH 04/40] More files removed... --- .../scripts/clean_vcf_part1b_build_dict.py | 154 ---------- .../scripts/clean_vcf_part1b_filter.py | 82 ------ ..._vcf_part5_find_redundant_multiallelics.py | 60 ---- .../scripts/clean_vcf_part5_update_records.py | 191 ------------- wdl/CalcAF.wdl | 1 - wdl/CleanVcf5.wdl | 265 ------------------ 6 files changed, 753 deletions(-) delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py delete mode 100755 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py delete mode 100644 wdl/CleanVcf5.wdl diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py deleted file mode 100644 index b7da153cb..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py +++ /dev/null @@ -1,154 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import logging -import pybedtools -import pysam -import sys -import json - -from collections import defaultdict - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "blanksample" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -class VCFReviser: - def __init__(self): - self.rd_cn = {} - self.sample_indices_dict = {} - self.sample_list = [] - - def _update_rd_cn(self, variant, sample_indices): - self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} - - @staticmethod - def get_wider(f): - # f[1] : first interval start - # f[2] : first interval end - # f[7] : second interval start - # f[8] : second interval end - if int(f[2]) - int(f[1]) >= int(f[8]) - int(f[7]): - return f[0:6], f[6:12] - else: - return f[6:12], f[0:6] - - @staticmethod - def get_coverage(wider, narrower): - n_start = int(narrower[1]) - n_stop = int(narrower[2]) - w_start = int(wider[1]) - w_stop = int(wider[2]) - - coverage = 0 - if w_start <= n_stop and n_start <= w_stop: - intersection_size = min(n_stop, w_stop) - max(n_start, w_start) - coverage = intersection_size / (n_stop - n_start) - return coverage - - def get_geno_normal_revise(self, vcf_file, bed_file): - overlap_test_text = defaultdict(dict) - with pysam.VariantFile(vcf_file, "r") as f: - header = f.header - i = -1 - for sample in header.samples: - i += 1 - self.sample_indices_dict[sample] = i - 
self.sample_list.append(sample) - - logging.info("Filtering intersect results") - bed = pybedtools.BedTool(bed_file) - for interval in bed.intervals: - wider, narrower = self.get_wider(interval.fields) - # wider and narrower are lists/tuples with the following fields: - # [0] : contig - # [1] : start position - # [2] : end position - # [3] : variant ID - # [4] : SV type - # [5] : comma-delimited sample lists, or BLANK_SAMPLES if none - if wider[5] == BLANK_SAMPLES: - continue - - coverage = self.get_coverage(wider, narrower) - if coverage >= 0.5: - wider_samples = set(wider[5].split(",")) - narrower_samples = set(narrower[5].split(",")) - non_common_samples = [self.sample_indices_dict[s] for s in wider_samples - narrower_samples] - for x in non_common_samples: - vid = narrower[3] - overlap_test_text[vid][x] = (wider[3], wider[4]) - - # Determine for which vid/sample pairs we need RD_CN - # Substantially reduces memory - logging.info('Getting revised variant IDs') - revise_vids = defaultdict(set) - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - if v[1] == SVType.DUP or v[1] == SVType.DEL: - revise_vids[var_id].add(sample_index) - revise_vids[v[0]].add(sample_index) - - logging.info('Getting RD_CN/EV') - for variant in f: - if variant.id in revise_vids: - sample_indices = revise_vids[variant.id] - self._update_rd_cn(variant, sample_indices) - - logging.info('Generating geno_normal_revise_dict') - geno_normal_revise_dict = {} - for var_id, samples_dict in overlap_test_text.items(): - for sample_index, v in samples_dict.items(): - # v[0] : variant ID - # v[1] : SV type - new_val = None - if sample_index not in revise_vids[v[0]]: - sys.stderr.write("{} {}\n".format(sample_index, v[0])) - if v[1] == SVType.DUP and \ - self.rd_cn[var_id][sample_index] == 2 and \ - self.rd_cn[v[0]][sample_index] == 3: - new_val = 1 - elif v[1] == SVType.DEL and \ - self.rd_cn[var_id][sample_index] == 2 \ - and self.rd_cn[v[0]][sample_index] == 1: - new_val = 3 - - if new_val: - if var_id not in geno_normal_revise_dict: - geno_normal_revise_dict[var_id] = {} - sample_id = self.sample_list[sample_index] - geno_normal_revise_dict[var_id][sample_id] = new_val - - return geno_normal_revise_dict - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - reviser = VCFReviser() - filtered_vcf = args[1] - intersected_bed = args[2] - geno_normal_revise_dict = reviser.get_geno_normal_revise(filtered_vcf, intersected_bed) - logging.info('Dumping dictionary') - sys.stdout.write(json.dumps(geno_normal_revise_dict)) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py deleted file mode 100644 index e63b890cd..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py +++ /dev/null @@ -1,82 +0,0 @@ -""" -Remove CNVs that are improperly genotyped by depth because they are nested -within a real CNV -""" - -import os -import logging -import pysam -import sys -from pathlib import Path -import json -import gzip - -SVTYPE = "SVTYPE" -BLANK_SAMPLES = "B" - - -class SVType: - DUP = "DUP" - DEL = "DEL" - - -class VariantFormatTypes: - # Predicted copy state - RD_CN = "RD_CN" - # Classes of evidence supporting final genotype - EV = "EV" - - -def 
modify_variants(dict_file_gz, vcf, multi_cnvs): - logging.info('Loading dictionary') - with gzip.open(dict_file_gz, 'rt') as f: - geno_normal_revise_dict = json.load(f) - - logging.info('Filtering variants') - with pysam.VariantFile(vcf, "r") as f_in: - header = f_in.header - sys.stdout.write(str(header)) - with open(multi_cnvs, "w") as multi_cnvs_f: - variants = f_in.fetch() - for variant in variants: - if variant.id in geno_normal_revise_dict: - for sample_id in geno_normal_revise_dict[variant.id]: - o = variant.samples[sample_id] - o.update({"GT": (0, 1)}) - o.update({"GQ": o["RD_GQ"]}) - - if variant.stop - variant.start >= 1000: - if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: - is_del = variant.info[SVTYPE] == SVType.DEL - for k, v in variant.samples.items(): - rd_cn = v[VariantFormatTypes.RD_CN] - if rd_cn is None: - continue - if (is_del and rd_cn > 3) or \ - (not is_del and (rd_cn < 1 or rd_cn > 4)): - multi_cnvs_f.write(variant.id + "\n") - break - - sys.stdout.write(str(variant)) - - -def ensure_file(filename): - filename = os.path.join(".", filename) - filename = Path(filename) - if filename.exists(): - os.remove(filename) - return filename.name - - -def main(args): - logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) - logging.info('Starting script') - multi_cnvs_filename = ensure_file("multi.cnvs.txt") - dict_file_gz = args[1] - vcf_file = args[2] - modify_variants(dict_file_gz, vcf_file, multi_cnvs_filename) - logging.info('Done') - - -if __name__ == '__main__': - main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py deleted file mode 100755 index ad2b744a5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env python - -import argparse -import sys -import svtk.utils as svu - - -def process_features_for_size1(features_for_size1, redundant_multiallelics): - for intersection in sorted(features_for_size1, key=lambda x: int(x[9]) - int(x[8]), reverse=True): - b_len = int(intersection.fields[9]) - int(intersection.fields[8]) - overlap = int(intersection.fields[14]) - small_coverage = overlap / b_len - if small_coverage > 0.50: - if intersection.fields[3] not in redundant_multiallelics: - redundant_multiallelics.add(intersection.fields[10]) - - -def main(): - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('multiallelic_filename') - parser.add_argument('fout') - args = parser.parse_args() - - print("finding redundant overlapping sites", file=sys.stderr) - multiallelic_bed = svu.vcf2bedtool(args.multiallelic_filename, include_filters=True) - - redundant_multiallelics = set() - # feature fields: - # [1] : first interval start - # [2] : first interval end - # [3] : first interval variant ID - # [8] : second interval start - # [9] : second interval end - # [10] : second interval variant ID - self_inter = multiallelic_bed.intersect(multiallelic_bed, wo=True)\ - .filter(lambda feature: feature[3] != feature[10]) \ - .filter(lambda feature: (int(feature[2]) - int(feature[1])) >= (int(feature[9]) - int(feature[8]))) \ - .sort(sizeD=True) - current_size1 = -1 - features_for_size1 = [] - for feature in self_inter: - size1 = int(feature[2]) - int(feature[1]) - if size1 != current_size1: - 
process_features_for_size1(features_for_size1, redundant_multiallelics)
-            features_for_size1 = []
-
-        current_size1 = size1
-        features_for_size1.append(feature)
-
-    process_features_for_size1(features_for_size1, redundant_multiallelics)
-    print("identified {} redundant multiallelic sites".format(len(redundant_multiallelics)), file=sys.stderr)
-    with open(args.fout, "w") as list_file:
-        for vid in redundant_multiallelics:
-            print(vid, file=list_file)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py
deleted file mode 100755
index 51675b5ab..000000000
--- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python
-
-import argparse
-from collections import Counter
-import gzip
-import pysam
-import sys
-import svtk.utils as svu
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('revise_vcf_lines', type=argparse.FileType('r'))
-    parser.add_argument('normal_revise_vcf')
-    parser.add_argument('famfile', type=argparse.FileType('r'))
-    parser.add_argument('sexchr_revise')
-    parser.add_argument('multi_geno_ids_txt')
-    parser.add_argument('outlier_samples_list', type=argparse.FileType('r'))
-    parser.add_argument('out_prefix')
-    parser.add_argument('--threads_per_file', required=False, default=2, type=int)
-    args = parser.parse_args()
-
-    # load the revised lines and index by ID
-    with pysam.VariantFile(args.revise_vcf_lines, threads=args.threads_per_file) as revise_vcf:
-        header2 = revise_vcf.header
-        revised_lines_by_id = {record.id: record for record in revise_vcf}
-    print("loaded {} revised lines".format(len(revised_lines_by_id)), file=sys.stderr)
-
-    outlier_samples = set([line.rstrip() for line in args.outlier_samples_list if not line.isspace()])
-    print("loaded {} outlier samples".format(len(outlier_samples)), file=sys.stderr)
-
-    male_samples = set()
-    for line in args.famfile:
-        if line.isspace():
-            continue
-        fields = line.rstrip().split("\t")
-        if fields[4] == '1':
-            male_samples.add(fields[1])
-    print("identified {} male samples".format(len(male_samples)), file=sys.stderr)
-
-    if args.sexchr_revise.endswith(".gz"):
-        sexchr_revise = {line.rstrip() for line in gzip.open(args.sexchr_revise, 'rt')}
-    else:
-        sexchr_revise = {line.rstrip() for line in open(args.sexchr_revise, 'rt')}
-    print("{} sites to revise on sex chromosomes".format(len(sexchr_revise)), file=sys.stderr)
-
-    if args.multi_geno_ids_txt.endswith(".gz"):
-        multi_geno_ids = {line.rstrip() for line in gzip.open(args.multi_geno_ids_txt, 'rt')}
-    else:
-        multi_geno_ids = {line.rstrip() for line in open(args.multi_geno_ids_txt, 'rt')}
-    print("{} multiallelic sites".format(len(multi_geno_ids)), file=sys.stderr)
-
-    NEW_HEADER_LINES = ['##ALT=<ID=CNV,Description="Copy Number Polymorphism">',
-                        '##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Predicted copy state">',
-                        '##FORMAT=<ID=CNQ,Number=1,Type=Integer,Description="Read-depth genotype quality">',
-                        '##INFO=<ID=PESR_GT_OVERDISPERSION,Number=0,Type=Flag,Description="High PESR dispersion count">',
-                        '##FILTER=<ID=MULTIALLELIC,Description="Multiallelic site">']
-
-    with pysam.VariantFile(args.normal_revise_vcf) as normal_vcf:
-
-        # # Add metadata lines for annotations
-        header1 = normal_vcf.header
-
-        for f in NEW_HEADER_LINES:
-            header1.add_line(f)
-            header2.add_line(f)
-
-        non_outlier_samples = {s for s in header1.samples if s not in outlier_samples}
-        vf_1 = max(len(non_outlier_samples) * 0.01, 2)
-
-        biallelic_gts = {(1, 1), (0, 0), (0, 1), (None, None)}
-
-        print("reformatting records", file=sys.stderr)
-        cleangq_filename = args.out_prefix + 
".cleanGQ.vcf.gz" - multiallelic_filename = args.out_prefix + ".multiallelic.vcf.gz" - no_variant_samples_list_file = args.out_prefix + ".no_called_samples.list" - - with pysam.VariantFile(cleangq_filename, 'w', header=normal_vcf.header, threads=args.threads_per_file) as cleanqg_out, \ - pysam.VariantFile(multiallelic_filename, 'w', header=normal_vcf.header) as multiallelic_out, \ - open(no_variant_samples_list_file, 'w') as no_variant_samples_out: - for idx, record in enumerate(normal_vcf): - multi_del = False - multi_dup = False - gt4_copystate = False - gt5kb_dup = False - gt5kb_del = False - if (idx - 1) % 1000 == 0: - print("processed {} records".format(idx), file=sys.stderr) - if record.id in revised_lines_by_id: - record = revised_lines_by_id[record.id] - if record.info.get('SVTYPE', None) == 'DEL': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: - multi_del = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_del = True - if abs(record.stop - record.pos) >= 5000: - if not multi_del: - gt5kb_del = True - - if record.info.get('SVTYPE', None) == 'DUP': - if abs(record.stop - record.pos) >= 1000: - sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: - multi_dup = True - if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: - gt4_copystate = True - if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and - (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and gt4_copystate) > vf_1: - multi_dup = True - gts = [record.samples[s]['GT'] for s in non_outlier_samples] - if any(gt not in biallelic_gts for gt in gts): - gt5kb_dup = True - if abs(record.stop - record.pos) >= 5000: - if not multi_dup: - gt5kb_dup = True - - if gt5kb_del: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] >= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 1): - sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN 0 DEL - - if gt5kb_dup: - for sample_obj in record.samples.itervalues(): - # Leave no-calls - if sample_obj['GT'] == (None, None): - continue - if not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] <= 2): - sample_obj['GT'] = (0, 0) - elif not sample_obj['GQ'] is None and \ - (sample_obj['RD_CN'] is not None and sample_obj['RD_CN'] == 3): - sample_obj['GT'] = (0, 1) - elif not sample_obj['GQ'] is None: - sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP - - if record.id in multi_geno_ids: - record.info['PESR_GT_OVERDISPERSION'] = True - - if multi_del or multi_dup: - record.filter.add('MULTIALLELIC') - for j, sample in enumerate(record.samples): - record.samples[sample]['GT'] = None - record.samples[sample]['GQ'] = None - record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] - record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] - - if len(record.filter) > 1 and 'PASS' in record.filter: - del record.filter['PASS'] - - if 'MULTIALLELIC' in 
record.filter and ('<DUP>' in record.alts or '<DEL>' in record.alts):
-                    record.alts = ('<CNV>',)
-                    record.info['SVTYPE'] = 'CNV'
-
-                if record.id in sexchr_revise:
-                    for sample in record.samples:
-                        if sample in male_samples:
-                            cn = record.samples[sample]['RD_CN']
-                            if cn is not None and int(cn) > 0:
-                                cn = int(cn)
-                                record.samples[sample]['RD_CN'] = cn - 1
-                                if 'CN' in record.samples[sample]:
-                                    record.samples[sample]['CN'] = cn - 1  # the old script didn't do this but I think it should
-
-                cleanqg_out.write(record)
-
-                if 'MULTIALLELIC' in record.filter:
-                    multiallelic_out.write(record)
-
-                if len(svu.get_called_samples(record)) == 0:
-                    print(record.id, file=no_variant_samples_out)
-
-    print("done", file=sys.stderr)
-
-
-if __name__ == '__main__':
-    main()
diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl
index cbc124e2a..064c3b28a 100644
--- a/wdl/CalcAF.wdl
+++ b/wdl/CalcAF.wdl
@@ -1,7 +1,6 @@
 version 1.0
 
 import "Structs.wdl"
-import "CleanVcf5.wdl" as cleanvcf5
 import "TasksMakeCohortVcf.wdl" as tmc
 
 workflow CalcAF {
diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl
deleted file mode 100644
index 085aaa5e5..000000000
--- a/wdl/CleanVcf5.wdl
+++ /dev/null
@@ -1,265 +0,0 @@
-version 1.0
-
-import "Structs.wdl"
-import "TasksMakeCohortVcf.wdl" as tasks
-
-workflow CleanVcf5 {
-  input {
-    File normal_revise_vcf
-    File revise_vcf_lines
-    File ped_file
-    File sex_chr_revise
-    File multi_ids
-    File? outlier_samples_list
-
-    String prefix
-    String contig
-    Int records_per_shard
-
-    File? make_clean_gq_script
-    File? find_redundant_sites_script
-
-    String sv_base_mini_docker
-    String sv_pipeline_docker
-
-    Int? threads_per_task
-    RuntimeAttr? runtime_attr_override_scatter
-    RuntimeAttr? runtime_attr_override_make_cleangq
-    RuntimeAttr? runtime_attr_override_find_redundant_multiallelics
-    RuntimeAttr? runtime_attr_override_polish
-  }
-
-  call tasks.ScatterVcf {
-    input:
-      vcf=normal_revise_vcf,
-      records_per_shard = records_per_shard,
-      prefix = "~{prefix}.scatter_vcf",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_scatter
-  }
-
-  scatter ( i in range(length(ScatterVcf.shards)) ) {
-    call MakeCleanGQ {
-      input:
-        revise_vcf_lines=revise_vcf_lines,
-        normal_revise_vcf=ScatterVcf.shards[i],
-        ped_file=ped_file,
-        sex_chr_revise=sex_chr_revise,
-        multi_ids=multi_ids,
-        outlier_samples_list=outlier_samples_list,
-        make_clean_gq_script=make_clean_gq_script,
-        prefix="~{prefix}.make_clean_gq.shard_~{i}",
-        sv_pipeline_docker=sv_pipeline_docker,
-        runtime_attr_override=runtime_attr_override_make_cleangq
-    }
-  }
-
-  call FindRedundantMultiallelics {
-    input:
-      multiallelic_vcfs=MakeCleanGQ.multiallelic_vcf,
-      find_redundant_sites_script=find_redundant_sites_script,
-      prefix="~{prefix}.find_redundant_multiallelics",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_find_redundant_multiallelics
-  }
-
-  call Polish {
-    input:
-      clean_gq_vcfs=MakeCleanGQ.clean_gq_vcf,
-      no_sample_lists=MakeCleanGQ.no_sample_list,
-      redundant_multiallelics_list=FindRedundantMultiallelics.redundant_multiallelics_list,
-      prefix="~{prefix}.polish",
-      sv_pipeline_docker=sv_pipeline_docker,
-      runtime_attr_override=runtime_attr_override_polish
-  }
-
-  output {
-    File polished=Polish.polished
-  }
-}
-
-task MakeCleanGQ {
-  input {
-    File revise_vcf_lines
-    File normal_revise_vcf
-    File ped_file
-    File sex_chr_revise
-    File multi_ids
-    File? outlier_samples_list
-    File? make_clean_gq_script
-    String prefix
-    Int? threads = 2
-    String sv_pipeline_docker
-    RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size( - select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), - "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - - # put the revise lines into a normal VCF format - bcftools view -h ~{normal_revise_vcf} > header.txt - cat header.txt <(zcat ~{revise_vcf_lines} | grep . | tr " " "\t") | bgzip -c > revise.vcf.lines.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py" make_clean_gq_script} \ - --threads_per_file ~{threads} \ - revise.vcf.lines.vcf.gz \ - ~{normal_revise_vcf} \ - ~{ped_file} \ - ~{sex_chr_revise} \ - ~{multi_ids} \ - outliers.txt \ - ~{prefix} - - bcftools view -G -O z ~{prefix}.multiallelic.vcf.gz > ~{prefix}.multiallelic.sites.vcf.gz - tabix ~{prefix}.cleanGQ.vcf.gz - >>> - - output { - File clean_gq_vcf=prefix + ".cleanGQ.vcf.gz" - File clean_gq_vcf_idx=prefix + ".cleanGQ.vcf.gz.tbi" - File multiallelic_vcf=prefix + ".multiallelic.sites.vcf.gz" - File no_sample_list = prefix + ".no_called_samples.list" - } -} - -task FindRedundantMultiallelics { - input { - Array[File] multiallelic_vcfs - File? find_redundant_sites_script - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(multiallelic_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - VCFS="~{write_lines(multiallelic_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - bcftools concat --no-version --output-type z --file-list vcfs_sorted.list --output multiallelic.vcf.gz - - python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py" find_redundant_sites_script} \ - multiallelic.vcf.gz \ - ~{prefix}.list - - >>> - - output { - File redundant_multiallelics_list="~{prefix}.list" - } -} - - -task Polish { - input { - Array[File] clean_gq_vcfs - Array[File] no_sample_lists - File redundant_multiallelics_list - String prefix - String sv_pipeline_docker - Int threads = 2 - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(clean_gq_vcfs, "GB") - Float base_disk_gb = 10.0 - - RuntimeAttr runtime_default = object { - mem_gb: 16, - disk_gb: ceil(base_disk_gb + input_size * 5.0), - cpu_cores: 4, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - VCFS="~{write_lines(clean_gq_vcfs)}" - cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list - cat ~{redundant_multiallelics_list} ~{sep=" " no_sample_lists} > ids_to_remove.list - bcftools concat --no-version --output-type u --file-list vcfs_sorted.list | \ - bcftools view --no-version \ - --exclude 'ID=@ids_to_remove.list' \ - --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} - - # do the last bit of header cleanup - bcftools view -h polished.need_reheader.vcf.gz > original_header.vcf - cat original_header.vcf | fgrep '##fileformat' > new_header.vcf - cat original_header.vcf \ - | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf - # Don't sort contigs lexicographically, which would result in incorrect chr1, chr10, chr11, ... ordering - cat original_header.vcf | fgrep '##contig' >> new_header.vcf - cat original_header.vcf | fgrep '#CHROM' >> new_header.vcf - bcftools reheader polished.need_reheader.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz - >>> - - output { - File polished="~{prefix}.vcf.gz" - } -} From db6b9b8c4146ec0bf850455ffaa261f0d8abb637 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 11:45:23 -0400 Subject: [PATCH 05/40] Minor changes to merge with latest changes --- wdl/CleanVcfChromosome.wdl | 78 +++++++++++++------------------------- 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 0edce82fb..8f34d3680 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -17,8 +17,6 @@ workflow CleanVcfChromosome { File bothsides_pass_list Int min_records_per_shard_step1 Int samples_per_step2_shard - Int clean_vcf5_records_per_shard - Int? clean_vcf5_threads_per_task File? outlier_samples_list Int? max_samples_per_shard_step3 @@ -49,29 +47,7 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_override_rescue_me_dels - # overrides for local tasks - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? 
runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish - RuntimeAttr? runtime_override_stitch_fragmented_cnvs - RuntimeAttr? runtime_override_final_cleanup - RuntimeAttr? runtime_override_rescue_me_dels - RuntimeAttr? runtime_attr_add_high_fp_rate_filters - - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b + RuntimeAttr? runtime_attr_add_high_fp_rate_filters RuntimeAttr? runtime_override_preconcat_step1 RuntimeAttr? runtime_override_hail_merge_step1 @@ -285,18 +261,18 @@ workflow CleanVcfChromosome { } call RescueMobileElementDeletions { - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels - } + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels + } - call AddHighFDRFilters { - input: - vcf=RescueMobileElementDeletions.out, + call AddHighFDRFilters { + input: + vcf=RescueMobileElementDeletions.out, prefix="~{prefix}.high_fdr_filtered", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_add_high_fp_rate_filters @@ -305,22 +281,22 @@ workflow CleanVcfChromosome { call FinalCleanup { input: vcf=AddHighFDRFilters.out, - contig=contig, - prefix="~{prefix}.final_cleanup", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_final_cleanup - } + contig=contig, + prefix="~{prefix}.final_cleanup", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_final_cleanup + } - call fvcf.FormatVcf as FormatVcfToOutput { - input: - vcf=FinalCleanup.final_cleaned_shard, - ploidy_table=ploidy_table, - args="--scale-down-gq", - output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format - } + call fvcf.FormatVcf as FormatVcfToOutput { + input: + vcf=FinalCleanup.final_cleaned_shard, + ploidy_table=ploidy_table, + args="--scale-down-gq", + output_prefix="~{prefix}.final_format", + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } output { File out = FormatVcfToOutput.out From e70ec78a3a972493c2a956ff5c1a7ac355bf3e54 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 12:30:48 -0400 Subject: [PATCH 06/40] Modified java_mem_gb to use select_first --- wdl/CleanVcfChromosome.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8f34d3680..57acd9c64 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -339,7 +339,7 @@ task CleanVcf1a { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = 
ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" String output_samples_list = "~{prefix}.includelist.txt" @@ -396,7 +396,7 @@ task CleanVcf1b { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -448,7 +448,7 @@ task CleanVcf2 { } String output_revised_list = "~{prefix}.txt" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -542,7 +542,7 @@ task CleanVcf4 { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail @@ -594,7 +594,7 @@ task CleanVcf5 { } String output_vcf = "~{prefix}.vcf.gz" - Int java_mem_mb = ceil(runtime_override.mem_gb * 1000 * 0.7) + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) command <<< set -euo pipefail From 2cb19cafd5282aad79142c49f2b32f73a95dc4b6 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 12:47:08 -0400 Subject: [PATCH 07/40] Clean up CleanVcf.wdl inputs --- wdl/CleanVcf.wdl | 29 +++++------------------------ 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index f58c7f4f1..07fb28b53 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -60,28 +60,16 @@ workflow CleanVcf { # overrides for CleanVcfContig RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5_scatter - RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq - RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics - RuntimeAttr? runtime_override_clean_vcf_5_polish + RuntimeAttr? runtime_override_clean_vcf_5 RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format RuntimeAttr? runtime_override_rescue_me_dels - # Clean vcf 1b - RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b - RuntimeAttr? runtime_attr_override_sort_bed_1b - RuntimeAttr? runtime_attr_override_intersect_bed_1b - RuntimeAttr? runtime_attr_override_build_dict_1b - RuntimeAttr? runtime_attr_override_scatter_1b - RuntimeAttr? runtime_attr_override_filter_vcf_1b - RuntimeAttr? runtime_override_concat_vcfs_1b - RuntimeAttr? runtime_override_cat_multi_cnvs_1b - RuntimeAttr? runtime_override_preconcat_step1 RuntimeAttr? runtime_override_hail_merge_step1 RuntimeAttr? runtime_override_fix_header_step1 @@ -91,11 +79,8 @@ workflow CleanVcf { RuntimeAttr? runtime_override_fix_header_drc RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 - RuntimeAttr? runtime_override_combine_revised_4 - RuntimeAttr? runtime_override_combine_multi_ids_4 RuntimeAttr? runtime_override_drop_redundant_cnvs RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? 
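On PATCH 06 above: runtime_override.mem_gb is an optional field, so the old `ceil(runtime_override.mem_gb * 1000 * 0.7)` presumably failed whenever a caller supplied an override struct that left mem_gb unset; wrapping it in select_first() restores the task default. The 0.7 factor leaves roughly 30% of container memory for non-heap JVM and OS overhead. A sketch of the guarded expression (parameter names are illustrative):

    import math

    def java_mem_mb(mem_gb_override=None, mem_gb_default=3.75, heap_fraction=0.7):
        # select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7
        mem_gb = mem_gb_default if mem_gb_override is None else mem_gb_override
        return math.ceil(mem_gb * 1000 * heap_fraction)

    assert java_mem_mb() == 2625        # 3.75 GB container -> 2625 MB heap
    assert java_mem_mb(16) == 11200
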
runtime_override_sort_drop_redundant_cnvs @@ -149,18 +134,12 @@ workflow CleanVcf { runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, - runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, + runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, - runtime_override_combine_revised_4=runtime_override_combine_revised_4, - runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, runtime_override_preconcat_step1=runtime_override_preconcat_step1, runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, runtime_override_fix_header_step1=runtime_override_fix_header_step1, @@ -168,6 +147,8 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } From 0956b922b3e779ce8d9a9912180e22533bf78f8b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 13:42:36 -0400 Subject: [PATCH 08/40] Forgot comma --- wdl/CleanVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 07fb28b53..032884704 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -147,7 +147,7 @@ workflow CleanVcf { runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels From 48d81a8a68a40962046c8dc00d24251430ad1652 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 13:42:54 -0400 Subject: [PATCH 09/40] Forgot comma --- wdl/CleanVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 032884704..1f39d6ee0 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -148,7 +148,7 @@ workflow CleanVcf { runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, 
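PATCH 08 and PATCH 09 both repair the same slip: WDL call inputs are comma-separated, and a missing trailing comma between two `name=value` lines only surfaces when the workflow is parsed. A rough, hypothetical lint for call blocks (heuristic only — a real WDL parser such as miniwdl is the proper check):

    import re

    def missing_commas(call_block_lines):
        assignment = re.compile(r"^\s*\w+\s*=\s*\S+[^,{(]$")
        flagged = []
        for lineno, line in enumerate(call_block_lines[:-1], start=1):
            nxt = call_block_lines[lineno].strip()
            if assignment.match(line.rstrip()) and nxt and nxt != "}":
                flagged.append(lineno)
        return flagged

    lines = [
        "    runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs",
        "    runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs",
        "}",
    ]
    print(missing_commas(lines))  # -> [1]
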
runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, - runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels } From f4112a226601c732f7714f5af786b22919e6a044 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 16:05:59 -0400 Subject: [PATCH 10/40] Removed unnecessary params --- wdl/CleanVcf.wdl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 1f39d6ee0..30ebd9c42 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -21,8 +21,6 @@ workflow CleanVcf { Int min_records_per_shard_step1 Int samples_per_step2_shard Int? max_samples_per_shard_step3 - Int clean_vcf1b_records_per_shard - Int clean_vcf5_records_per_shard File HERVK_reference File LINE1_reference @@ -119,8 +117,6 @@ workflow CleanVcf { outlier_samples_list=outlier_samples_list, use_hail=use_hail, gcs_project=gcs_project, - clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, - clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, ploidy_table=CreatePloidyTableFromPed.out, HERVK_reference=HERVK_reference, LINE1_reference=LINE1_reference, From ad93a923bca41b39c8c60ebb793a2dbcfeeed355 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 1 Nov 2024 16:06:51 -0400 Subject: [PATCH 11/40] Added runtime_override_clean_vcf_5 --- wdl/CleanVcf.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 30ebd9c42..148f94717 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -127,6 +127,7 @@ workflow CleanVcf { sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, + runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, From 559a2ce213219dd60e0c61ed7661efeba7ace9e5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 4 Nov 2024 10:04:50 -0500 Subject: [PATCH 12/40] Minor changes --- wdl/CleanVcf.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 148f94717..42974547b 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -45,6 +45,7 @@ workflow CleanVcf { File? resolve_complex_merged_vcf File? 
genotype_complex_merged_vcf + String gatk_docker String linux_docker String sv_base_mini_docker String sv_pipeline_docker @@ -122,7 +123,7 @@ workflow CleanVcf { LINE1_reference=LINE1_reference, chr_x=chr_x, chr_y=chr_y, - gatk_docker="docker.io/broadinstitute/gatk:3eb5c3d38d6c8c65e71f29abe9346c98bfbb1cbe", + gatk_docker=gatk_docker, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, From 43db008b66f566f31b23db87ed6797c2f4fd5636 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Sun, 24 Nov 2024 15:12:10 -0500 Subject: [PATCH 13/40] WIP --- .../04_variant_resolution/scripts/clean_vcf_postprocess.py | 6 ++++++ .../04_variant_resolution/scripts/clean_vcf_preprocess.py | 6 ++++++ 2 files changed, 12 insertions(+) create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py new file mode 100644 index 000000000..8e28c90dc --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -0,0 +1,6 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py new file mode 100644 index 000000000..8e28c90dc --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -0,0 +1,6 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + From e6e519d4b8f7415ca6f1fc9ff471745a30bcd0d0 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 09:05:38 -0500 Subject: [PATCH 14/40] Initial preprocess script --- .../scripts/clean_vcf_preprocess.py | 120 +++++++++++++++++- 1 file changed, 118 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index 8e28c90dc..6ac2ad81c 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -1,6 +1,122 @@ #!/bin/python import argparse -from collections import defaultdict -from os import mkdir, path +import pysam +# Constants +EV = 'EV' +VAR_GQ = 'VAR_GQ' +MULTIALLELIC = 'MULTIALLELIC' +UNRESOLVED = 'UNRESOLVED' +HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' +BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' +REVISED_EVENT = 'REVISED_EVENT' + +# List of possible EV values +EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] + +def read_last_column(file_path): + result_set = set() + with open(file_path, 'r') as f: + for line in f: + if line.strip(): + columns = line.strip().split() + result_set.add(columns[-1]) + return result_set + +def add_header_lines(header): + header.add_line('##FILTER=') + header.add_line('##INFO=') + header.add_line('##INFO=') + header.add_line('##INFO=') + +def process_record(record, fail_set, pass_set): + record = process_EV(record) + record = process_VarGQ(record) + record = process_Multiallelic(record) + record = process_Unresolved(record) + record = process_NoisyEvents(record, fail_set) + record = process_BothsidesSupportEvents(record, pass_set) + return record + +def process_EV(record): 
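Continuing below, process_EV rewrites the legacy integer EV code into its string form. The mapping, isolated as a runnable snippet — the indices come straight from EV_VALUES above; the sample values are made up:

    EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF']

    def decode_ev(ev):
        try:
            idx = int(ev)
        except (TypeError, ValueError):
            return ev                      # already a string such as 'SR,PE'
        return EV_VALUES[idx] if 0 <= idx < len(EV_VALUES) else ev

    assert decode_ev('3') == 'RD'
    assert decode_ev('SR') == 'SR'
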
+ for sample in record.samples: + genotype = record.samples[sample] + if EV in genotype and genotype[EV] is not None: + ev_attribute = genotype[EV] + try: + ev_index = int(ev_attribute) + if 0 <= ev_index < len(EV_VALUES): + genotype[EV] = EV_VALUES[ev_index] + except ValueError: + pass # If it's not an integer, do nothing + return record + +def process_VarGQ(record): + if VAR_GQ in record.info: + var_gq = record.info[VAR_GQ] + if isinstance(var_gq, list): + var_gq = var_gq[0] + del record.info[VAR_GQ] + record.qual = var_gq + return record + +def process_Multiallelic(record): + if MULTIALLELIC in record.info: + del record.info[MULTIALLELIC] + return record + +def process_Unresolved(record): + if UNRESOLVED in record.info: + del record.info[UNRESOLVED] + record.filter.add(UNRESOLVED) + return record + +def process_NoisyEvents(record, fail_set): + if record.id in fail_set: + record.info[HIGH_SR_BACKGROUND] = True + return record + +def process_BothsidesSupportEvents(record, pass_set): + if record.id in pass_set: + record.info[BOTHSIDES_SUPPORT] = True + return record + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Process VCF variants.') + parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') + parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') + parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') + parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') + parser.add_argument('--output-samples-list', required=True, help='Output file with samples') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') + parser.add_argument('input_vcf', help='Input VCF file') + args = parser.parse_args() + + # Read failList and passList into sets + fail_set = read_last_column(args.fail_list) + pass_set = read_last_column(args.pass_list) + + # Open input VCF + vcf_in = pysam.VariantFile(args.input_vcf) + + # Modify header + header = vcf_in.header.copy() + add_header_lines(header) + + # Open output VCF + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) + + # Write samples list + with open(args.output_samples_list, 'w') as samples_writer: + for sample in header.samples: + samples_writer.write(sample + '\n') + + # Process variants + for record in vcf_in: + record = process_record(record, fail_set, pass_set) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() From ccd5dde6f74abc1c9019b20d0a427ab41ee60137 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:27:48 -0500 Subject: [PATCH 15/40] Created postprocessing script --- .../scripts/clean_vcf_postprocess.py | 109 +++++++++++++++++- .../scripts/clean_vcf_preprocess.py | 34 +++--- wdl/ResolveComplexVariants.wdl | 5 - wdl/TasksMakeCohortVcf.wdl | 5 - 4 files changed, 120 insertions(+), 33 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py index 8e28c90dc..5a8af3763 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -1,6 +1,111 @@ #!/bin/python import argparse -from collections import defaultdict -from os import mkdir, path +import pysam +# Constants +EV = 'EV' +SVTYPE = 'SVTYPE' +ME = 'ME' +UNR = 'UNR' +FILTER_VCF_INFO_LINES = {'BND_DEPTH', 
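Backing up briefly to the preprocess entry point above: read_last_column() keys both lists off the last whitespace-delimited column, so plain one-column ID lists and multi-column pass/fail files both work. A toy demonstration (the file contents here are invented):

    lines = [
        "chr1  10500  10800  cohort_DEL_chr1_1\n",
        "cohort_BND_chr1_7\n",
        "\n",                              # blank lines are skipped
    ]
    ids = {l.strip().split()[-1] for l in lines if l.strip()}
    assert ids == {"cohort_DEL_chr1_1", "cohort_BND_chr1_7"}
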
'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} +FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} + +def modify_header(header): + new_header = pysam.VariantHeader() + + # Copy over header lines, excluding some + for line in header.records: + include_line = True + if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: + include_line = False + elif line.type == 'FORMAT' and line.get('ID') == EV: + include_line = False + elif line.type == 'ALT' and line.get('ID') == UNR: + include_line = False + elif any(fv_line in str(line) for fv_line in FILTER_VCF_LINES): + include_line = False + if include_line: + new_header.add_line(str(line)) + + # Add new header line for EV + new_header.add_line('##FORMAT=') + + # Add samples to header + for sample in header.samples: + new_header.add_sample(sample) + + return new_header + +def process_record(record): + record = cleanse_info_fields(record) + record = process_svtype(record) + return record + +def cleanse_info_fields(record): + for field in FILTER_VCF_INFO_LINES: + if field in record.info: + del record.info[field] + return record + +def process_svtype(record): + svtype = record.info.get(SVTYPE, None) + + # Check for mobile element in alleles + has_mobile_element = False + if record.alts: + for allele in record.alts: + if allele.startswith('<') and allele.endswith('>'): + symbol = allele[1:-1] + if symbol == ME: + has_mobile_element = True + break + + # If SVTYPE is missing or variant has mobile element, skip processing + if svtype is None or has_mobile_element: + return record + + # Update alleles + ref_allele = record.ref + alt_allele = f'<{svtype}>' + record.alleles = (ref_allele, alt_allele) + + # Update genotypes + for sample in record.samples: + genotype = record.samples[sample] + gt = genotype.get('GT', (None, None)) + + # Count number of alt alleles + alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) + + # Update GT accordingly + if alt_count == 1: + genotype['GT'] = (0, 1) + elif alt_count == 2: + genotype['GT'] = (1, 1) + + return record + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Clean VCF post-processing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') + parser.add_argument('input_vcf', help='Input VCF file') + args = parser.parse_args() + + # Open input VCF + vcf_in = pysam.VariantFile(args.input_vcf) + + # Modify header + new_header = modify_header(vcf_in.header) + + # Open output VCF + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) + + # Process and write variants + for record in vcf_in: + record = process_record(record) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index 6ac2ad81c..a91996914 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -11,8 +11,6 @@ HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' - -# List of possible EV values EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] def read_last_column(file_path): @@ -32,11 +30,11 @@ def add_header_lines(header): def process_record(record, fail_set, pass_set): 
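One step back to process_svtype in the postprocess script above: after the alleles are rewritten to a single symbolic ALT, genotypes are collapsed to allele counts — one alt allele becomes 0/1, two become 1/1, anything else passes through. The same logic on bare tuples:

    def normalize_gt(gt):
        alt_count = sum(1 for allele in gt if allele is not None and allele > 0)
        if alt_count == 1:
            return (0, 1)
        if alt_count == 2:
            return (1, 1)
        return gt                          # hom-ref and no-calls unchanged

    assert normalize_gt((0, 2)) == (0, 1)  # multiallelic index collapses to 0/1
    assert normalize_gt((1, 1)) == (1, 1)
    assert normalize_gt((None, None)) == (None, None)
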
record = process_EV(record) - record = process_VarGQ(record) - record = process_Multiallelic(record) - record = process_Unresolved(record) - record = process_NoisyEvents(record, fail_set) - record = process_BothsidesSupportEvents(record, pass_set) + record = process_varGQ(record) + record = process_multiallelic(record) + record = process_unresolved(record) + record = process_noisy(record, fail_set) + record = process_bothsides_support(record, pass_set) return record def process_EV(record): @@ -49,10 +47,10 @@ def process_EV(record): if 0 <= ev_index < len(EV_VALUES): genotype[EV] = EV_VALUES[ev_index] except ValueError: - pass # If it's not an integer, do nothing + pass return record -def process_VarGQ(record): +def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] if isinstance(var_gq, list): @@ -61,23 +59,23 @@ def process_VarGQ(record): record.qual = var_gq return record -def process_Multiallelic(record): +def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record -def process_Unresolved(record): +def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record -def process_NoisyEvents(record, fail_set): +def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record -def process_BothsidesSupportEvents(record, pass_set): +def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record @@ -88,12 +86,11 @@ def process_BothsidesSupportEvents(record, pass_set): parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') - parser.add_argument('--output-samples-list', required=True, help='Output file with samples') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') parser.add_argument('input_vcf', help='Input VCF file') args = parser.parse_args() - # Read failList and passList into sets + # Read noisy and bothsides support events into sets fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) @@ -107,12 +104,7 @@ def process_BothsidesSupportEvents(record, pass_set): # Open output VCF vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) - # Write samples list - with open(args.output_samples_list, 'w') as samples_writer: - for sample in header.samples: - samples_writer.write(sample + '\n') - - # Process variants + # Process and write variants for record in vcf_in: record = process_record(record, fail_set, pass_set) vcf_out.write(record) diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index e2d32128b..f712537b6 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -17,9 +17,6 @@ workflow ResolveComplexVariants { Array[File] disc_files Array[File] rf_cutoff_files - Array[String]? background_fail_columns - Array[String]? 
bothsides_pass_columns - File contig_list Int max_shard_size File cytobands @@ -197,7 +194,6 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", - header_columns=select_first([bothsides_pass_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -208,7 +204,6 @@ workflow ResolveComplexVariants { vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", - header_columns=select_first([background_fail_columns, ["1", "2", "3", "4"]]), sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index fef15e068..0b81a83fe 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -684,7 +684,6 @@ task UpdateSrList { File vcf File original_list String outfile - Array[String]? header_columns String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -712,10 +711,6 @@ task UpdateSrList { command <<< set -euxo pipefail - if [[ ! -z "~{sep=' ' header_columns}" ]]; then - echo -e "~{sep='\t' header_columns}" > ~{outfile} - fi - # append new ids to original list svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header From d60b01bc57605edd60b136bf4a5202ed31db4a07 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:28:45 -0500 Subject: [PATCH 16/40] Minor removal of > character --- wdl/TasksMakeCohortVcf.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 0b81a83fe..d489831e8 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -724,7 +724,7 @@ task UpdateSrList { else print $0,$NF; \ }' int.bed ~{original_list} \ | sort -k1,1n \ - >> ~{outfile} + > ~{outfile} >>> output { From 38eefb2bfeca898b091db92948364040937d1acd Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 25 Nov 2024 11:57:57 -0500 Subject: [PATCH 17/40] Python linting fixes --- .../scripts/clean_vcf_postprocess.py | 5 +++++ .../scripts/clean_vcf_preprocess.py | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py index 5a8af3763..452345a50 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py @@ -11,6 +11,7 @@ FILTER_VCF_INFO_LINES = {'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} + def modify_header(header): new_header = pysam.VariantHeader() @@ -37,17 +38,20 @@ def modify_header(header): return new_header + def process_record(record): record = cleanse_info_fields(record) record = process_svtype(record) return record + def cleanse_info_fields(record): for field in FILTER_VCF_INFO_LINES: if field in record.info: del record.info[field] return record + def process_svtype(record): svtype = record.info.get(SVTYPE, None) @@ -86,6 +90,7 @@ def process_svtype(record): return record + if __name__ == '__main__': parser = 
argparse.ArgumentParser(description='Clean VCF post-processing.') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py index a91996914..7dbd685c2 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py @@ -11,7 +11,8 @@ HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' -EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] +EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] + def read_last_column(file_path): result_set = set() @@ -22,12 +23,14 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set + def add_header_lines(header): header.add_line('##FILTER=') header.add_line('##INFO=') header.add_line('##INFO=') header.add_line('##INFO=') + def process_record(record, fail_set, pass_set): record = process_EV(record) record = process_varGQ(record) @@ -37,6 +40,7 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record + def process_EV(record): for sample in record.samples: genotype = record.samples[sample] @@ -50,6 +54,7 @@ def process_EV(record): pass return record + def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -59,27 +64,32 @@ def process_varGQ(record): record.qual = var_gq return record + def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record + def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record + def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record + def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record + if __name__ == '__main__': parser = argparse.ArgumentParser(description='Process VCF variants.') parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') From be7f3bf6e6bd9a7d000294f68884fa0b131eebc9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 4 Dec 2024 19:42:12 -0500 Subject: [PATCH 18/40] WIP - refactored WDLs to use new set of tools --- ...postprocess.py => cleanvcf_postprocess.py} | 6 +- ...f_preprocess.py => cleanvcf_preprocess.py} | 8 +- wdl/CleanVcfChromosome.wdl | 325 ++++++------------ 3 files changed, 113 insertions(+), 226 deletions(-) rename src/sv-pipeline/04_variant_resolution/scripts/{clean_vcf_postprocess.py => cleanvcf_postprocess.py} (93%) rename src/sv-pipeline/04_variant_resolution/scripts/{clean_vcf_preprocess.py => cleanvcf_preprocess.py} (91%) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py similarity index 93% rename from src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py rename to src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index 452345a50..ca4fabc35 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -92,9 +92,9 @@ def process_svtype(record): if __name__ == '__main__': - parser = 
argparse.ArgumentParser(description='Clean VCF post-processing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') - parser.add_argument('input_vcf', help='Input VCF file') + parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') args = parser.parse_args() # Open input VCF diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py similarity index 91% rename from src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py rename to src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 7dbd685c2..787bcfc5b 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -91,13 +91,11 @@ def process_bothsides_support(record, pass_set): if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Process VCF variants.') - parser.add_argument('--chr-X', dest='chrX', default='chrX', help='chrX column name') - parser.add_argument('--chr-Y', dest='chrY', default='chrY', help='chrY column name') + parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF name') - parser.add_argument('input_vcf', help='Input VCF file') args = parser.parse_args() # Read noisy and bothsides support events into sets diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 57acd9c64..c89d574eb 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -10,16 +10,12 @@ workflow CleanVcfChromosome { File vcf String contig File background_list + File bothsides_pass_list + File? outlier_samples_list File ped_file File allosome_fai String prefix - Int max_shards_per_chrom_step1 - File bothsides_pass_list - Int min_records_per_shard_step1 - Int samples_per_step2_shard - File? outlier_samples_list - Int? max_samples_per_shard_step3 - + File HERVK_reference File LINE1_reference @@ -27,8 +23,6 @@ workflow CleanVcfChromosome { String chr_x String chr_y - File? svtk_to_gatk_script # For debugging - Boolean use_hail String? gcs_project @@ -38,6 +32,10 @@ workflow CleanVcfChromosome { String sv_pipeline_docker # overrides for local tasks + RuntimeAttr? runtime_attr_revise_overlapping_cnvs + RuntimeAttr? runtime_attr_revise_large_cnvs + RuntimeAttr? runtime_attr_revise_abnormal_allosomes + RuntimeAttr? runtime_attr_revise_multiallelics RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? 
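The heart of PATCH 18: the shard-scatter-merge machinery (SplitVcf, the CleanVcf1a scatter, HailMerge/ConcatVcfs) is replaced by one linear per-contig chain in which each task consumes its predecessor's output VCF. The shape of the dataflow, with stand-in functions (the step names mirror the new tasks; nothing else here is literal):

    def run_chain(vcf, steps):
        for step in steps:
            vcf = step(vcf)                # each task's output feeds the next
        return vcf

    steps = [lambda v, n=n: v + "." + n for n in (
        "preprocess", "revise_overlapping_cnvs", "revise_large_cnvs",
        "revise_abnormal_allosomes", "revise_multiallelic_cnvs", "postprocess")]
    print(run_chain("cohort.chr1", steps))
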
runtime_override_clean_vcf_2 @@ -76,152 +74,59 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_attr_format } - call MiniTasks.SplitVcf as SplitVcfToClean { + call CleanVcfPreprocess { input: vcf=FormatVcfToClean.out, - contig=contig, - prefix="~{prefix}.shard_", - n_shards=max_shards_per_chrom_step1, - min_vars_per_shard=min_records_per_shard_step1, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_split_vcf_to_clean - } - - scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { - call CleanVcf1a { - input: - vcf=SplitVcfToClean.vcf_shards[i], - prefix="~{prefix}.clean_vcf_1a.shard_~{i}", - background_fail_list=background_list, - bothsides_pass_list=bothsides_pass_list, - ped_file=ped_file, - allosome_fai=allosome_fai, - chr_x=chr_x, - chr_y=chr_y, - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_1a - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep1VcfsHail { - input: - vcfs=CleanVcf1a.intermediate_vcf, - prefix="~{prefix}.combine_step_1_vcfs", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix="~{prefix}.combine_step_1_vcfs", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } + background_list=background_list, + bothsides_pass_list=bothsides_pass_list, + prefix="~{prefix}.preprocess", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_preprocess } - call CleanVcf1b { + call CleanVcfReviseOverlappingCnvs { input: - vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_1b", + vcf=CleanVcfPreprocess.out, + prefix="~{prefix}.revise_overlapping_cnvs", gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_1b + runtime_attr_override=runtime_attr_revise_overlapping_cnvs } - call MiniTasks.SplitUncompressed as SplitIncludeList { + call CleanVcfReviseLargeCnvs { input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - vcf=CleanVcf1b.out, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } + vcf=CleanVcfReviseOverlappingCnvs.out, + prefix="~{prefix}.revise_large_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_large_cnvs } - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { + call CleanVcfReviseAbnormalAllosomes { input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 + vcf=CleanVcfReviseLargeCnvs.out, + prefix="~{prefix}.revise_abnormal_allosomes", + gatk_docker=gatk_docker, + 
runtime_attr_override=runtime_attr_revise_abnormal_allosomes } - call CleanVcf3 { + call CleanVcfReviseMultiallelicCnvs { input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - vcf=CleanVcf1b.out, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - outlier_samples_list=outlier_samples_list, - rd_cn_revise=CleanVcf3.shards[i], - gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - if (use_hail) { - call HailMerge.HailMerge as CombineStep4VcfsHail { - input: - vcfs=CleanVcf4.out, - prefix="~{prefix}.combine_revised_4", - gcs_project=gcs_project, - sv_base_mini_docker=sv_base_mini_docker, - sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_step1, - runtime_override_hail_merge=runtime_override_hail_merge_step1, - runtime_override_fix_header=runtime_override_fix_header_step1 - } - } - if (!use_hail) { - call MiniTasks.ConcatVcfs as CombineStep4Vcfs { - input: - vcfs=CleanVcf4.out, - vcfs_idx=CleanVcf4.out_idx, - naive=true, - generate_index=true, - outfile_prefix="~{prefix}.combine_revised_4", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs - } + vcf=CleanVcfReviseAbnormalAllosomes.out, + prefix="~{prefix}.revise_multiallelic_cnvs", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_multiallelics } - call CleanVcf5 { + call CleanVcfPostprocess { input: - vcf=select_first([CombineStep4Vcfs.concat_vcf, CombineStep4VcfsHail.merged_vcf]), - prefix="~{prefix}.clean_vcf_5", + vcf=CleanVcfReviseMultiallelicCnvs.out, + prefix="~{prefix}.postprocess", gatk_docker=gatk_docker, - runtime_attr_override=runtime_override_clean_vcf_5 + runtime_attr_override=runtime_attr_revise_multiallelics } call DropRedundantCnvs { input: - vcf=CleanVcf5.out, + vcf=CleanVcfPostprocess.out, prefix="~{prefix}.drop_redundant_cnvs", contig=contig, sv_pipeline_docker=sv_pipeline_docker, @@ -293,7 +198,6 @@ workflow CleanVcfChromosome { ploidy_table=ploidy_table, args="--scale-down-gq", output_prefix="~{prefix}.final_format", - script=svtk_to_gatk_script, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_format } @@ -304,25 +208,19 @@ workflow CleanVcfChromosome { } } - -task CleanVcf1a { +task CleanVcfPreprocess { input { File vcf - String prefix - File background_fail_list + File background_list File bothsides_pass_list - File ped_file - File allosome_fai - String chr_x - String chr_y + String prefix String gatk_docker RuntimeAttr? 
runtime_attr_override } - Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -341,7 +239,6 @@ task CleanVcf1a { Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" - String output_samples_list = "~{prefix}.includelist.txt" command <<< set -euo pipefail @@ -350,24 +247,22 @@ task CleanVcf1a { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1a \ + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ -V ~{vcf} \ -O ~{output_vcf} \ - --fail-list ~{background_fail_list} \ - --pass-list ~{bothsides_pass_list} \ - --chr-X ~{chr_x} \ - --chr-Y ~{chr_y} \ - --output-samples-list ~{output_samples_list} + --fail-list ~{background_list} \ + --pass-list ~{bothsides_pass_list} + + tabix -p vcf ~{output_vcf} >>> output { - File include_list="~{output_samples_list}" - File intermediate_vcf="~{output_vcf}" - File intermediate_vcf_idx="~{output_vcf}.tbi" + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } -task CleanVcf1b { +task CleanVcfReviseOverlappingCnvs { input { File vcf String prefix @@ -375,15 +270,14 @@ task CleanVcf1b { RuntimeAttr? runtime_attr_override } - Float input_size = size([vcf], "GB") RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2), + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -395,8 +289,8 @@ task CleanVcf1b { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -405,7 +299,7 @@ task CleanVcf1b { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt1b \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ -V ~{vcf} \ -O ~{output_vcf} >>> @@ -416,26 +310,22 @@ task CleanVcf1b { } } -task CleanVcf2 { +task CleanVcfReviseLargeCnvs { input { File vcf String prefix - File include_list String gatk_docker RuntimeAttr? 
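Each of the new tasks repeats the `if [ ! -f "~{vcf}.tbi" ]; then tabix -p vcf ...` guard. The same ensure-index idiom in Python, assuming pysam and a bgzip-compressed input (as in the WDL) — a helper sketch, not pipeline code:

    import os
    import pysam

    def ensure_vcf_index(vcf_path):
        # build the .tbi only when it is missing, like the shell guard above
        if not os.path.exists(vcf_path + ".tbi"):
            pysam.tabix_index(vcf_path, preset="vcf")
        return vcf_path + ".tbi"
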
runtime_attr_override } - Float input_size = size([vcf, include_list], "GB") - Float base_disk_gb = 10.0 - Float input_disk_scale = 3.0 RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -447,8 +337,8 @@ task CleanVcf2 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_revised_list = "~{prefix}.txt" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -457,36 +347,33 @@ task CleanVcf2 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt2 \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseLargeCnvs \ -V ~{vcf} \ - --sample-list ~{include_list} \ - --output-revised-list ~{output_revised_list} + -O ~{output_vcf} >>> output { - File out="~{output_revised_list}" + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } - -task CleanVcf3 { +task CleanVcfReviseAbnormalAllosomes { input { - File rd_cn_revise - Int? max_samples_shard - String sv_pipeline_docker + File vcf + String prefix + String gatk_docker RuntimeAttr? runtime_attr_override } - - Int max_samples_shard_ = select_first([max_samples_shard, 7000]) - Float input_size = size(rd_cn_revise, "GB") + RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -494,42 +381,47 @@ task CleanVcf3 { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker + docker: gatk_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + command <<< set -euo pipefail - python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} - # Ensure there is at least one shard - touch shards/out.0_0.txt + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseAbnormalAllosomes \ + -V ~{vcf} \ + -O ~{output_vcf} >>> output { - Array[File] shards = glob("shards/*") + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" } } - -task CleanVcf4 { +task CleanVcfReviseMultiallelicCnvs { input { File vcf String prefix - File rd_cn_revise - File? outlier_samples_list String gatk_docker RuntimeAttr? 
runtime_attr_override } - Float input_size = size([vcf, rd_cn_revise], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -541,8 +433,8 @@ task CleanVcf4 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -551,11 +443,9 @@ task CleanVcf4 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt4 \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseMultiallelicCnvs \ -V ~{vcf} \ - -O ~{output_vcf} \ - --revised-cn-list ~{rd_cn_revise} \ - ~{if defined(outlier_samples_list) then "--outliers-list ~{outlier_samples_list}" else "" } + -O ~{output_vcf} >>> output { @@ -564,8 +454,7 @@ task CleanVcf4 { } } - -task CleanVcf5 { +task CleanVcfPostprocess { input { File vcf String prefix @@ -573,15 +462,14 @@ task CleanVcf5 { RuntimeAttr? runtime_attr_override } - Float input_size = size([vcf], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: 50, + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 - } + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -593,8 +481,8 @@ task CleanVcf5 { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } - String output_vcf = "~{prefix}.vcf.gz" Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" command <<< set -euo pipefail @@ -603,9 +491,11 @@ task CleanVcf5 { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVCleanPt5 \ + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ -V ~{vcf} \ -O ~{output_vcf} + + tabix -p vcf ~{output_vcf} >>> output { @@ -614,7 +504,6 @@ task CleanVcf5 { } } - task RescueMobileElementDeletions { input { File vcf From fec8d596b144912c5a1c930609acf78c2b2e34cf Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 6 Dec 2024 09:33:47 -0500 Subject: [PATCH 19/40] WIP --- wdl/CleanVcfChromosome.wdl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index c89d574eb..fe2564be9 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -32,10 +32,13 @@ workflow CleanVcfChromosome { String sv_pipeline_docker # overrides for local tasks + RuntimeAttr? runtime_attr_preprocess RuntimeAttr? runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics + RuntimeAttr? runtime_attr_postprocess + RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? 
runtime_override_clean_vcf_2 @@ -103,6 +106,7 @@ workflow CleanVcfChromosome { call CleanVcfReviseAbnormalAllosomes { input: vcf=CleanVcfReviseLargeCnvs.out, + outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_abnormal_allosomes", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_abnormal_allosomes @@ -121,7 +125,7 @@ workflow CleanVcfChromosome { vcf=CleanVcfReviseMultiallelicCnvs.out, prefix="~{prefix}.postprocess", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_multiallelics + runtime_attr_override=runtime_attr_postprocess } call DropRedundantCnvs { @@ -313,6 +317,7 @@ task CleanVcfReviseOverlappingCnvs { task CleanVcfReviseLargeCnvs { input { File vcf + File? outlier_samples_list String prefix String gatk_docker RuntimeAttr? runtime_attr_override @@ -349,7 +354,8 @@ task CleanVcfReviseLargeCnvs { gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseLargeCnvs \ -V ~{vcf} \ - -O ~{output_vcf} + -O ~{output_vcf} \ + ~{if defined(outlier_samples_list) then "--outlier-samples ~{outlier_samples_list}" else "" } >>> output { From e27026bff22524302f76cd7c34370b7cfaf09a11 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 11 Dec 2024 11:48:43 -0500 Subject: [PATCH 20/40] Updated pre/postprocess to use pipeline docker --- wdl/CleanVcfChromosome.wdl | 85 +++++++++++++++++++++++++++++++------- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index fe2564be9..26da02ed5 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -33,7 +33,8 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess - RuntimeAttr? runtime_attr_revise_overlapping_cnvs + RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts + RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? 
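PATCH 19 threads the optional outlier_samples_list down to SVReviseLargeCnvs with the `~{if defined(...) then ... else ""}` idiom — the flag is appended only when the input is set. An equivalent argument-building sketch (the flag and tool names are taken from the diff itself):

    def gatk_args(vcf, out, outlier_samples_list=None):
        args = ["gatk", "SVReviseLargeCnvs", "-V", vcf, "-O", out]
        if outlier_samples_list is not None:
            args += ["--outlier-samples", outlier_samples_list]
        return args

    print(gatk_args("in.vcf.gz", "out.vcf.gz"))
    print(gatk_args("in.vcf.gz", "out.vcf.gz", "outliers.txt"))
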
runtime_attr_revise_multiallelics @@ -83,21 +84,30 @@ workflow CleanVcfChromosome { background_list=background_list, bothsides_pass_list=bothsides_pass_list, prefix="~{prefix}.preprocess", - gatk_docker=gatk_docker, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_preprocess } - call CleanVcfReviseOverlappingCnvs { + call CleanVcfReviseOverlappingCnvGts { input: vcf=CleanVcfPreprocess.out, - prefix="~{prefix}.revise_overlapping_cnvs", + prefix="~{prefix}.revise_overlapping_cnv_gts", + gatk_docker=gatk_docker, + runtime_attr_override=runtime_attr_revise_overlapping_cnv_gts + } + + call CleanVcfReviseOverlappingCnvCns { + input: + vcf=CleanVcfReviseOverlappingCnvGts.out, + prefix="~{prefix}.revise_overlapping_cnv_cns", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnvs + runtime_attr_override=runtime_attr_revise_overlapping_cnv_cns } call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvs.out, + vcf=CleanVcfReviseOverlappingCnvGts.out, + outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_large_cnvs @@ -106,7 +116,6 @@ workflow CleanVcfChromosome { call CleanVcfReviseAbnormalAllosomes { input: vcf=CleanVcfReviseLargeCnvs.out, - outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_abnormal_allosomes", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_revise_abnormal_allosomes @@ -124,7 +133,7 @@ workflow CleanVcfChromosome { input: vcf=CleanVcfReviseMultiallelicCnvs.out, prefix="~{prefix}.postprocess", - gatk_docker=gatk_docker, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_attr_postprocess } @@ -218,7 +227,7 @@ task CleanVcfPreprocess { File background_list File bothsides_pass_list String prefix - String gatk_docker + String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -237,7 +246,7 @@ task CleanVcfPreprocess { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker + docker: sv_pipeline_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } @@ -266,7 +275,55 @@ task CleanVcfPreprocess { } } -task CleanVcfReviseOverlappingCnvs { +task CleanVcfReviseOverlappingCnvGts { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! -f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvGts \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + +task CleanVcfReviseOverlappingCnvCns { input { File vcf String prefix @@ -303,7 +360,7 @@ task CleanVcfReviseOverlappingCnvs { tabix -p vcf ~{vcf} fi - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvCns \ -V ~{vcf} \ -O ~{output_vcf} >>> @@ -464,7 +521,7 @@ task CleanVcfPostprocess { input { File vcf String prefix - String gatk_docker + String sv_pipeline_docker RuntimeAttr? 
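The Java heap for each GATK call is sized as java_mem_mb = ceil(mem_gb * 1000 * 0.7), leaving roughly 30% of the container's memory for non-heap overhead (the JVM itself, htslib buffers). Checking the arithmetic for the 3.75 GB default used by these tasks:

import math

mem_gb = 3.75                                # runtime_default.mem_gb above
java_mem_mb = math.ceil(mem_gb * 1000 * 0.7)
print(java_mem_mb)                           # 2625, i.e. -Xmx2625m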
runtime_attr_override } @@ -483,7 +540,7 @@ task CleanVcfPostprocess { cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker + docker: sv_pipeline_docker bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } From a92904d1e3c90e36d183b810e97b3f7dd8bcbfc8 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 16:26:24 -0500 Subject: [PATCH 21/40] Finished testing cleanvcfpreprocess --- header.txt | 0 inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_preprocess.py | 63 +++++-------------- .../replace_ev_numeric_code_with_string.py | 5 +- wdl/CleanVcfChromosome.wdl | 21 ++++++- 5 files changed, 44 insertions(+), 51 deletions(-) create mode 100644 header.txt diff --git a/header.txt b/header.txt new file mode 100644 index 000000000..e69de29bb diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index b68084b91..7fea73d36 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2024-11-15-v1.0-488d7cb0", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2024-11-15-v1.0-488d7cb0" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 787bcfc5b..0326d4fc1 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -2,9 +2,8 @@ import argparse import pysam +import gzip -# Constants -EV = 'EV' VAR_GQ = 'VAR_GQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' @@ -23,16 +22,7 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set - -def add_header_lines(header): - header.add_line('##FILTER=') - 
header.add_line('##INFO=') - header.add_line('##INFO=') - header.add_line('##INFO=') - - def process_record(record, fail_set, pass_set): - record = process_EV(record) record = process_varGQ(record) record = process_multiallelic(record) record = process_unresolved(record) @@ -40,21 +30,6 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record - -def process_EV(record): - for sample in record.samples: - genotype = record.samples[sample] - if EV in genotype and genotype[EV] is not None: - ev_attribute = genotype[EV] - try: - ev_index = int(ev_attribute) - if 0 <= ev_index < len(EV_VALUES): - genotype[EV] = EV_VALUES[ev_index] - except ValueError: - pass - return record - - def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -64,59 +39,55 @@ def process_varGQ(record): record.qual = var_gq return record - def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record - def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record - def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record - def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record - if __name__ == '__main__': + # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') args = parser.parse_args() - # Read noisy and bothsides support events into sets + # Read input files fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) - - # Open input VCF - vcf_in = pysam.VariantFile(args.input_vcf) - - # Modify header - header = vcf_in.header.copy() - add_header_lines(header) - - # Open output VCF - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=header) - - # Process and write variants + if args.input_vcf.endswith('.gz'): + vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=vcf_in.header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=vcf_in.header.copy()) + + # Process records for record in vcf_in: record = process_record(record, fail_set, pass_set) vcf_out.write(record) - + # Close files vcf_in.close() vcf_out.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py index b7d611d41..69c7d16b6 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py @@ -61,7 +61,10 @@ def main(): if args.fout in '- stdout'.split(): fout = sys.stdout else: - fout = open(args.fout, 'w') + if 
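The process_EV block deleted here is superseded by replace_ev_numeric_code_with_string.py (itself updated in this patch to handle gzipped output). For reference, the removed logic mapped a numeric EV code to an evidence string by index into EV_VALUES; a condensed, self-contained version of that behavior:

EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF']

def ev_code_to_string(ev):
    # '2' -> 'SR,PE'; non-numeric or out-of-range codes pass through unchanged.
    try:
        idx = int(ev)
    except (TypeError, ValueError):
        return ev
    return EV_VALUES[idx] if 0 <= idx < len(EV_VALUES) else ev

assert ev_code_to_string('2') == 'SR,PE'
assert ev_code_to_string('RD') == 'RD'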
args.fout.endswith(".gz"): + fout = gzip.open(args.fout, 'wt') + else: + fout = open(args.fout, 'w') while True: line = vcf.readline().rstrip() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 26da02ed5..fb173b146 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -259,9 +259,28 @@ task CleanVcfPreprocess { if [ ! -f "~{vcf}.tbi" ]; then tabix -p vcf ~{vcf} fi + + python /opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py \ + ~{vcf} \ + processed.vcf.gz + + zgrep '^##' processed.vcf.gz > header.txt + + cat <> header.txt + ##FILTER= + ##INFO= + ##INFO= + ##INFO= + EOF + + zgrep '^#CHROM' processed.vcf.gz >> header.txt + + bcftools view processed.vcf.gz | bcftools reheader -h header.txt | bgzip -c > processed.reheader.vcf.gz + + rm header.txt python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ - -V ~{vcf} \ + -V processed.reheader.vcf.gz \ -O ~{output_vcf} \ --fail-list ~{background_list} \ --pass-list ~{bothsides_pass_list} From 766acf63d47b59191cfd2362c5acb8311202a904 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 16:26:54 -0500 Subject: [PATCH 22/40] Undo header.txt addition --- header.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 header.txt diff --git a/header.txt b/header.txt deleted file mode 100644 index e69de29bb..000000000 From 05a17c2e28a41353ab119342a3462099dc5a19c5 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 17:44:47 -0500 Subject: [PATCH 23/40] Modified postprocess function, works now --- header.txt | 92 +++++++++++++++++++ inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_postprocess.py | 58 ++++++------ .../scripts/cleanvcf_preprocess.py | 8 ++ 4 files changed, 132 insertions(+), 32 deletions(-) create mode 100644 header.txt diff --git a/header.txt b/header.txt new file mode 100644 index 000000000..4923d8e8d --- /dev/null +++ b/header.txt @@ -0,0 +1,92 @@ +##fileformat=VCFv4.2 +##FILTER= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##CPX_TYPE_INS_iDEL="Insertion with deletion at insertion site." +##CPX_TYPE_INVdel="Complex inversion with 3' flanking deletion." +##CPX_TYPE_INVdup="Complex inversion with 3' flanking duplication." +##CPX_TYPE_dDUP="Dispersed duplication." +##CPX_TYPE_dDUP_iDEL="Dispersed duplication with deletion at insertion site." +##CPX_TYPE_delINV="Complex inversion with 5' flanking deletion." +##CPX_TYPE_delINVdel="Complex inversion with 5' and 3' flanking deletions." +##CPX_TYPE_delINVdup="Complex inversion with 5' flanking deletion and 3' flanking duplication." +##CPX_TYPE_dupINV="Complex inversion with 5' flanking duplication." +##CPX_TYPE_dupINVdel="Complex inversion with 5' flanking duplication and 3' flanking deletion." +##CPX_TYPE_dupINVdup="Complex inversion with 5' and 3' flanking duplications." +##CPX_TYPE_piDUP_FR="Palindromic inverted tandem duplication, forward-reverse orientation." +##CPX_TYPE_piDUP_RF="Palindromic inverted tandem duplication, reverse-forward orientation." 
+##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##bcftools_viewCommand=view --header-only /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/231aacd9-418b-40b6-a3b4-b32e697697bf/CombineBatches/bb6039d1-7dfd-4710-95df-c121caa22646/call-ClusterDepth/shard-0/VcfClusterSingleChrom/7038f60f-5b99-4e6c-b6de-12c9d5bb0bd6/call-ClusterSingleChrom/ClusterSingleChrom/54e9cb83-bb56-4c6c-aeee-a6291e3d4a09/call-ShardedCluster/shard-0/ShardedCluster/7b5e9498-d3b7-4fe7-ac47-5727b462dfc6/call-ConcatVcfs/brainvar_all_samples.chr1.depth.DEL.clustered.vcf.gz; Date=Wed Sep 18 21:08:50 2024 +##bcftools_viewCommand=view -S ^/cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/ee2e1fee-dc5d-4183-9897-0c7e64b3be56/FilterBatchSamples/0766fc10-de05-4b84-9870-caa80fdc9bdd/call-CatOutliers/brainvar_all_samples.outliers.samples.list --force-samples --no-update /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/d5daf9ab-92fe-401b-97b3-48519a3312f9/FilterBatchSites/e26397de-b866-4f86-bced-1bbb4ed1852b/call-FilterAnnotateVcf/shard-4/brainvar_all_samples.depth.with_evidence.vcf.gz; Date=Tue Sep 17 15:58:20 2024 +##bcftools_viewCommand=view -e 'SVTYPE!="CNV" && COUNT(GT="alt")==0' -O z -o brainvar_all_samples.depth.outliers_removed.vcf.gz; Date=Tue Sep 17 15:58:20 2024 +##bcftools_viewCommand=view -i %ID!=@excluded_vids.list -Oz -o brainvar_all_samples.cluster_batch.depth.chr1.exclude_intervals.vcf.gz /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/b59992f9-22ce-4f42-a75e-ff60defa10a5/ClusterBatch/5988ced3-7983-4868-bb99-99d55a2446b8/call-ClusterDepth/ClusterDepth/27ed208e-9b5a-47b5-849a-5f6e854c312b/call-SVCluster/shard-0/brainvar_all_samples.cluster_batch.depth.chr1.clustered.vcf.gz; Date=Mon Sep 16 17:47:23 2024 +##bcftools_viewVersion=1.15.1+htslib-1.15.1 +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##source=depth +##bcftools_viewVersion=1.21+htslib-1.21 +##bcftools_viewCommand=view -h /Users/kjaising/Desktop/Work/CleanVcf/Postprocess/brainvar_all_samples_gatk.vcf.gz; Date=Thu Dec 12 17:05:30 2024 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HSB092 HSB100 HSB102 HSB103 HSB105 HSB107 HSB112 HSB113 HSB114 HSB115 HSB116 HSB118 HSB119 HSB121 HSB122 HSB127 HSB130 HSB131 HSB132 HSB136 HSB139 HSB142 HSB143 HSB148 HSB149 HSB150 HSB152 HSB153 HSB154 HSB155 HSB159 HSB171 HSB172 HSB173 HSB174 HSB175 HSB178 HSB194 HSB195 HSB221 HSB222 HSB223 HSB238 HSB239 HSB248 HSB260 HSB261 HSB265 HSB267 HSB268 HSB270 HSB271 HSB272 HSB274 HSB275 HSB278 HSB279 HSB282 HSB286 HSB289 HSB292 HSB313 HSB316 HSB321 HSB322 HSB332 HSB337 HSB338 HSB339 HSB340 HSB341 HSB342 HSB343 HSB344 HSB345 HSB388 HSB389 HSB394 HSB395 HSB396 HSB398 HSB411 HSB412 HSB413 HSB414 HSB415 HSB416 HSB417 HSB418 HSB420 HSB421 HSB422 HSB425 HSB427 HSB428 HSB429 HSB430 HSB431 HSB432 HSB433 HSB439 HSB440 HSB442 HSB443 HSB444 HSB445 HSB452 HSB453 HSB454 HSB455 HSB456 HSB457 HSB459 HSB460 HSB461 HSB462 HSB463 HSB464 HSB465 HSB466 HSB467 HSB468 HSB469 HSB470 HSB471 HSB472 HSB473 HSB474 HSB475 HSB476 HSB478 HSB479 HSB480 HSB481 
HSB482 HSB483 HSB484 HSB485 HSB486 HSB487 HSB488 HSB489 HSB490 HSB492 HSB493 HSB494 HSB495 HSB496 HSB497 HSB498 HSB499 HSB500 HSB501 HSB502 HSB503 HSB504 HSB505 HSB506 HSB507 HSB508 HSB509 HSB510 HSB511 HSB513 HSB514 HSB515 HSB516 HSB536 HSB543 HSB544 HSB545 HSB546 HSB547 HSB561 HSB562 HSB563 HSB564 HSB565 HSB566 HSB568 HSB569 HSB571 HSB572 HSB573 HSB577 HSB578 HSB579 HSB583 HSB587 HSB589 HSB590 HSB591 HSB593 HSB594 HSB595 HSB596 HSB597 HSB598 HSB608 HSB615 HSB616 HSB618 HSB619 HSB622 HSB623 HSB624 HSB625 HSB626 HSB627 HSB629 HSB630 HSB631 HSB633 HSB634 HSB637 HSB638 HSB643 HSB644 HSB645 HSB646 HSB649 HSB650 HSB651 HSB652 HSB653 HSB654 HSB657 HSB666 HSB669 HSB670 HSB671 HSB672 HSB674 HSB676 HSB679 diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 7fea73d36..8f7b64fd5 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-e27026bff22524302f76cd7c34370b7cfaf09a11" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index ca4fabc35..16cfeaf04 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -2,37 +2,37 @@ import argparse import pysam +import gzip + -# Constants EV = 'EV' SVTYPE = 'SVTYPE' ME = 'ME' UNR = 'UNR' -FILTER_VCF_INFO_LINES = {'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', 'CLUSTER_MEMBER_IDS'} -FILTER_VCF_LINES = {'ID=UNR', 'ID=BND_DEPTH', 'ID=BND_MATEID', 'ID=CLUSTER_MEMBER_IDS', 'ID=PAIRED_END_READS', 'ID=SPLIT_READS'} - - -def modify_header(header): +FILTER_VCF_INFO_LINES = { + 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', + 
'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED' +} +FILTER_VCF_TEXT_LINES = { + 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 'GATKCommandLine', 'fileformat' +} + +def cleanse_header(header): new_header = pysam.VariantHeader() - # Copy over header lines, excluding some for line in header.records: include_line = True if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: include_line = False + elif any(fv_line in str(line) for fv_line in FILTER_VCF_TEXT_LINES): + include_line = False elif line.type == 'FORMAT' and line.get('ID') == EV: include_line = False elif line.type == 'ALT' and line.get('ID') == UNR: include_line = False - elif any(fv_line in str(line) for fv_line in FILTER_VCF_LINES): - include_line = False if include_line: new_header.add_line(str(line)) - - # Add new header line for EV - new_header.add_line('##FORMAT=') - - # Add samples to header + for sample in header.samples: new_header.add_sample(sample) @@ -55,7 +55,7 @@ def cleanse_info_fields(record): def process_svtype(record): svtype = record.info.get(SVTYPE, None) - # Check for mobile element in alleles + # Skip if variant has mobile element has_mobile_element = False if record.alts: for allele in record.alts: @@ -64,8 +64,6 @@ def process_svtype(record): if symbol == ME: has_mobile_element = True break - - # If SVTYPE is missing or variant has mobile element, skip processing if svtype is None or has_mobile_element: return record @@ -79,10 +77,7 @@ def process_svtype(record): genotype = record.samples[sample] gt = genotype.get('GT', (None, None)) - # Count number of alt alleles alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) - - # Update GT accordingly if alt_count == 1: genotype['GT'] = (0, 1) elif alt_count == 2: @@ -92,21 +87,26 @@ def process_svtype(record): if __name__ == '__main__': + # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') args = parser.parse_args() # Open input VCF - vcf_in = pysam.VariantFile(args.input_vcf) - - # Modify header - new_header = modify_header(vcf_in.header) - - # Open output VCF - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) - - # Process and write variants + if args.input_vcf.endswith('.gz'): + vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + new_header = cleanse_header(vcf_in.header) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=new_header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) + + # Process records for record in vcf_in: record = process_record(record) vcf_out.write(record) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 0326d4fc1..32477a24d 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -4,6 +4,7 @@ import pysam import gzip + VAR_GQ = 'VAR_GQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' @@ -22,6 +23,7 @@ def read_last_column(file_path): result_set.add(columns[-1]) return result_set + def process_record(record, fail_set, pass_set): record = process_varGQ(record) 
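cleanse_header vets each header record twice: a structured check on line.type and ID, plus a raw substring check over str(line). The substring test is broad; any header line whose text contains 'source' or 'bcftools' is dropped, not only the ##source and ##bcftools_* lines themselves. A small usage sketch, assuming only the function above and a hypothetical input file:

import pysam

vcf_in = pysam.VariantFile('postprocess_input.vcf.gz')   # hypothetical path
new_header = cleanse_header(vcf_in.header)

old_lines = {str(l).strip() for l in vcf_in.header.records}
new_lines = {str(l).strip() for l in new_header.records}
for line in sorted(old_lines - new_lines):
    print('dropped:', line)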
record = process_multiallelic(record) @@ -30,6 +32,7 @@ def process_record(record, fail_set, pass_set): record = process_bothsides_support(record, pass_set) return record + def process_varGQ(record): if VAR_GQ in record.info: var_gq = record.info[VAR_GQ] @@ -39,27 +42,32 @@ def process_varGQ(record): record.qual = var_gq return record + def process_multiallelic(record): if MULTIALLELIC in record.info: del record.info[MULTIALLELIC] return record + def process_unresolved(record): if UNRESOLVED in record.info: del record.info[UNRESOLVED] record.filter.add(UNRESOLVED) return record + def process_noisy(record, fail_set): if record.id in fail_set: record.info[HIGH_SR_BACKGROUND] = True return record + def process_bothsides_support(record, pass_set): if record.id in pass_set: record.info[BOTHSIDES_SUPPORT] = True return record + if __name__ == '__main__': # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') From d4033870e659b59c96980ba834241a3e68a5865e Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 12 Dec 2024 17:45:04 -0500 Subject: [PATCH 24/40] Removed header.txt again --- header.txt | 92 ------------------------------------------------------ 1 file changed, 92 deletions(-) delete mode 100644 header.txt diff --git a/header.txt b/header.txt deleted file mode 100644 index 4923d8e8d..000000000 --- a/header.txt +++ /dev/null @@ -1,92 +0,0 @@ -##fileformat=VCFv4.2 -##FILTER= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##ALT= -##CPX_TYPE_INS_iDEL="Insertion with deletion at insertion site." -##CPX_TYPE_INVdel="Complex inversion with 3' flanking deletion." -##CPX_TYPE_INVdup="Complex inversion with 3' flanking duplication." -##CPX_TYPE_dDUP="Dispersed duplication." -##CPX_TYPE_dDUP_iDEL="Dispersed duplication with deletion at insertion site." -##CPX_TYPE_delINV="Complex inversion with 5' flanking deletion." -##CPX_TYPE_delINVdel="Complex inversion with 5' and 3' flanking deletions." -##CPX_TYPE_delINVdup="Complex inversion with 5' flanking deletion and 3' flanking duplication." -##CPX_TYPE_dupINV="Complex inversion with 5' flanking duplication." -##CPX_TYPE_dupINVdel="Complex inversion with 5' flanking duplication and 3' flanking deletion." -##CPX_TYPE_dupINVdup="Complex inversion with 5' and 3' flanking duplications." -##CPX_TYPE_piDUP_FR="Palindromic inverted tandem duplication, forward-reverse orientation." -##CPX_TYPE_piDUP_RF="Palindromic inverted tandem duplication, reverse-forward orientation." 
-##FILTER= -##FILTER= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##FORMAT= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##INFO= -##bcftools_viewCommand=view --header-only /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/231aacd9-418b-40b6-a3b4-b32e697697bf/CombineBatches/bb6039d1-7dfd-4710-95df-c121caa22646/call-ClusterDepth/shard-0/VcfClusterSingleChrom/7038f60f-5b99-4e6c-b6de-12c9d5bb0bd6/call-ClusterSingleChrom/ClusterSingleChrom/54e9cb83-bb56-4c6c-aeee-a6291e3d4a09/call-ShardedCluster/shard-0/ShardedCluster/7b5e9498-d3b7-4fe7-ac47-5727b462dfc6/call-ConcatVcfs/brainvar_all_samples.chr1.depth.DEL.clustered.vcf.gz; Date=Wed Sep 18 21:08:50 2024 -##bcftools_viewCommand=view -S ^/cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/ee2e1fee-dc5d-4183-9897-0c7e64b3be56/FilterBatchSamples/0766fc10-de05-4b84-9870-caa80fdc9bdd/call-CatOutliers/brainvar_all_samples.outliers.samples.list --force-samples --no-update /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/d5daf9ab-92fe-401b-97b3-48519a3312f9/FilterBatchSites/e26397de-b866-4f86-bced-1bbb4ed1852b/call-FilterAnnotateVcf/shard-4/brainvar_all_samples.depth.with_evidence.vcf.gz; Date=Tue Sep 17 15:58:20 2024 -##bcftools_viewCommand=view -e 'SVTYPE!="CNV" && COUNT(GT="alt")==0' -O z -o brainvar_all_samples.depth.outliers_removed.vcf.gz; Date=Tue Sep 17 15:58:20 2024 -##bcftools_viewCommand=view -i %ID!=@excluded_vids.list -Oz -o brainvar_all_samples.cluster_batch.depth.chr1.exclude_intervals.vcf.gz /cromwell_root/fc-122007c3-964c-4cc1-b244-e9d6b2a0ea43/submissions/b59992f9-22ce-4f42-a75e-ff60defa10a5/ClusterBatch/5988ced3-7983-4868-bb99-99d55a2446b8/call-ClusterDepth/ClusterDepth/27ed208e-9b5a-47b5-849a-5f6e854c312b/call-SVCluster/shard-0/brainvar_all_samples.cluster_batch.depth.chr1.clustered.vcf.gz; Date=Mon Sep 16 17:47:23 2024 -##bcftools_viewVersion=1.15.1+htslib-1.15.1 -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##contig= -##source=depth -##bcftools_viewVersion=1.21+htslib-1.21 -##bcftools_viewCommand=view -h /Users/kjaising/Desktop/Work/CleanVcf/Postprocess/brainvar_all_samples_gatk.vcf.gz; Date=Thu Dec 12 17:05:30 2024 -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HSB092 HSB100 HSB102 HSB103 HSB105 HSB107 HSB112 HSB113 HSB114 HSB115 HSB116 HSB118 HSB119 HSB121 HSB122 HSB127 HSB130 HSB131 HSB132 HSB136 HSB139 HSB142 HSB143 HSB148 HSB149 HSB150 HSB152 HSB153 HSB154 HSB155 HSB159 HSB171 HSB172 HSB173 HSB174 HSB175 HSB178 HSB194 HSB195 HSB221 HSB222 HSB223 HSB238 HSB239 HSB248 HSB260 HSB261 HSB265 HSB267 HSB268 HSB270 HSB271 HSB272 HSB274 HSB275 HSB278 HSB279 HSB282 HSB286 HSB289 HSB292 HSB313 HSB316 HSB321 HSB322 HSB332 HSB337 HSB338 HSB339 HSB340 HSB341 HSB342 HSB343 HSB344 HSB345 HSB388 HSB389 HSB394 HSB395 HSB396 HSB398 HSB411 HSB412 HSB413 HSB414 HSB415 HSB416 HSB417 HSB418 HSB420 HSB421 HSB422 HSB425 HSB427 HSB428 HSB429 HSB430 HSB431 HSB432 HSB433 HSB439 HSB440 HSB442 HSB443 HSB444 HSB445 HSB452 HSB453 HSB454 HSB455 HSB456 HSB457 HSB459 HSB460 HSB461 HSB462 HSB463 HSB464 HSB465 HSB466 HSB467 HSB468 HSB469 HSB470 HSB471 HSB472 HSB473 HSB474 HSB475 HSB476 HSB478 HSB479 HSB480 HSB481 
HSB482 HSB483 HSB484 HSB485 HSB486 HSB487 HSB488 HSB489 HSB490 HSB492 HSB493 HSB494 HSB495 HSB496 HSB497 HSB498 HSB499 HSB500 HSB501 HSB502 HSB503 HSB504 HSB505 HSB506 HSB507 HSB508 HSB509 HSB510 HSB511 HSB513 HSB514 HSB515 HSB516 HSB536 HSB543 HSB544 HSB545 HSB546 HSB547 HSB561 HSB562 HSB563 HSB564 HSB565 HSB566 HSB568 HSB569 HSB571 HSB572 HSB573 HSB577 HSB578 HSB579 HSB583 HSB587 HSB589 HSB590 HSB591 HSB593 HSB594 HSB595 HSB596 HSB597 HSB598 HSB608 HSB615 HSB616 HSB618 HSB619 HSB622 HSB623 HSB624 HSB625 HSB626 HSB627 HSB629 HSB630 HSB631 HSB633 HSB634 HSB637 HSB638 HSB643 HSB644 HSB645 HSB646 HSB649 HSB650 HSB651 HSB652 HSB653 HSB654 HSB657 HSB666 HSB669 HSB670 HSB671 HSB672 HSB674 HSB676 HSB679 From 170ec7c38b004f02639b5f82270113b74162fe79 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 10:44:54 -0500 Subject: [PATCH 25/40] Updated header writing --- inputs/values/dockers.json | 6 +++--- .../04_variant_resolution/scripts/cleanvcf_postprocess.py | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 8f7b64fd5..724abee14 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-766acf63d47b59191cfd2362c5acb8311202a904" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index 16cfeaf04..7efe4d330 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -11,12 +11,15 @@ UNR = 'UNR' FILTER_VCF_INFO_LINES = { 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', - 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED' 
+ 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED', 'VARGQ', + 'EVENT', 'REVISED_EVENT', 'MULTI_CNV' } FILTER_VCF_TEXT_LINES = { 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 'GATKCommandLine', 'fileformat' } +# TODO: Remove INFO fields in advance of script: 'MULTI_CNV', 'VARGQ', 'REVISED_EVENT' + def cleanse_header(header): new_header = pysam.VariantHeader() From eed4c575fd8bb26753bb94cf4c398513fb51f099 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 13:08:52 -0500 Subject: [PATCH 26/40] Decommissioned post-process script --- inputs/values/dockers.json | 6 +- .../scripts/cleanvcf_postprocess.py | 119 ------------------ wdl/CleanVcfChromosome.wdl | 12 +- 3 files changed, 10 insertions(+), 127 deletions(-) delete mode 100644 src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 724abee14..0b92b966f 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-d4033870e659b59c96980ba834241a3e68a5865e" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py deleted file mode 100644 index 7efe4d330..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/bin/python - -import argparse -import pysam -import gzip - - -EV = 'EV' -SVTYPE = 'SVTYPE' -ME = 'ME' -UNR = 'UNR' -FILTER_VCF_INFO_LINES = { - 'BND_DEPTH', 'BND_MATEID', 'SPLIT_READS', 'PAIRED_END_READS', - 'CLUSTER_MEMBER_IDS', 'MULTIALLELIC', 'UNRESOLVED', 'VARGQ', - 'EVENT', 'REVISED_EVENT', 'MULTI_CNV' -} -FILTER_VCF_TEXT_LINES = { - 'CIPOS', 'CIEND', 'RMSSTD', 'source', 'bcftools', 
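The TODO above matters for output validity: reheadering away an INFO definition while records still carry that field leaves a malformed VCF, so the fields must be stripped from the records first. Note the tag's actual case is varGQ, as patch 27 fixes in cleanvcf_preprocess.py and as the INFO/varGQ in the bcftools command below uses. A minimal pysam sketch of that record-level cleanup over hypothetical paths:

import pysam

FIELDS = ('MULTI_CNV', 'varGQ', 'REVISED_EVENT')   # from the TODO above

def strip_info(record):
    for field in FIELDS:
        if field in record.info:
            del record.info[field]
    return record

with pysam.VariantFile('in.vcf.gz') as vcf_in, \
     pysam.VariantFile('out.vcf', 'w', header=vcf_in.header) as vcf_out:
    for rec in vcf_in:
        vcf_out.write(strip_info(rec))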
'GATKCommandLine', 'fileformat' -} - -# TODO: Remove INFO fields in advance of script: 'MULTI_CNV', 'VARGQ', 'REVISED_EVENT' - -def cleanse_header(header): - new_header = pysam.VariantHeader() - - for line in header.records: - include_line = True - if line.type == 'INFO' and line.get('ID') in FILTER_VCF_INFO_LINES: - include_line = False - elif any(fv_line in str(line) for fv_line in FILTER_VCF_TEXT_LINES): - include_line = False - elif line.type == 'FORMAT' and line.get('ID') == EV: - include_line = False - elif line.type == 'ALT' and line.get('ID') == UNR: - include_line = False - if include_line: - new_header.add_line(str(line)) - - for sample in header.samples: - new_header.add_sample(sample) - - return new_header - - -def process_record(record): - record = cleanse_info_fields(record) - record = process_svtype(record) - return record - - -def cleanse_info_fields(record): - for field in FILTER_VCF_INFO_LINES: - if field in record.info: - del record.info[field] - return record - - -def process_svtype(record): - svtype = record.info.get(SVTYPE, None) - - # Skip if variant has mobile element - has_mobile_element = False - if record.alts: - for allele in record.alts: - if allele.startswith('<') and allele.endswith('>'): - symbol = allele[1:-1] - if symbol == ME: - has_mobile_element = True - break - if svtype is None or has_mobile_element: - return record - - # Update alleles - ref_allele = record.ref - alt_allele = f'<{svtype}>' - record.alleles = (ref_allele, alt_allele) - - # Update genotypes - for sample in record.samples: - genotype = record.samples[sample] - gt = genotype.get('GT', (None, None)) - - alt_count = sum(1 for allele_index in gt if allele_index is not None and allele_index > 0) - if alt_count == 1: - genotype['GT'] = (0, 1) - elif alt_count == 2: - genotype['GT'] = (1, 1) - - return record - - -if __name__ == '__main__': - # Parse arguments - parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') - parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') - parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') - args = parser.parse_args() - - # Open input VCF - if args.input_vcf.endswith('.gz'): - vcf_in = pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) - else: - vcf_in = pysam.VariantFile(args.input_vcf) - new_header = cleanse_header(vcf_in.header) - - # Open output file - if args.output_vcf.endswith('.gz'): - vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=new_header) - else: - vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=new_header) - - # Process records - for record in vcf_in: - record = process_record(record) - vcf_out.write(record) - - # Close files - vcf_in.close() - vcf_out.close() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index fb173b146..69a1e068e 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -268,7 +268,7 @@ task CleanVcfPreprocess { cat <> header.txt ##FILTER= - ##INFO= + ##INFO= ##INFO= ##INFO= EOF @@ -572,10 +572,12 @@ task CleanVcfPostprocess { if [ ! 
-f "~{vcf}.tbi" ]; then tabix -p vcf ~{vcf} fi - - python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ - -V ~{vcf} \ - -O ~{output_vcf} + + bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ ~{vcf} -o processed.vcf.gz -O z + + bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + + bcftools reheader -h header.txt processed.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} >>> From 078155c3a3f9b502e8b245dec98b1a01809745d6 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 13 Dec 2024 14:22:33 -0500 Subject: [PATCH 27/40] Updated vargq values --- .../04_variant_resolution/scripts/cleanvcf_preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index 32477a24d..d04e0a781 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -5,7 +5,7 @@ import gzip -VAR_GQ = 'VAR_GQ' +VAR_GQ = 'varGQ' MULTIALLELIC = 'MULTIALLELIC' UNRESOLVED = 'UNRESOLVED' HIGH_SR_BACKGROUND = 'HIGH_SR_BACKGROUND' From ffa4439e95ca4809a01862ae5fc3f59d070a5133 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 14:21:19 -0500 Subject: [PATCH 28/40] Minor update to pass correct VCF downstream --- wdl/CleanVcfChromosome.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 69a1e068e..441049c32 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -106,7 +106,7 @@ workflow CleanVcfChromosome { call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvGts.out, + vcf=CleanVcfReviseOverlappingCnvCns.out, outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, From c75c86a03011a1a29cd1b1445c6ec8df91aeee11 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 15:15:05 -0500 Subject: [PATCH 29/40] Created postprocessing file --- .../scripts/cleanvcf_postprocess.py | 46 +++++++++++++++++++ wdl/CleanVcfChromosome.wdl | 6 ++- 2 files changed, 51 insertions(+), 1 deletion(-) create mode 100644 src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py new file mode 100644 index 000000000..c6bbcbc9d --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -0,0 +1,46 @@ +#!/bin/python + +import argparse +import pysam +import gzip + + +def process_record(record): + record = process_svtype(record) + return record + + +def process_svtype(record): + if record.info.get('SVTYPE') == 'DUP': + if not any(':ME' in alt for alt in record.alts): + record.alts = ('',) + return record + + +if __name__ == '__main__': + # Parse arguments + parser = argparse.ArgumentParser(description='CleanVcf postprocessing.') + parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') + parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + args = parser.parse_args() + + # Read input files + if args.input_vcf.endswith('.gz'): + vcf_in = 
pysam.VariantFile(gzip.open(args.input_vcf, 'rt')) + else: + vcf_in = pysam.VariantFile(args.input_vcf) + + # Open output file + if args.output_vcf.endswith('.gz'): + vcf_out = pysam.VariantFile(args.output_vcf, 'wz', header=vcf_in.header) + else: + vcf_out = pysam.VariantFile(args.output_vcf, 'w', header=vcf_in.header.copy()) + + # Process records + for record in vcf_in: + record = process_record(record) + vcf_out.write(record) + + # Close files + vcf_in.close() + vcf_out.close() diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 441049c32..ee2824f65 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -577,8 +577,12 @@ task CleanVcfPostprocess { bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt - bcftools reheader -h header.txt processed.vcf.gz -o ~{output_vcf} + bcftools reheader -h header.txt processed.vcf.gz -o processed.reheader.vcf.gz + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ + -V processed.reheader.vcf.gz \ + -O ~{output_vcf} + tabix -p vcf ~{output_vcf} >>> From 0edef35fe4b3020cab020dd59fdd23d2620f4be9 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 15:38:19 -0500 Subject: [PATCH 30/40] Updated postprocessing order --- inputs/values/dockers.json | 6 +++--- wdl/CleanVcfChromosome.wdl | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 0b92b966f..947cbc84f 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-170ec7c38b004f02639b5f82270113b74162fe79" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11" } \ No newline at end of file diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 
ee2824f65..8e174714b 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -573,15 +573,15 @@ task CleanVcfPostprocess { tabix -p vcf ~{vcf} fi - bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ ~{vcf} -o processed.vcf.gz -O z + python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ + -V ~{vcf} \ + -O processed.vcf.gz - bcftools view -h processed.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ processed.vcf.gz -o processed.annotated.vcf.gz -O z - bcftools reheader -h header.txt processed.vcf.gz -o processed.reheader.vcf.gz + bcftools view -h processed.annotated.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt - python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py \ - -V processed.reheader.vcf.gz \ - -O ~{output_vcf} + bcftools reheader -h header.txt processed.annotated.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} >>> From a2e7381c9f4822c8b171f99d679b394cc4bd60ac Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 6 Jan 2025 16:20:59 -0500 Subject: [PATCH 31/40] Updated header of output VCF to include all required fields --- inputs/values/dockers.json | 6 +++--- .../scripts/cleanvcf_postprocess.py | 5 ++--- wdl/CleanVcfChromosome.wdl | 9 ++++++++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 947cbc84f..bc8f1a448 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-c75c86a03011a1a29cd1b1445c6ec8df91aeee11" + "denovo": 
"us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9" } \ No newline at end of file diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py index c6bbcbc9d..46a31aa00 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_postprocess.py @@ -11,9 +11,8 @@ def process_record(record): def process_svtype(record): - if record.info.get('SVTYPE') == 'DUP': - if not any(':ME' in alt for alt in record.alts): - record.alts = ('',) + if not any(':ME' in alt for alt in record.alts): + record.alts = ('<' + record.info.get('SVTYPE') + '>',) return record diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8e174714b..01d0cea83 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -579,8 +579,15 @@ task CleanVcfPostprocess { bcftools annotate -x INFO/MULTIALLELIC,INFO/UNRESOLVED,INFO/EVENT,INFO/REVISED_EVENT,INFO/MULTI_CNV,INFO/varGQ processed.vcf.gz -o processed.annotated.vcf.gz -O z - bcftools view -h processed.annotated.vcf.gz | grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= header.txt + bcftools view -h processed.annotated.vcf.gz | grep "^##" | \ + grep -v -E "CIPOS|CIEND|RMSSTD|source|bcftools|GATKCommandLine|##FORMAT=|##ALT=|##INFO= temp_header.txt + echo '##INFO=' >> temp_header.txt + echo '##ALT=' >> temp_header.txt + bcftools view -h processed.annotated.vcf.gz | grep "^#CHROM" > chrom_header.txt + + cat temp_header.txt chrom_header.txt > header.txt + bcftools reheader -h header.txt processed.annotated.vcf.gz -o ~{output_vcf} tabix -p vcf ~{output_vcf} From 3aec8d05147587b67924787e3252b32c078fa18c Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 7 Jan 2025 09:54:29 -0500 Subject: [PATCH 32/40] Updated changes pre-testing on gnomad --- inputs/values/dockers.json | 6 ++--- wdl/CleanVcf.wdl | 49 ++++++++++++++++++++------------------ wdl/CleanVcfChromosome.wdl | 17 ++++--------- 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index bc8f1a448..32ac70f24 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -12,8 +12,8 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2024-10-25-v0.29-beta-5ea22a52", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2024-10-25-v0.29-beta-5ea22a52", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2024-10-25-v0.29-beta-5ea22a52", - "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", - "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9", + "sv_pipeline_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac", + "sv_pipeline_qc_docker": "us-central1-docker.pkg.dev/talkowski-training/kj-development/sv-pipeline:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac", "wham_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/wham:2024-10-25-v0.29-beta-5ea22a52", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": 
"us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -28,5 +28,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2024-10-25-v0.29-beta-5ea22a52", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", - "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-0edef35fe4b3020cab020dd59fdd23d2620f4be9" + "denovo": "us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac" } \ No newline at end of file diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index 42974547b..bb64b930e 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -58,12 +58,13 @@ workflow CleanVcf { RuntimeAttr? runtime_attr_create_ploidy # overrides for CleanVcfContig - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_attr_preprocess + RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts + RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns + RuntimeAttr? runtime_attr_revise_large_cnvs + RuntimeAttr? runtime_attr_revise_abnormal_allosomes + RuntimeAttr? runtime_attr_revise_multiallelics + RuntimeAttr? runtime_attr_postprocess RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_attr_format @@ -106,33 +107,35 @@ workflow CleanVcf { input: vcf=complex_genotype_vcfs[i], contig=contig, + chr_x=chr_x, + chr_y=chr_y, + prefix="~{cohort_name}.~{contig}", + background_list=complex_resolve_background_fail_list, - ped_file=ped_file, bothsides_pass_list=complex_resolve_bothside_pass_list, - allosome_fai=allosome_fai, - prefix="~{cohort_name}.~{contig}", - max_shards_per_chrom_step1=max_shards_per_chrom_step1, - min_records_per_shard_step1=min_records_per_shard_step1, - samples_per_step2_shard=samples_per_step2_shard, - max_samples_per_shard_step3=max_samples_per_shard_step3, outlier_samples_list=outlier_samples_list, + ped_file=ped_file, + allosome_fai=allosome_fai, + + HERVK_reference=HERVK_reference, + LINE1_reference=LINE1_reference, + use_hail=use_hail, gcs_project=gcs_project, ploidy_table=CreatePloidyTableFromPed.out, - HERVK_reference=HERVK_reference, - LINE1_reference=LINE1_reference, - chr_x=chr_x, - chr_y=chr_y, + gatk_docker=gatk_docker, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, - runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, - runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, - runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + + runtime_attr_preprocess=runtime_attr_preprocess, + runtime_attr_revise_overlapping_cnv_gts=runtime_attr_revise_overlapping_cnv_gts, + runtime_attr_revise_overlapping_cnv_cns=runtime_attr_revise_overlapping_cnv_cns, + runtime_attr_revise_large_cnvs=runtime_attr_revise_large_cnvs, + runtime_attr_revise_abnormal_allosomes=runtime_attr_revise_abnormal_allosomes, + runtime_attr_revise_multiallelics=runtime_attr_revise_multiallelics, + 
runtime_attr_postprocess=runtime_attr_postprocess, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 01d0cea83..3d8c6dfac 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -9,20 +9,20 @@ workflow CleanVcfChromosome { input { File vcf String contig + String chr_x + String chr_y + String prefix + File background_list File bothsides_pass_list File? outlier_samples_list File ped_file + File ploidy_table File allosome_fai - String prefix File HERVK_reference File LINE1_reference - File ploidy_table - String chr_x - String chr_y - Boolean use_hail String? gcs_project @@ -39,13 +39,6 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics RuntimeAttr? runtime_attr_postprocess - - RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b - RuntimeAttr? runtime_override_clean_vcf_2 - RuntimeAttr? runtime_override_clean_vcf_3 - RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup RuntimeAttr? runtime_override_rescue_me_dels From 581c5c14b9d61edcdf2ab0c9797a783c7c8dfc7b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Thu, 9 Jan 2025 15:41:53 -0500 Subject: [PATCH 33/40] Removed unnecessary runtime attributes --- wdl/CleanVcfChromosome.wdl | 9 --------- 1 file changed, 9 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 3d8c6dfac..8ace1eea6 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -44,20 +44,11 @@ workflow CleanVcfChromosome { RuntimeAttr? runtime_override_rescue_me_dels RuntimeAttr? runtime_attr_add_high_fp_rate_filters - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - RuntimeAttr? runtime_override_preconcat_drc RuntimeAttr? runtime_override_hail_merge_drc RuntimeAttr? runtime_override_fix_header_drc - # overrides for MiniTasks - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_sort_drop_redundant_cnvs RuntimeAttr? 
runtime_attr_format } From be3a8014b614e3beb9b3ab8cd4dc43bfbd0a56de Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Mon, 13 Jan 2025 10:54:16 -0500 Subject: [PATCH 34/40] Added sex revisions for male GT --- .../scripts/cleanvcf_preprocess.py | 94 ++++++++++++++++++- wdl/CleanVcfChromosome.wdl | 6 ++ 2 files changed, 98 insertions(+), 2 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index d04e0a781..d0a144711 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -12,6 +12,7 @@ BOTHSIDES_SUPPORT = 'BOTHSIDES_SUPPORT' REVISED_EVENT = 'REVISED_EVENT' EV_VALUES = ['SR', 'PE', 'SR,PE', 'RD', 'BAF', 'RD,BAF'] +MIN_ALLOSOME_EVENT_SIZE = 50 def read_last_column(file_path): @@ -24,12 +25,13 @@ def read_last_column(file_path): return result_set -def process_record(record, fail_set, pass_set): +def process_record(record, chrX, chrY, fail_set, pass_set): record = process_varGQ(record) record = process_multiallelic(record) record = process_unresolved(record) record = process_noisy(record, fail_set) record = process_bothsides_support(record, pass_set) + record = process_allosomes(record, chrX, chrY) return record @@ -68,16 +70,104 @@ def process_bothsides_support(record, pass_set): return record +def process_allosomes(record, chrX, chrY): + chromosome = record.chrom + if chromosome not in (chrX, chrY): + return record + + updated_samples = [] + sv_type = record.info.get('SVTYPE', '') + sv_len = record.info.get('SVLEN', 0) + + if sv_type in ('DEL', 'DUP') and sv_len >= MIN_ALLOSOME_EVENT_SIZE: + is_y = (chromosome == chrY) + + for sample in record.samples: + genotype = record.samples[sample] + sex = genotype.get('EXPECTED_COPY_NUMBER_FORMAT', None) + + if sex == 1: # Male + if is_revisable_event(record, is_y, sex): + record.info[REVISED_EVENT] = True + adjust_male_genotype(genotype, sv_type) + elif sex == 2 and is_y: # Female + genotype['GT'] = (None, None) # NO_CALL for females on chrY + elif sex == 0: # Unknown + genotype['GT'] = (None, None) # NO_CALL for unknown sex + + updated_samples.append(sample) + + return record + + +def is_revisable_event(record, is_y, sex): + genotypes = record.samples.values() + male_counts = [0, 0, 0, 0] + female_counts = [0, 0, 0, 0] + + for genotype in genotypes: + rd_cn = genotype.get('RD_CN', -1) + rd_cn_val = min(rd_cn, 3) if rd_cn != -1 else -1 + if rd_cn_val == -1: + continue + + if sex == 1: # Male + male_counts[rd_cn_val] += 1 + elif sex == 2: # Female + female_counts[rd_cn_val] += 1 + + male_median = calc_median_distribution(male_counts) + female_median = calc_median_distribution(female_counts) + + return male_median == 2 and (is_y and female_median == 0 or not is_y and female_median == 4) + + +def adjust_male_genotype(genotype, sv_type): + rd_cn = genotype.get('RD_CN', 0) + genotype['RD_CN'] = rd_cn + 1 + ref_allele, alt_allele = genotype['alleles'] + + if sv_type == 'DEL': + if rd_cn >= 1: + genotype['GT'] = (ref_allele, ref_allele) + elif rd_cn == 0: + genotype['GT'] = (ref_allele, alt_allele) + elif sv_type == 'DUP': + if rd_cn <= 1: + genotype['GT'] = (ref_allele, ref_allele) + elif rd_cn == 2: + genotype['GT'] = (ref_allele, alt_allele) + else: + genotype['GT'] = (alt_allele, alt_allele) + + +def calc_median_distribution(counts): + total = sum(counts) + if total == 0: + return -1 + + target = total // 2 + running_total = 0 + for i, 
count in enumerate(counts): + running_total += count + if running_total >= target: + return i * 2 if running_total > target else i * 2 + 1 + + if __name__ == '__main__': # Parse arguments parser = argparse.ArgumentParser(description='CleanVcf preprocessing.') parser.add_argument('-V', '--input', dest='input_vcf', required=True, help='Input VCF file') parser.add_argument('-O', '--output', dest='output_vcf', required=True, help='Output VCF file') + parser.add_argument('--chrX', required=True, help='Chromosome X representation in VCF') + parser.add_argument('--chrY', required=True, help='Chromosome Y representation in VCF') parser.add_argument('--fail-list', required=True, help='File with variants failing the background test') parser.add_argument('--pass-list', required=True, help='File with variants passing both sides') args = parser.parse_args() # Read input files + chrX = args.chrX + chrY = args.chrY fail_set = read_last_column(args.fail_list) pass_set = read_last_column(args.pass_list) if args.input_vcf.endswith('.gz'): @@ -93,7 +183,7 @@ def process_bothsides_support(record, pass_set): # Process records for record in vcf_in: - record = process_record(record, fail_set, pass_set) + record = process_record(record, chrX, chrY, fail_set, pass_set) vcf_out.write(record) # Close files diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 8ace1eea6..0b83a7487 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -65,6 +65,8 @@ workflow CleanVcfChromosome { call CleanVcfPreprocess { input: vcf=FormatVcfToClean.out, + chr_x=chr_x, + chr_y=chr_y, background_list=background_list, bothsides_pass_list=bothsides_pass_list, prefix="~{prefix}.preprocess", @@ -208,6 +210,8 @@ workflow CleanVcfChromosome { task CleanVcfPreprocess { input { File vcf + String chr_x + String chr_y File background_list File bothsides_pass_list String prefix @@ -266,6 +270,8 @@ task CleanVcfPreprocess { python /opt/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py \ -V processed.reheader.vcf.gz \ -O ~{output_vcf} \ + --chrX ~{chr_x} \ + --chrY ~{chr_y} \ --fail-list ~{background_list} \ --pass-list ~{bothsides_pass_list} From 9a6da9d9c8bdb6f5e17cc97806ea803ce3e909d2 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Fri, 17 Jan 2025 17:01:22 -0500 Subject: [PATCH 35/40] Coalesced overlapping cnv tools into one --- .../scripts/cleanvcf_preprocess.py | 2 +- wdl/CleanVcfChromosome.wdl | 265 ++++++++++-------- 2 files changed, 155 insertions(+), 112 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py index d0a144711..f8f1a90cc 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/cleanvcf_preprocess.py @@ -119,7 +119,7 @@ def is_revisable_event(record, is_y, sex): male_median = calc_median_distribution(male_counts) female_median = calc_median_distribution(female_counts) - return male_median == 2 and (is_y and female_median == 0 or not is_y and female_median == 4) + return male_median == 1 and (female_median == 0 if is_y else female_median == 2) def adjust_male_genotype(genotype, sv_type): diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 0b83a7487..dddc27f72 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -33,6 +33,7 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess + RuntimeAttr? 
runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs @@ -74,25 +75,17 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_attr_preprocess } - call CleanVcfReviseOverlappingCnvGts { + call CleanVcfReviseOverlappingCnvs { input: vcf=CleanVcfPreprocess.out, - prefix="~{prefix}.revise_overlapping_cnv_gts", + prefix="~{prefix}.revise_overlapping_cnvs", gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnv_gts - } - - call CleanVcfReviseOverlappingCnvCns { - input: - vcf=CleanVcfReviseOverlappingCnvGts.out, - prefix="~{prefix}.revise_overlapping_cnv_cns", - gatk_docker=gatk_docker, - runtime_attr_override=runtime_attr_revise_overlapping_cnv_cns + runtime_attr_override=runtime_attr_revise_overlapping_cnvs } call CleanVcfReviseLargeCnvs { input: - vcf=CleanVcfReviseOverlappingCnvCns.out, + vcf=CleanVcfReviseOverlappingCnvs.out, outlier_samples_list=outlier_samples_list, prefix="~{prefix}.revise_large_cnvs", gatk_docker=gatk_docker, @@ -220,13 +213,13 @@ task CleanVcfPreprocess { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -284,6 +277,54 @@ task CleanVcfPreprocess { } } +task CleanVcfReviseOverlappingCnvs { + input { + File vcf + String prefix + String gatk_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: gatk_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) + String output_vcf = "~{prefix}.vcf.gz" + + command <<< + set -euo pipefail + + if [ ! 
-f "~{vcf}.tbi" ]; then + tabix -p vcf ~{vcf} + fi + + gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvs \ + -V ~{vcf} \ + -O ~{output_vcf} + >>> + + output { + File out="~{output_vcf}" + File out_idx="~{output_vcf}.tbi" + } +} + task CleanVcfReviseOverlappingCnvGts { input { File vcf @@ -293,13 +334,13 @@ task CleanVcfReviseOverlappingCnvGts { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -341,13 +382,13 @@ task CleanVcfReviseOverlappingCnvCns { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -390,13 +431,15 @@ task CleanVcfReviseLargeCnvs { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -439,13 +482,13 @@ task CleanVcfReviseAbnormalAllosomes { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -487,13 +530,13 @@ task CleanVcfReviseMultiallelicCnvs { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -535,13 +578,13 @@ task CleanVcfPostprocess { } RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GB") * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, 
runtime_default]) runtime { memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" @@ -740,13 +783,13 @@ task StitchFragmentedCnvs { Float input_size = size(vcf, "GB") RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) @@ -781,53 +824,53 @@ task StitchFragmentedCnvs { # Add FILTER status for pockets of variants with high FP rate: wham-only DELs and Scramble-only SVAs with HIGH_SR_BACKGROUND task AddHighFDRFilters { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <") - with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: - for record in fin: - if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ - (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): - record.filter.add('HIGH_ALGORITHM_FDR') - fo.write(record) -CODE - >>> + python <") + with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: + for record in fin: + if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ + 
(record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): + record.filter.add('HIGH_ALGORITHM_FDR') + fo.write(record) + CODE + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } From db4dfca3cfa739952c8cb590c301c8e12e967e4b Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 21 Jan 2025 15:11:50 -0500 Subject: [PATCH 36/40] Updated workflow inputs to reflect one-tool approach to overlapping CNVs --- wdl/CleanVcf.wdl | 21 +- wdl/CleanVcfChromosome.wdl | 570 +++++++++++++++---------------------- 2 files changed, 238 insertions(+), 353 deletions(-) diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index bb64b930e..ad166b18a 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -59,8 +59,7 @@ workflow CleanVcf { # overrides for CleanVcfContig RuntimeAttr? runtime_attr_preprocess - RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts - RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns + RuntimeAttr? runtime_attr_revise_overlapping_cnvs RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics @@ -70,19 +69,11 @@ workflow CleanVcf { RuntimeAttr? runtime_attr_format RuntimeAttr? runtime_override_rescue_me_dels - RuntimeAttr? runtime_override_preconcat_step1 - RuntimeAttr? runtime_override_hail_merge_step1 - RuntimeAttr? runtime_override_fix_header_step1 - RuntimeAttr? runtime_override_preconcat_drc RuntimeAttr? runtime_override_hail_merge_drc RuntimeAttr? runtime_override_fix_header_drc - RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_split_include_list - RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_drop_redundant_cnvs - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs } @@ -130,25 +121,17 @@ workflow CleanVcf { sv_pipeline_docker=sv_pipeline_docker, runtime_attr_preprocess=runtime_attr_preprocess, - runtime_attr_revise_overlapping_cnv_gts=runtime_attr_revise_overlapping_cnv_gts, - runtime_attr_revise_overlapping_cnv_cns=runtime_attr_revise_overlapping_cnv_cns, + runtime_attr_revise_overlapping_cnvs=runtime_attr_revise_overlapping_cnvs, runtime_attr_revise_large_cnvs=runtime_attr_revise_large_cnvs, runtime_attr_revise_abnormal_allosomes=runtime_attr_revise_abnormal_allosomes, runtime_attr_revise_multiallelics=runtime_attr_revise_multiallelics, runtime_attr_postprocess=runtime_attr_postprocess, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, - runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_split_include_list=runtime_override_split_include_list, - runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, - runtime_override_preconcat_step1=runtime_override_preconcat_step1, - runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, - runtime_override_fix_header_step1=runtime_override_fix_header_step1, runtime_override_preconcat_drc=runtime_override_preconcat_drc, runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, runtime_attr_format=runtime_attr_format, runtime_override_rescue_me_dels=runtime_override_rescue_me_dels diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index dddc27f72..72b499154 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -34,8 +34,6 @@ workflow CleanVcfChromosome { # overrides for local tasks RuntimeAttr? runtime_attr_preprocess RuntimeAttr? runtime_attr_revise_overlapping_cnvs - RuntimeAttr? runtime_attr_revise_overlapping_cnv_gts - RuntimeAttr? runtime_attr_revise_overlapping_cnv_cns RuntimeAttr? runtime_attr_revise_large_cnvs RuntimeAttr? runtime_attr_revise_abnormal_allosomes RuntimeAttr? runtime_attr_revise_multiallelics @@ -325,102 +323,6 @@ task CleanVcfReviseOverlappingCnvs { } } -task CleanVcfReviseOverlappingCnvGts { - input { - File vcf - String prefix - String gatk_docker - RuntimeAttr? 
runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) - String output_vcf = "~{prefix}.vcf.gz" - - command <<< - set -euo pipefail - - if [ ! -f "~{vcf}.tbi" ]; then - tabix -p vcf ~{vcf} - fi - - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvGts \ - -V ~{vcf} \ - -O ~{output_vcf} - >>> - - output { - File out="~{output_vcf}" - File out_idx="~{output_vcf}.tbi" - } -} - -task CleanVcfReviseOverlappingCnvCns { - input { - File vcf - String prefix - String gatk_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: gatk_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) - String output_vcf = "~{prefix}.vcf.gz" - - command <<< - set -euo pipefail - - if [ ! -f "~{vcf}.tbi" ]; then - tabix -p vcf ~{vcf} - fi - - gatk --java-options "-Xmx~{java_mem_mb}m" SVReviseOverlappingCnvCns \ - -V ~{vcf} \ - -O ~{output_vcf} - >>> - - output { - File out="~{output_vcf}" - File out_idx="~{output_vcf}.tbi" - } -} - task CleanVcfReviseLargeCnvs { input { File vcf @@ -633,297 +535,297 @@ task CleanVcfPostprocess { } task RescueMobileElementDeletions { - input { - File vcf - String prefix - File LINE1 - File HERVK - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + File LINE1 + File HERVK + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{LINE1} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_LINE1/' > manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv + bedtools coverage -wo -a ~{prefix}.bnd_del.bed.gz -b ~{HERVK} | awk '{if ($NF>.5) print}' | cut -f4 | sed -e 's/$/\tDEL\tPASS\toverlap_HERVK/' >> manual_revise.MEI_DEL_from_BND.SVID_SVTYPE_FILTER_INFO.tsv - python <',) - if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': - record.alts = ('',) - fo.write(record) + if record.id in hash_MEI_DEL_reset.keys(): + del record.filter['UNRESOLVED'] + record.info['SVTYPE'] = 'DEL' + record.info['SVLEN'] = record.info['END2'] - record.start + record.stop = record.info['END2'] + record.info.pop("CHR2") + record.info.pop("END2") + record.info.pop("UNRESOLVED_TYPE") + if hash_MEI_DEL_reset[record.id] == 'overlap_LINE1': + record.alts = ('',) + if hash_MEI_DEL_reset[record.id] == 'overlap_HERVK': + record.alts = ('',) + fo.write(record) fin.close() fo.close() CODE - >>> + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Remove CNVs that are redundant with CPX events or other CNVs task DropRedundantCnvs { - input { - File vcf - String prefix - String contig - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String contig + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - Float input_size = size(vcf, "GiB") - # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: - # in tests on large VCFs, memory usage is ~1.0 * input VCF size - # the biggest disk usage is at the end of the task, with input + output VCF on disk - Int cpu_cores = 2 # speed up compression / decompression of VCFs - RuntimeAttr runtime_default = object { - mem_gb: 3.75 + input_size * 1.5, - disk_gb: ceil(100.0 + input_size * 2.0), - cpu_cores: cpu_cores, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs + RuntimeAttr runtime_default = object { + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ - ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp - >>> + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Stitch fragmented RD-only calls found in 100% of the same samples task StitchFragmentedCnvs { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 2), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) - Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) - - runtime { - memory: "~{mem_gb} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - command <<< - set -euo pipefail - echo "First pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ - | bgzip \ - > tmp.vcf.gz - rm ~{vcf} - echo "Second pass..." - java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ - | bgzip \ - > ~{prefix}.vcf.gz - >>> + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File stitched_vcf_shard = "~{prefix}.vcf.gz" - } + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" + } } # Add FILTER status for pockets of variants with high FP rate: wham-only DELs and Scramble-only SVAs with HIGH_SR_BACKGROUND task AddHighFDRFilters { - input { - File vcf - String prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 3.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - command <<< - set -euo pipefail + command <<< + set -euo pipefail - python <") - with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: - for record in fin: - if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ - (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): - record.filter.add('HIGH_ALGORITHM_FDR') - fo.write(record) - CODE - >>> + python <") + with pysam.VariantFile("~{prefix}.vcf.gz", 'w', header=header) as fo: + for record in fin: + if (record.info['ALGORITHMS'] == ('wham',) and record.info['SVTYPE'] == 'DEL') or \ + (record.info['ALGORITHMS'] == ('scramble',) and record.info['HIGH_SR_BACKGROUND'] and record.alts == ('',)): + record.filter.add('HIGH_ALGORITHM_FDR') + fo.write(record) +CODE + >>> - output { - File out = "~{prefix}.vcf.gz" - } + output { + File out = "~{prefix}.vcf.gz" + } } # Final VCF cleanup task FinalCleanup { - input { - File vcf - String contig - String prefix - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 - RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + input { + File vcf + String contig + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } - command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} stdout \ - | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz - tabix ~{prefix}.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + Float base_mem_gb = 2.0 + Float input_mem_scale = 3.0 + Float input_disk_scale = 5.0 + RuntimeAttr runtime_default = object { + mem_gb: base_mem_gb + input_size * input_mem_scale, + disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File final_cleaned_shard = "~{prefix}.vcf.gz" - File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" - } + command <<< + set -eu -o pipefail + + /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ + --chrom ~{contig} \ + --prefix ~{prefix} \ + ~{vcf} stdout \ + | bcftools annotate --no-version -e 'SVTYPE=="CNV" && SVLEN<5000' -x INFO/MEMBERS -Oz -o ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File final_cleaned_shard = "~{prefix}.vcf.gz" + File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" + } } \ No newline at end of file From 24fc04e4cc75fe6edf56147952008abe7025ce03 Mon Sep 
17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 22 Jan 2025 09:58:58 -0500 Subject: [PATCH 37/40] Readded calcAF.wdl --- wdl/CalcAF.wdl | 179 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 wdl/CalcAF.wdl diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl new file mode 100644 index 000000000..064c3b28a --- /dev/null +++ b/wdl/CalcAF.wdl @@ -0,0 +1,179 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as tmc + +workflow CalcAF { + input { + File vcf + File vcf_idx + Int sv_per_shard + String prefix + String sv_pipeline_docker + File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample + File? famfile #Used for M/F AF calculations + File? par_bed #Used for marking hemizygous males on X & Y + File? allosomes_list #allosomes .fai used to override default sex chromosome assignments + String? drop_empty_records + + RuntimeAttr? runtime_attr_compute_shard_af + RuntimeAttr? runtime_attr_scatter_vcf + RuntimeAttr? runtime_attr_combine_sharded_vcfs + } + + + # Tabix to chromosome of interest, and shard input VCF for stats collection + call tmc.ScatterVcf { + input: + vcf=vcf, + prefix=prefix, + sv_pipeline_docker=sv_pipeline_docker, + records_per_shard=sv_per_shard, + runtime_attr_override = runtime_attr_scatter_vcf + } + + # Scatter over VCF shards + scatter ( shard in ScatterVcf.shards ) { + # Collect AF summary stats + call ComputeShardAFs { + input: + vcf=shard, + sv_pipeline_docker=sv_pipeline_docker, + prefix=prefix, + sample_pop_assignments=sample_pop_assignments, + famfile=famfile, + par_bed=par_bed, + allosomes_list=allosomes_list, + runtime_attr_override = runtime_attr_compute_shard_af + } + } + + # Merge shards into single VCF + call CombineShardedVcfs { + input: + vcfs=ComputeShardAFs.shard_wAFs, + sv_pipeline_docker=sv_pipeline_docker, + prefix=prefix, + drop_empty_records=drop_empty_records, + runtime_attr_override = runtime_attr_combine_sharded_vcfs + } + + # Final output + output { + File vcf_wAFs = CombineShardedVcfs.vcf_out + File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx + } +} + +# Subset a vcf to a single chromosome, and add global AF information (no subpop) +task ComputeShardAFs { + input { + File vcf + String prefix + String sv_pipeline_docker + File? sample_pop_assignments + File? famfile + File? par_bed + File? allosomes_list + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1.5, + disk_gb: ceil(20 + size(vcf, "GB") * 2), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + set -euo pipefail + optionals=" " + if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" + fi + if [ ~{default="SKIP" famfile} != "SKIP" ]; then + optionals="$( echo "$optionals" ) -f ~{famfile}" + fi + if [ ~{default="SKIP" par_bed} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --par ~{par_bed}" + fi + if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then + optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" + fi + echo -e "OPTIONALS INTERPRETED AS: $optionals" + echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" + #Tabix chromosome of interest & compute AN, AC, and AF + /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + >>> + + output { + File shard_wAFs = "~{prefix}.wAFs.vcf.gz" + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + + +# Merge VCF shards & drop records with zero remaining non-ref alleles +task CombineShardedVcfs { + input { + Array[File] vcfs + String prefix + String sv_pipeline_docker + String? drop_empty_records + RuntimeAttr? 
runtime_attr_override + } + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command { + set -euo pipefail + vcf-concat -f ~{write_lines(vcfs)} \ + | vcf-sort \ + > merged.vcf + if [ ~{default="TRUE" drop_empty_records} == "TRUE" ]; then + /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ + merged.vcf stdout \ + | bgzip -c \ + > "~{prefix}.wAFs.vcf.gz" + else + cat merged.vcf | bgzip -c > "~{prefix}.wAFs.vcf.gz" + fi + tabix -p vcf "~{prefix}.wAFs.vcf.gz" + } + + + output { + File vcf_out = "~{prefix}.wAFs.vcf.gz" + File vcf_out_idx = "~{prefix}.wAFs.vcf.gz.tbi" + } + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} + From c588ab1f2d7ede1f3aef339c9639797ecb58e203 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 22 Jan 2025 10:01:38 -0500 Subject: [PATCH 38/40] Removed calcaf.wdl --- wdl/CalcAF.wdl | 179 ------------------------------------------------- 1 file changed, 179 deletions(-) delete mode 100644 wdl/CalcAF.wdl diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl deleted file mode 100644 index 064c3b28a..000000000 --- a/wdl/CalcAF.wdl +++ /dev/null @@ -1,179 +0,0 @@ -version 1.0 - -import "Structs.wdl" -import "TasksMakeCohortVcf.wdl" as tmc - -workflow CalcAF { - input { - File vcf - File vcf_idx - Int sv_per_shard - String prefix - String sv_pipeline_docker - File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample - File? famfile #Used for M/F AF calculations - File? par_bed #Used for marking hemizygous males on X & Y - File? allosomes_list #allosomes .fai used to override default sex chromosome assignments - String? drop_empty_records - - RuntimeAttr? runtime_attr_compute_shard_af - RuntimeAttr? runtime_attr_scatter_vcf - RuntimeAttr? 
runtime_attr_combine_sharded_vcfs - } - - - # Tabix to chromosome of interest, and shard input VCF for stats collection - call tmc.ScatterVcf { - input: - vcf=vcf, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker, - records_per_shard=sv_per_shard, - runtime_attr_override = runtime_attr_scatter_vcf - } - - # Scatter over VCF shards - scatter ( shard in ScatterVcf.shards ) { - # Collect AF summary stats - call ComputeShardAFs { - input: - vcf=shard, - sv_pipeline_docker=sv_pipeline_docker, - prefix=prefix, - sample_pop_assignments=sample_pop_assignments, - famfile=famfile, - par_bed=par_bed, - allosomes_list=allosomes_list, - runtime_attr_override = runtime_attr_compute_shard_af - } - } - - # Merge shards into single VCF - call CombineShardedVcfs { - input: - vcfs=ComputeShardAFs.shard_wAFs, - sv_pipeline_docker=sv_pipeline_docker, - prefix=prefix, - drop_empty_records=drop_empty_records, - runtime_attr_override = runtime_attr_combine_sharded_vcfs - } - - # Final output - output { - File vcf_wAFs = CombineShardedVcfs.vcf_out - File vcf_wAFs_idx = CombineShardedVcfs.vcf_out_idx - } -} - -# Subset a vcf to a single chromosome, and add global AF information (no subpop) -task ComputeShardAFs { - input { - File vcf - String prefix - String sv_pipeline_docker - File? sample_pop_assignments - File? famfile - File? par_bed - File? allosomes_list - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 1.5, - disk_gb: ceil(20 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - set -euo pipefail - optionals=" " - if [ ~{default="SKIP" sample_pop_assignments} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -p ~{sample_pop_assignments}" - fi - if [ ~{default="SKIP" famfile} != "SKIP" ]; then - optionals="$( echo "$optionals" ) -f ~{famfile}" - fi - if [ ~{default="SKIP" par_bed} != "SKIP" ]; then - optionals="$( echo "$optionals" ) --par ~{par_bed}" - fi - if [ ~{default="SKIP" allosomes_list} != "SKIP" ]; then - optionals="$( echo "$optionals" ) --allosomes-list ~{allosomes_list}" - fi - echo -e "OPTIONALS INTERPRETED AS: $optionals" - echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $( echo "$optionals" ) ~{vcf} stdout" - #Tabix chromosome of interest & compute AN, AC, and AF - /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "~{vcf}" stdout \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - >>> - - output { - File shard_wAFs = "~{prefix}.wAFs.vcf.gz" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - -# Merge VCF shards & drop records with zero remaining non-ref alleles -task CombineShardedVcfs { - input { - Array[File] vcfs - String prefix - String sv_pipeline_docker - String? drop_empty_records - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 50, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command { - set -euo pipefail - vcf-concat -f ~{write_lines(vcfs)} \ - | vcf-sort \ - > merged.vcf - if [ ~{default="TRUE" drop_empty_records} == "TRUE" ]; then - /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \ - merged.vcf stdout \ - | bgzip -c \ - > "~{prefix}.wAFs.vcf.gz" - else - cat merged.vcf | bgzip -c > "~{prefix}.wAFs.vcf.gz" - fi - tabix -p vcf "~{prefix}.wAFs.vcf.gz" - } - - - output { - File vcf_out = "~{prefix}.wAFs.vcf.gz" - File vcf_out_idx = "~{prefix}.wAFs.vcf.gz.tbi" - } - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - From a6588c664bf64b8c9b10a55b6b78f840c6a81eaf Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 29 Jan 2025 16:26:33 -0500 Subject: [PATCH 39/40] Minor changes to make wdl more readable --- wdl/CleanVcfChromosome.wdl | 185 ++----------------------------------- 1 file changed, 7 insertions(+), 178 deletions(-) diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 6b13dbceb..cdbae0c2b 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -26,15 +26,12 @@ workflow CleanVcfChromosome { Boolean use_hail String? gcs_project -<<<<<<< HEAD String gatk_docker String linux_docker String sv_base_mini_docker String sv_pipeline_docker -======= File? svtk_to_gatk_script # For debugging File? make_clean_gq_script ->>>>>>> main # overrides for local tasks RuntimeAttr? 
runtime_attr_preprocess @@ -160,144 +157,14 @@ workflow CleanVcfChromosome { runtime_attr_override=runtime_override_stitch_fragmented_cnvs } -<<<<<<< HEAD call RescueMobileElementDeletions { -======= - call MiniTasks.SplitUncompressed as SplitIncludeList { - input: - whole_file=CleanVcf1a.include_list[0], - lines_per_shard=samples_per_step2_shard, - shard_prefix="~{prefix}.split_include_list.", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_split_include_list - } - - scatter ( i in range(length(SplitIncludeList.shards)) ){ - call CleanVcf2 { - input: - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_2.shard_~{i}", - include_list=SplitIncludeList.shards[i], - multi_cnvs=CleanVcf1b.multi, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_2 - } - } - - call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { - input: - shards=CleanVcf2.out, - outfile_name="~{prefix}.combine_clean_vcf_2.txt", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_clean_vcf_2 - } - - call CleanVcf3 { - input: - rd_cn_revise=CombineCleanVcf2.outfile, - max_samples_shard = max_samples_per_shard_step3, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_3 - } - - scatter ( i in range(length(CleanVcf3.shards)) ){ - call CleanVcf4 { - input: - rd_cn_revise=CleanVcf3.shards[i], - normal_revise_vcf=CleanVcf1b.normal, - prefix="~{prefix}.clean_vcf_4.shard_~{i}", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_4 - } - } - - call MiniTasks.CatUncompressedFiles as CombineRevised4 { - input: - shards=CleanVcf4.out, - outfile_name="~{prefix}.combine_revised_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_revised_4 - } - - call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { - input: - shards=CleanVcf4.multi_ids, - outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_multi_ids_4 - } - - call c5.CleanVcf5 { - input: - revise_vcf_lines=CombineRevised4.outfile, - normal_revise_vcf=CleanVcf1b.normal, - ped_file=ped_file, - sex_chr_revise=CombineStep1SexChrRevisions.outfile, - multi_ids=CombineMultiIds4.outfile, - outlier_samples_list=outlier_samples_list, - contig=contig, - prefix="~{prefix}.clean_vcf_5", - records_per_shard=clean_vcf5_records_per_shard, - threads_per_task=clean_vcf5_threads_per_task, - make_clean_gq_script=make_clean_gq_script, - sv_pipeline_docker=sv_pipeline_docker, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, - runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, - runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, - runtime_attr_override_polish=runtime_override_clean_vcf_5_polish - } - - call DropRedundantCnvs { - input: - vcf=CleanVcf5.polished, - prefix="~{prefix}.drop_redundant_cnvs", - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_drop_redundant_cnvs - } - - if (use_hail) { - call HailMerge.HailMerge as SortDropRedundantCnvsHail { - input: - vcfs=[DropRedundantCnvs.out], - prefix="~{prefix}.drop_redundant_cnvs.sorted", - gcs_project=gcs_project, - reset_cnv_gts=true, - sv_base_mini_docker=sv_base_mini_docker, - 
sv_pipeline_docker=sv_pipeline_docker, - runtime_override_preconcat=runtime_override_preconcat_drc, - runtime_override_hail_merge=runtime_override_hail_merge_drc, - runtime_override_fix_header=runtime_override_fix_header_drc - } - } - if (!use_hail) { - call MiniTasks.SortVcf as SortDropRedundantCnvs { - input: - vcf=DropRedundantCnvs.out, - outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_sort_drop_redundant_cnvs - } - } - - call StitchFragmentedCnvs { - input: - vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), - prefix="~{prefix}.stitch_fragmented_cnvs", - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_stitch_fragmented_cnvs - } - - call RescueMobileElementDeletions { ->>>>>>> main - input: - vcf = StitchFragmentedCnvs.stitched_vcf_shard, - prefix = "~{prefix}.rescue_me_dels", - LINE1 = LINE1_reference, - HERVK = HERVK_reference, - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_rescue_me_dels + input: + vcf = StitchFragmentedCnvs.stitched_vcf_shard, + prefix = "~{prefix}.rescue_me_dels", + LINE1 = LINE1_reference, + HERVK = HERVK_reference, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override = runtime_override_rescue_me_dels } call AddHighFDRFilters { @@ -585,46 +452,8 @@ task CleanVcfReviseMultiallelicCnvs { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } -<<<<<<< HEAD Int java_mem_mb = ceil(select_first([runtime_override.mem_gb, runtime_default.mem_gb]) * 1000 * 0.7) String output_vcf = "~{prefix}.vcf.gz" -======= - vcf = pysam.VariantFile(VCF_FILE) - # Max sample count with PE or SR GT over 3 - max_vf = max(len(vcf.header.samples) * 0.01, 2) - record_start = (batch_num - 1) * segments - record_end = batch_num * segments - record_idx = 0 - print("{} {} {}".format(max_vf, record_start, record_end)) - multi_geno_ids = set([]) - for record in vcf: - record_idx += 1 - if record_idx < record_start: - continue - elif record_idx > record_end: - break - num_gt_over_2 = 0 - for sid in record.samples: - s = record.samples[sid] - # Pick best GT - if s.get('PE_GT') is None: - continue - elif s.get('SR_GT') is None: - gt = s.get('PE_GT') - elif s.get('PE_GT') > 0 and s.get('SR_GT') == 0: - gt = s.get('PE_GT') - elif s.get('PE_GT') == 0: - gt = s.get('SR_GT') - elif s.get('PE_GQ') >= s.get('SR_GQ'): - gt = s.get('PE_GT') - else: - gt = s.get('SR_GT') - if gt > 2: - num_gt_over_2 += 1 - if num_gt_over_2 > max_vf: - multi_geno_ids.add(record.id) - vcf.close() ->>>>>>> main command <<< set -euo pipefail From 835f5b023da8ee84de58abb771c85c8b58b063f7 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Wed, 29 Jan 2025 16:40:30 -0500 Subject: [PATCH 40/40] Removed head from merge conflict --- inputs/values/dockers.json | 4 ---- 1 file changed, 4 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 1719a4602..b4e41f574 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -25,9 +25,5 @@ "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2025-01-06-v1.0.1-e902bf4e", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-tb-form-sv-filter-training-data-899360a", "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6", -<<<<<<< HEAD - "denovo": 
"us-central1-docker.pkg.dev/talkowski-training/kj-development/denovo:kj-clean-vcf-a2e7381c9f4822c8b171f99d679b394cc4bd60ac" -======= "denovo": "us.gcr.io/broad-dsde-methods/gatk-sv/denovo:2025-01-14-v1.0.1-88dbd052" ->>>>>>> main } \ No newline at end of file