From 59d716f238ed09962e76cd71a048ca9b70059b05 Mon Sep 17 00:00:00 2001 From: George Grant Date: Mon, 13 May 2024 14:18:41 -0400 Subject: [PATCH] VS-1368 The tarball is too damn big (#8829) * Compress the tarball saves a bit. * Remove unused contigs from interval_list files by grepping. --------- Co-authored-by: Miguel Covarrubias --- scripts/variantstore/wdl/GvsUtils.wdl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index bfb1b47aff7..448e3839ffd 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -286,18 +286,26 @@ task SplitIntervalsTarred { export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} - mkdir interval-files + mkdir orig-interval-files gatk --java-options "-Xmx~{java_memory}g" ~{gatk_tool} \ --dont-mix-contigs \ -R ~{ref_fasta} \ ~{"-L " + intervals} \ ~{"--weight-bed-file " + interval_weights_bed} \ -scatter ~{scatter_count} \ - -O interval-files \ + -O orig-interval-files \ ~{"--extension " + intervals_file_extension} \ --interval-file-num-digits 10 \ ~{split_intervals_extra_args} + mkdir interval-files + + # Take the original interval_list files and remove from their headers all of the unused hg38 contigs + for filename in orig-interval-files/*.interval_list; do + f1=$(basename "$filename") + cat $filename | grep -E -v 'SN\:(chr.*_alt|chr.*_random|chrUn_|HLA-|chrEBV)' > "interval-files/$f1.interval_list" + done + # Print all the interval filenames with their relative paths to a file find interval-files -mindepth 1 -maxdepth 1 | sort > interval_list_list.txt @@ -305,11 +313,11 @@ task SplitIntervalsTarred { OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//') if [ -n "$OUTPUT_GCS_DIR" ]; then - gsutil -m cp -r "interval-files" $OUTPUT_GCS_DIR/ + gsutil -m cp -r "interval-files" $OUTPUT_GCS_DIR/ fi # Tar up the interval file directory - tar -cf interval-files.tar interval-files + tar -czf interval-files.tar.gz interval-files >>> runtime { @@ -322,7 +330,7 @@ task SplitIntervalsTarred { } output { - File interval_files_tar = "interval-files.tar" + File interval_files_tar = "interval-files.tar.gz" Array[String] interval_filenames = read_lines("interval_list_list.txt") File monitoring_log = "monitoring.log" }