Skip to content

Commit

Permalink
VS-1368 The tarball is too damn big (#8829)
Browse files Browse the repository at this point in the history
* Compress the tarball saves a bit.
* Remove unused contigs from interval_list files by grepping.
---------

Co-authored-by: Miguel Covarrubias <[email protected]>
  • Loading branch information
2 people authored and RoriCremer committed May 23, 2024
1 parent 4bd5a4f commit 59d716f
Showing 1 changed file with 13 additions and 5 deletions.
18 changes: 13 additions & 5 deletions scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -286,30 +286,38 @@ task SplitIntervalsTarred {

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}

mkdir interval-files
mkdir orig-interval-files
gatk --java-options "-Xmx~{java_memory}g" ~{gatk_tool} \
--dont-mix-contigs \
-R ~{ref_fasta} \
~{"-L " + intervals} \
~{"--weight-bed-file " + interval_weights_bed} \
-scatter ~{scatter_count} \
-O interval-files \
-O orig-interval-files \
~{"--extension " + intervals_file_extension} \
--interval-file-num-digits 10 \
~{split_intervals_extra_args}

mkdir interval-files

# Take the original interval_list files and remove from their headers all of the unused hg38 contigs
for filename in orig-interval-files/*.interval_list; do
f1=$(basename "$filename")
cat $filename | grep -E -v 'SN\:(chr.*_alt|chr.*_random|chrUn_|HLA-|chrEBV)' > "interval-files/$f1.interval_list"
done

# Print all the interval filenames with their relative paths to a file
find interval-files -mindepth 1 -maxdepth 1 | sort > interval_list_list.txt

# Drop trailing slash if one exists
OUTPUT_GCS_DIR=$(echo ~{output_gcs_dir} | sed 's/\/$//')

if [ -n "$OUTPUT_GCS_DIR" ]; then
gsutil -m cp -r "interval-files" $OUTPUT_GCS_DIR/
gsutil -m cp -r "interval-files" $OUTPUT_GCS_DIR/
fi

# Tar up the interval file directory
tar -cf interval-files.tar interval-files
tar -czf interval-files.tar.gz interval-files
>>>

runtime {
Expand All @@ -322,7 +330,7 @@ task SplitIntervalsTarred {
}

output {
File interval_files_tar = "interval-files.tar"
File interval_files_tar = "interval-files.tar.gz"
Array[String] interval_filenames = read_lines("interval_list_list.txt")
File monitoring_log = "monitoring.log"
}
Expand Down

0 comments on commit 59d716f

Please sign in to comment.