broadinstitute · VJalili · Dec 8, 2023 · Nov 22, 2023 · Dec 1, 2023 · Dec 8, 2023
diff --git a/wdl/GatherSampleEvidence.wdl b/wdl/GatherSampleEvidence.wdl
@@ -24,6 +24,15 @@ workflow GatherSampleEvidence {
     Boolean collect_coverage = true
     Boolean collect_pesr = true
 
+    # Google Cloud Platform (GCP) and Azure users can safely enable and get a slight cost improvement.
+    # Users running on shared file systems should NOT enable this feature.
+    # Enabling this option when running the workflow on GCP or Azure will result in moving the **localized**
+    # files from one path to another on the VM, without impacting the files in their source persistent location.
+    # It will lead to using a slightly smaller disk size and running faster, thereby providing a slight cost improvement.
+    # However, when run on shared file systems (e.g., HPC), it will, by default, create a copy of the
+    # input files, and all subsequent operations will run on the deep copy of the input file.
+    Boolean move_bam_or_cram_files = false
+
     # Convert ambiguous bases (e.g. K, S, Y, etc.) to N
     # Only use if encountering errors (expensive!)
     Boolean revise_base = false
@@ -123,6 +132,7 @@ workflow GatherSampleEvidence {
     input:
       reads_path = bam_or_cram_file,
       reads_index = bam_or_cram_index_,
+      move_files = move_bam_or_cram_files,
       runtime_attr_override = runtime_attr_localize_reads
   }
 
@@ -309,10 +319,11 @@ task LocalizeReads {
   input {
     File reads_path
     File reads_index
+    Boolean move_files = false
     RuntimeAttr? runtime_attr_override
   }
 
-  Float input_size = size(reads_path, "GB")
+  Float input_size = if move_files then size(reads_path, "GB") * 2 else size(reads_path, "GB")
   RuntimeAttr runtime_default = object {
                                   mem_gb: 3.75,
                                   disk_gb: ceil(50.0 + input_size),
@@ -335,8 +346,24 @@ task LocalizeReads {
   Int disk_size = ceil(50 + size(reads_path, "GB"))
 
   command {
-    ln -s ~{reads_path}
-    ln -s ~{reads_index}
+    set -exuo pipefail
+
+    # When this pipeline is run on an HPC, moving files could lead to
+    # moving the files from their original source, compared to moving
+    # them from one directory of the VM to another when run on Cloud.
+    # Therefore, to avoid moving files unexpectedly, we provide both
+    # options for moving and copying, and set the copy as default.
+    # Note that, when copying the files, the task can be slower depending
+    # on the file size and IO performance and will need additional disk
+    # space, hence it will be more expensive to run.
+
+    if ~{move_files}; then
+      mv ~{reads_path} $(basename ~{reads_path})
+      mv ~{reads_index} $(basename ~{reads_index})
+    else
+      cp ~{reads_path} $(basename ~{reads_path})
+      cp ~{reads_index} $(basename ~{reads_index})
+    fi
   }
   output {
     File output_file = basename(reads_path)