From 7a975eae435045bb1a97b3d6b8b7ab6aa788e483 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Wed, 18 Dec 2024 13:30:53 +0100 Subject: [PATCH 01/21] added function to set dedicated account name --- conf/vsc_kul_uhasselt.config | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 4e463460..af0affaf 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -47,6 +47,12 @@ aws { maxErrorRetry = 3 } +// Define a function to call the correct account based on queue +// TODO: FIX THIS FUNCTION +def setClusterOptions(queue, defaultAccount, dedicatedAccount) { + queue.toString.contains('dedicated') ? "--clusters=wice --account=${dedicatedAccount}" : "--clusters=wice --account=${defaultAccount}" +} + // Define profiles for each cluster profiles { genius { @@ -112,7 +118,7 @@ profiles { process { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] - clusterOptions = { "--clusters=wice --account=$tier1_project"} + clusterOptions = { "--clusters=wice --account=$tier1_project" } beforeScript = 'module load cluster/wice' queue = { @@ -121,14 +127,17 @@ profiles { (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - withLabel: '.*gpu.*'{ + // Set clusterOptions based on queue + clusterOptions = setClusterOptions(queue, tier1_project, 'lp_big_wice_cpu') + + withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" } queue = { @@ -140,7 +149,6 @@ profiles { } } - wice_gpu { params.config_profile_description = 'wice_gpu profile for use on the Wice cluster of the VSC HPC.' apptainer.runOptions = '--containall --cleanenv --nv' @@ -152,7 +160,7 @@ profiles { beforeScript = 'module load cluster/wice' clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - "--gres=gpu:${gpus} --clusters=wice --account=$tier1_project" + setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" } queue = { From 9641091920e5a8c0e1103664db19e71d3ded9edf Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Wed, 18 Dec 2024 16:55:27 +0100 Subject: [PATCH 02/21] fixed queue evaluation --- conf/vsc_kul_uhasselt.config | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index af0affaf..9b6828ea 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -50,7 +50,7 @@ aws { // Define a function to call the correct account based on queue // TODO: FIX THIS FUNCTION def setClusterOptions(queue, defaultAccount, dedicatedAccount) { - queue.toString.contains('dedicated') ? "--clusters=wice --account=${dedicatedAccount}" : "--clusters=wice --account=${defaultAccount}" + queue =~ /dedicated/ ? 
"--clusters=wice --account=${dedicatedAccount}" : "--clusters=wice --account=${defaultAccount}" } // Define profiles for each cluster @@ -118,33 +118,32 @@ profiles { process { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] - clusterOptions = { "--clusters=wice --account=$tier1_project" } beforeScript = 'module load cluster/wice' + // Define the queue closure queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - // Set clusterOptions based on queue + // Set clusterOptions based on the evaluated queue clusterOptions = setClusterOptions(queue, tier1_project, 'lp_big_wice_cpu') withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' - clusterOptions = { - // suggested to use 16 cpus per gpu - def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" - } - queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } + clusterOptions = { + // suggested to use 16 cpus per gpu + def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) + setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" + } } } } @@ -158,16 +157,18 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] beforeScript = 'module load cluster/wice' + queue = { + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + } clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) + // Set clusterOptions based on queue setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" } - queue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') - } + } } From 0db8585c083c2c5ebebada7a78ae99925dd9367b Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Wed, 18 Dec 2024 17:25:40 +0100 Subject: [PATCH 03/21] changed logic --- conf/vsc_kul_uhasselt.config | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 9b6828ea..0bcd44ac 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -126,9 +126,8 @@ profiles { (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - + clusterOptions = queue =~ /dedicated/ ? 
"--clusters=wice --account=$tier1_project" : "--clusters=wice --account=$tier1_project" // Set clusterOptions based on the evaluated queue - clusterOptions = setClusterOptions(queue, tier1_project, 'lp_big_wice_cpu') withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] @@ -142,7 +141,7 @@ profiles { clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" + queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" } } } @@ -165,10 +164,8 @@ profiles { clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) // Set clusterOptions based on queue - setClusterOptions(queue, tier1_project, 'lp_big_wice_gpu') + " --gres=gpu:${gpus}" - } - - + queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + } } } From 7947847494b08e0a2d2dfe4281aa8fd2f0ba3cf5 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 14:10:51 +0100 Subject: [PATCH 04/21] fixed regex --- conf/vsc_kul_uhasselt.config | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0bcd44ac..295d3547 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -126,8 +126,8 @@ profiles { (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project" : "--clusters=wice --account=$tier1_project" - // Set clusterOptions based on the evaluated queue + // Set clusterOptions, changing account based on queue + clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] @@ -141,7 +141,8 @@ profiles { clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + // Set clusterOptions, changing account based on queue + queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" } } } @@ -163,9 +164,9 @@ profiles { } clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - // Set clusterOptions based on queue - queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" - } + // Set clusterOptions, changing account based on queue + queue =~ /dedicated/ ? 
"--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + } } } From 0e9fafffc6f04456240f11151d0c4ce14f8520a2 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 14:19:38 +0100 Subject: [PATCH 05/21] added small text for troubleshooting --- conf/vsc_kul_uhasselt.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 295d3547..f6902103 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -127,7 +127,7 @@ profiles { (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } // Set clusterOptions, changing account based on queue - clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" + clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project --nodes=1" withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] From 27b21266c1b166506687f7939156b7d5fa5384c8 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 14:36:32 +0100 Subject: [PATCH 06/21] changed into if else statement --- conf/vsc_kul_uhasselt.config | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index f6902103..23b1c264 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -127,7 +127,11 @@ profiles { (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } // Set clusterOptions, changing account based on queue - clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project --nodes=1" + if queue =~ /dedicated/ { + clusterOptions = "--clusters=wice --account=lp_big_wice_cpu" + } else { + clusterOptions = "--clusters=wice --account=$tier1_project" + } withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] From 9fc5bd71f2029b686bc4dfae7d638d0f18a12c71 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 15:07:30 +0100 Subject: [PATCH 07/21] removed function --- conf/vsc_kul_uhasselt.config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 23b1c264..49ee02c5 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -47,12 +47,6 @@ aws { maxErrorRetry = 3 } -// Define a function to call the correct account based on queue -// TODO: FIX THIS FUNCTION -def setClusterOptions(queue, defaultAccount, dedicatedAccount) { - queue =~ /dedicated/ ? "--clusters=wice --account=${dedicatedAccount}" : "--clusters=wice --account=${defaultAccount}" -} - // Define profiles for each cluster profiles { genius { From 10853ac7ab70c2327f128a1013e7bac0c5a60422 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 15:17:18 +0100 Subject: [PATCH 08/21] changed back to before --- conf/vsc_kul_uhasselt.config | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 49ee02c5..0dae1d51 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -121,11 +121,8 @@ profiles { (task.time >= 72.h ? 
'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } // Set clusterOptions, changing account based on queue - if queue =~ /dedicated/ { - clusterOptions = "--clusters=wice --account=lp_big_wice_cpu" - } else { - clusterOptions = "--clusters=wice --account=$tier1_project" - } + // Set clusterOptions, changing account based on queue + clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] From 69cd2c98d8c6052b0833494f83c43501ae6e259e Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 15:37:02 +0100 Subject: [PATCH 09/21] reverted some options --- conf/vsc_kul_uhasselt.config | 1 - 1 file changed, 1 deletion(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 0dae1d51..f8150899 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -121,7 +121,6 @@ profiles { (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } // Set clusterOptions, changing account based on queue - // Set clusterOptions, changing account based on queue clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" withLabel: '.*gpu.*' { From 9f8caedc495e40695b08af4a06d06fc44f03528f Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Thu, 19 Dec 2024 16:51:29 +0100 Subject: [PATCH 10/21] added closure --- conf/vsc_kul_uhasselt.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index f8150899..a5a121a0 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -121,7 +121,7 @@ profiles { (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } // Set clusterOptions, changing account based on queue - clusterOptions = queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" + clusterOptions = { queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" } withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] From f0f0d4dd9ad44914ded790e5733f5117fd160a17 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Mon, 6 Jan 2025 13:22:33 +0100 Subject: [PATCH 11/21] added changes to dynamically set --account based on dedicated vs normal queue types --- conf/vsc_kul_uhasselt.config | 41 ++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index a5a121a0..50b91a24 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -113,15 +113,23 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] beforeScript = 'module load cluster/wice' - - // Define the queue closure + + // Set queue queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - // Set clusterOptions, changing account based on queue - clusterOptions = { queue =~ /dedicated/ ? 
"--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" } + + // Set clusterOptions, changing account based on queue + clusterOptions = { + def queueValue = { + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : + (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') + } + queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" + } withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] @@ -135,8 +143,17 @@ profiles { clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) + // Do same queue evaluation as above + def queueValue = { + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + } + // Set clusterOptions, changing account based on queue - queue =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : + queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : + "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" } } } @@ -151,15 +168,25 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] beforeScript = 'module load cluster/wice' - queue = { + queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } clusterOptions = { + // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) + // Do same queue evaluation as above + def queueValue = { + task.memory >= 239.GB ? + (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + } + // Set clusterOptions, changing account based on queue - queue =~ /dedicated/ ? "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" : "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : + queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : + "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" } } } From 729c0a191a34a86f5be222286b71efc9636411be Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Mon, 6 Jan 2025 13:37:05 +0100 Subject: [PATCH 12/21] fixed trailing whitespace --- conf/vsc_kul_uhasselt.config | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 50b91a24..ab558092 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -83,7 +83,6 @@ profiles { } } - genius_gpu { params.config_profile_description = 'genius_gpu profile for use on the genius cluster of the VSC HPC.' 
apptainer.runOptions = '--containall --cleanenv --nv' @@ -113,14 +112,14 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] beforeScript = 'module load cluster/wice' - + // Set queue queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - + // Set clusterOptions, changing account based on queue clusterOptions = { def queueValue = { @@ -128,7 +127,7 @@ profiles { (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } - queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" + queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" } withLabel: '.*gpu.*' { @@ -149,7 +148,7 @@ profiles { (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } - + // Set clusterOptions, changing account based on queue queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : @@ -182,7 +181,7 @@ profiles { (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } - + // Set clusterOptions, changing account based on queue queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : From 51c97786d5af284bda7329531acc7359f64ca4e4 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Mon, 6 Jan 2025 13:38:30 +0100 Subject: [PATCH 13/21] fixed linting --- conf/vsc_kul_uhasselt.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index ab558092..ac5419c4 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -121,7 +121,7 @@ profiles { } // Set clusterOptions, changing account based on queue - clusterOptions = { + clusterOptions = { def queueValue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : @@ -167,7 +167,7 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] beforeScript = 'module load cluster/wice' - queue = { + queue = { task.memory >= 239.GB ? (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') From c1b3583681a407d3bfeb8b726b233bf765a9e908 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Tue, 7 Jan 2025 15:02:50 +0100 Subject: [PATCH 14/21] added queue types in docs --- docs/vsc_kul_uhasselt.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/vsc_kul_uhasselt.md b/docs/vsc_kul_uhasselt.md index 2f31304c..7e1af924 100644 --- a/docs/vsc_kul_uhasselt.md +++ b/docs/vsc_kul_uhasselt.md @@ -14,9 +14,17 @@ A nextflow module is available that can be loaded `module load Nextflow` but it 2. 
Set up the environment variables in `~/.bashrc` or `~/.bash_profile`: +:::note +If you have access to dedicated nodes, you can export these as a command separated list. These queues will only be used if specified task requirements are not available in the normal partitions but they are available in dedicated partitions. +::: + ```bash export SLURM_ACCOUNT="" +# Comma-separated list of available dedicated partitions (if any) +# For example: export VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu" +export VSC_DEDICATED_QUEUES="" + # Needed for running Nextflow jobs export NXF_HOME="$VSC_SCRATCH/.nextflow" export NXF_WORK="$VSC_SCRATCH/work" From 2c391ba9fc5f354b6b6747d1985560380b3ef623 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Tue, 7 Jan 2025 15:03:12 +0100 Subject: [PATCH 15/21] refactored code for dedicated queues and added max tasklimit if dedicated queue is not available --- conf/vsc_kul_uhasselt.config | 82 +++++++++++++++++++++++++++--------- 1 file changed, 61 insertions(+), 21 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index ac5419c4..a3e92c4b 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -2,6 +2,8 @@ // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" tier1_project = System.getenv("SLURM_ACCOUNT") ?: null +avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: null +def availQueues = avail_queues?.toString()?.split(',') // Perform work directory cleanup when the run has succesfully completed // cleanup = true @@ -47,6 +49,11 @@ aws { maxErrorRetry = 3 } +// Function to limit task time when dedicated queues are not available +def limitTaskTime(time, maxTime) { + return time > maxTime ? maxTime : time +} + // Define profiles for each cluster profiles { genius { @@ -68,6 +75,8 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' + + // Set clusteroptions clusterOptions = { // suggested to use 9 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) @@ -104,7 +113,7 @@ profiles { } } } - + wice { params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' @@ -112,19 +121,26 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] beforeScript = 'module load cluster/wice' - + // Set queue + // The task time is limites to 72 hours if the memory is larger than 239GB + // and dedicated queues are not available queue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : - (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') + def maxTime = 72.h + if (task.memory >= 239.GB) { + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ? + limitTaskTime(task.time, maxTime) : task.time + return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem' + } else { + return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake' + } } - + // Set clusterOptions, changing account based on queue clusterOptions = { def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 
'dedicated_big_bigmem' : 'bigmem,hugemem') : + task.memory >= 239.GB ? + (task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" @@ -134,19 +150,31 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' + + // Set queue + // The task time is limites to 72 hours if the memory is larger than 239GB + // and dedicated queues are not available queue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + def maxTime = 72.h + if (task.memory >= 239.GB) { + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? + limitTaskTime(task.time, maxTime) : task.time + return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' + } else { + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? + limitTaskTime(task.time, maxTime) : task.time + return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' + } } + clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) // Do same queue evaluation as above def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + task.memory >= 239.GB ? + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } // Set clusterOptions, changing account based on queue @@ -167,19 +195,31 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] beforeScript = 'module load cluster/wice' + // Set queue + // The task time is limites to 72 hours if the memory is larger than 239GB + // and dedicated queues are not available queue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h ? 'dedicated_big_gpu' : 'gpu_a100,gpu') + def maxTime = 72.h + if (task.memory >= 239.GB) { + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? + limitTaskTime(task.time, maxTime) : task.time + return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' + } else { + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? + limitTaskTime(task.time, maxTime) : task.time + return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' + } } + + // Set clusteroptions clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - // Do same queue evaluation as above + // Do same queue evaluation as above, without adjusting task.time def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h ? 
'dedicated_big_gpu' : 'gpu_a100,gpu') + task.memory >= 239.GB ? + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } // Set clusterOptions, changing account based on queue From 4cbf96636faa094acac4c48e664e47efd69a13e0 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Tue, 7 Jan 2025 15:05:53 +0100 Subject: [PATCH 16/21] linting issues --- conf/vsc_kul_uhasselt.config | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index a3e92c4b..9600f5be 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -75,7 +75,7 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' - + // Set clusteroptions clusterOptions = { // suggested to use 9 cpus per gpu @@ -113,7 +113,7 @@ profiles { } } } - + wice { params.config_profile_description = 'wice profile for use on the Wice cluster of the VSC HPC.' @@ -121,26 +121,26 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] beforeScript = 'module load cluster/wice' - + // Set queue // The task time is limites to 72 hours if the memory is larger than 239GB // and dedicated queues are not available queue = { def maxTime = 72.h if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ? + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ? limitTaskTime(task.time, maxTime) : task.time return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem' } else { return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake' } } - + // Set clusterOptions, changing account based on queue clusterOptions = { def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : + task.memory >= 239.GB ? + (task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') } queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" @@ -157,11 +157,11 @@ profiles { queue = { def maxTime = 72.h if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? limitTaskTime(task.time, maxTime) : task.time return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' } else { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? limitTaskTime(task.time, maxTime) : task.time return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' } @@ -173,7 +173,7 @@ profiles { // Do same queue evaluation as above def queueValue = { task.memory >= 239.GB ? - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 
'dedicated_big_gpu_h100' : 'gpu_h100') : + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } @@ -201,11 +201,11 @@ profiles { queue = { def maxTime = 72.h if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? limitTaskTime(task.time, maxTime) : task.time return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' } else { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? + task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? limitTaskTime(task.time, maxTime) : task.time return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' } @@ -217,8 +217,8 @@ profiles { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) // Do same queue evaluation as above, without adjusting task.time def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : + task.memory >= 239.GB ? + (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } From b26fc66ae955cb0908495764dfe9d7af091ea94c Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Tue, 7 Jan 2025 15:07:39 +0100 Subject: [PATCH 17/21] more linting issues --- conf/vsc_kul_uhasselt.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 9600f5be..2be08de4 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -150,7 +150,7 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' - + // Set queue // The task time is limites to 72 hours if the memory is larger than 239GB // and dedicated queues are not available @@ -166,13 +166,13 @@ profiles { return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' } } - + clusterOptions = { // suggested to use 16 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) // Do same queue evaluation as above def queueValue = { - task.memory >= 239.GB ? + task.memory >= 239.GB ? (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') } @@ -210,7 +210,7 @@ profiles { return availQueues.contains('dedicated_big_gpu') ? 
'dedicated_big_gpu' : 'gpu_a100,gpu' } } - + // Set clusteroptions clusterOptions = { // suggested to use 16 cpus per gpu From b20292d0648f8842e0eaad4ef1adc8b074ab6185 Mon Sep 17 00:00:00 2001 From: ljwharbers Date: Tue, 7 Jan 2025 15:20:44 +0100 Subject: [PATCH 18/21] fixed obtaining queue environment variable --- conf/vsc_kul_uhasselt.config | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 2be08de4..c7a2c5f1 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,9 +1,9 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" -tier1_project = System.getenv("SLURM_ACCOUNT") ?: null -avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: null -def availQueues = avail_queues?.toString()?.split(',') +scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" +tier1_project = System.getenv("SLURM_ACCOUNT") ?: null +def avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: "" +def availQueues = avail_queues.toString().split(',') // Perform work directory cleanup when the run has succesfully completed // cleanup = true From 6d07e08f7b6b67755b87097439b175ab014bd07e Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Thu, 9 Jan 2025 12:29:29 +0000 Subject: [PATCH 19/21] export logic outside of profile scope into custom functions --- conf/vsc_kul_uhasselt.config | 275 ++++++++++++++++++++--------------- 1 file changed, 158 insertions(+), 117 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index c7a2c5f1..5c841cc5 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -1,9 +1,9 @@ // Default to /tmp directory if $VSC_SCRATCH scratch env is not available, // see: https://github.com/nf-core/configs?tab=readme-ov-file#adding-a-new-config -scratch_dir = System.getenv("VSC_SCRATCH") ?: "/tmp" -tier1_project = System.getenv("SLURM_ACCOUNT") ?: null -def avail_queues = System.getenv("VSC_DEDICATED_QUEUES") ?: "" -def availQueues = avail_queues.toString().split(',') +def SCRATCH_DIR = System.getenv("VSC_SCRATCH") ?: "/tmp" +def TIER2_PROJECT = System.getenv("SLURM_ACCOUNT") ?: null +def DEDICATED_QUEUES = System.getenv("VSC_DEDICATED_QUEUES") ?: "" +def AVAILABLE_QUEUES = DEDICATED_QUEUES.toString().split(',') // Perform work directory cleanup when the run has succesfully completed // cleanup = true @@ -30,7 +30,7 @@ process { singularity { enabled = true autoMounts = true - cacheDir = "$scratch_dir/.singularity" + cacheDir = "$SCRATCH_DIR/.singularity" pullTimeout = "30 min" } @@ -40,8 +40,8 @@ params { } env { - APPTAINER_TMPDIR="$scratch_dir/.apptainer/tmp" - APPTAINER_CACHEDIR="$scratch_dir/.apptainer/cache" + APPTAINER_TMPDIR="$SCRATCH_DIR/.apptainer/tmp" + APPTAINER_CACHEDIR="$SCRATCH_DIR/.apptainer/cache" } // AWS maximum retries for errors (This way the pipeline doesn't fail if the download fails one time) @@ -49,11 +49,132 @@ aws { maxErrorRetry = 3 } -// Function to limit task time when dedicated queues are not available +/* + * Queue Selection Utility Functions for HPC Environments + * ================================================== + * This module provides functions to determine appropriate HPC queues based on task requirements + * for both GENIUS and WICE clusters. 
+ */ + +/* + * Constants: + * ---------- + * TIME_THRESHOLD: 72 hours - Threshold for determining long-running jobs + * MEMORY_THRESHOLD (GENIUS): 175GB - Memory threshold for bigmem queues + * MEMORY_THRESHOLD (WICE): 239GB - Memory threshold for high-memory queues +*/ +def TIME_THRESHOLD = 72.h +def MEMORY_THRESHOLD_GENIUS = 175.GB +def MEMORY_THRESHOLD_WICE = 239.GB + +/* + * --------- + * Functions: + * ---------- + * These functions are designed to select the appropriate HPC queues of + * VSC_KUL_UHASSELT based on task requirements. They handle both standard + * and GPU queues, considering memory requirements, execution time, and + * queue availability. +*/ + +/* + * limitTaskTime(time, maxTime) + * Ensures task time doesn't exceed the maximum allowed time + * @param time Current task time + * @param maxTime Maximum allowed time + * @return Limited task time +*/ def limitTaskTime(time, maxTime) { return time > maxTime ? maxTime : time } +/* + * determineGeniusQueue(task) + * Selects appropriate CPU queue for GENIUS cluster + * @param task Nextflow task object containing memory and time requirements + * @return Queue name based on task requirements +*/ +def determineGeniusQueue = { task -> + if (task.memory >= MEMORY_THRESHOLD_GENIUS) { + if (task.time >= TIME_THRESHOLD) { + return AVAILABLE_QUEUES.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem_long' + } + return 'bigmem' + } + return task.time >= TIME_THRESHOLD ? 'batch_long' : 'batch' +} + +/* + * determineGeniusGpuQueue(task) + * Selects appropriate GPU queue for GENIUS cluster + * @param task Nextflow task object containing memory and time requirements + * @return GPU queue name based on task requirements +*/ +def determineGeniusGpuQueue = { task -> + if (task.memory >= MEMORY_THRESHOLD_GENIUS) { + return task.time >= TIME_THRESHOLD ? 'gpu_v100_long' : 'gpu_v100' + } + if (task.time >= TIME_THRESHOLD) { + return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' : 'gpu_p100_long,amd_long' + } + return 'gpu_p100,amd' +} + +/* + * determineWiceQueue(task) + * Selects appropriate CPU queue for WICE cluster + * @param task Nextflow task object containing memory and time requirements + * @return Queue name based on task requirements and availability +*/ +def determineWiceQueue = { task -> + if (task.memory >= MEMORY_THRESHOLD_WICE) { + if (AVAILABLE_QUEUES.contains('dedicated_big_bigmem')) { + return 'dedicated_big_bigmem' + } else { + task.time = limitTaskTime(task.time, TIME_THRESHOLD) + return 'bigmem,hugemem' + } + } + + return task.time >= TIME_THRESHOLD ? + 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : + 'batch,batch_sapphirerapids,batch_icelake' +} + +/* + * determineWiceGpuQueue(task) + * Selects appropriate GPU queue for WICE cluster + * @param task Nextflow task object containing memory and time requirements + * @return GPU queue name based on task requirements +*/ +def determineWiceGpuQueue = { task -> + def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE + def isDedicatedQueue = isHighMemory ? + AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') : + AVAILABLE_QUEUES.contains('dedicated_big_gpu') + + if (task.time >= TIME_THRESHOLD && !isDedicatedQueue) { + task.time = limitTaskTime(task.time, TIME_THRESHOLD) + } + + if (isHighMemory) { + return isDedicatedQueue ? 'dedicated_big_gpu_h100' : 'gpu_h100' + } else { + return isDedicatedQueue ? 
'dedicated_big_gpu' : 'gpu_a100,gpu' + } +} + +/* + * ======== + * Profiles + * ======== + * These profiles define the resource limits, queue selection, and cluster options + * for WICE and GENIUS clusters. They also include GPU-specific configurations. + * Details of the resource limits can be found in for genius at + * https://docs.vscentrum.be/leuven/tier2_hardware/genius_hardware.html + * and for wice at https://docs.vscentrum.be/leuven/tier2_hardware/wice_hardware.html +*/ + // Define profiles for each cluster profiles { genius { @@ -62,31 +183,22 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] - beforeScript = 'module load cluster/genius' - clusterOptions = { "--clusters=genius --account=$tier1_project" } - - queue = { - task.memory >= 175.GB ? - (task.time >= 72.h ? 'dedicated_big_bigmem,dedicated_big_batch,bigmem_long' : 'bigmem') : - (task.time >= 72.h ? 'batch_long' : 'batch') + beforeScript = 'module load cluster/genius' + queue = { determineGeniusQueue(task) } + clusterOptions = { + determineGeniusQueue(task) =~ /dedicated/ ? + "--clusters=genius --account=lp_big_genius_cpu" : + "--clusters=genius --account=$TIER2_PROJECT" } withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' - - // Set clusteroptions + queue = { determineGeniusGpuQueue(task) } clusterOptions = { - // suggested to use 9 cpus per gpu def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) - "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" - } - - queue = { - task.memory >= 175.GB ? - (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : - (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') + "--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT" } } } @@ -101,15 +213,10 @@ profiles { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] beforeScript = 'module load cluster/genius' + queue = { determineGeniusGpuQueue(task) } clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) - "--gres=gpu:${gpus} --clusters=genius --account=$tier1_project" - } - - queue = { - task.memory >= 175.GB ? - (task.time >= 72.h ? 'gpu_v100_long' : 'gpu_v100') : - (task.time >= 72.h ? 'gpu_p100_long,amd_long' : 'gpu_p100,amd') + "--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT" } } } @@ -121,66 +228,24 @@ profiles { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] beforeScript = 'module load cluster/wice' - - // Set queue - // The task time is limites to 72 hours if the memory is larger than 239GB - // and dedicated queues are not available - queue = { - def maxTime = 72.h - if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_bigmem') ? - limitTaskTime(task.time, maxTime) : task.time - return availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem' - } else { - return task.time >= maxTime ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake' - } - } - - // Set clusterOptions, changing account based on queue + queue = { determineWiceQueue(task) } clusterOptions = { - def queueValue = { - task.memory >= 239.GB ? 
- (task.time >= 72.h && availQueues.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem,hugemem') : - (task.time >= 72.h ? 'batch_long,batch_icelake_long,batch_sapphirerapids_long' : 'batch,batch_sapphirerapids,batch_icelake') - } - queueValue() =~ /dedicated/ ? "--clusters=wice --account=lp_big_wice_cpu" : "--clusters=wice --account=$tier1_project" + determineWiceQueue(task) =~ /dedicated/ ? + "--clusters=wice --account=lp_big_wice_cpu" : + "--clusters=wice --account=$TIER2_PROJECT" } withLabel: '.*gpu.*' { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' - - // Set queue - // The task time is limites to 72 hours if the memory is larger than 239GB - // and dedicated queues are not available - queue = { - def maxTime = 72.h - if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? - limitTaskTime(task.time, maxTime) : task.time - return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' - } else { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? - limitTaskTime(task.time, maxTime) : task.time - return availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu' - } - } - - clusterOptions = { - // suggested to use 16 cpus per gpu + queue = { determineWiceGpuQueue(task) } + clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - // Do same queue evaluation as above - def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') - } - - // Set clusterOptions, changing account based on queue - queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : - queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : - "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + def queueValue = determineWiceGpuQueue(task) + queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : + queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : + "--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}" } } } @@ -193,39 +258,15 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - beforeScript = 'module load cluster/wice' - // Set queue - // The task time is limites to 72 hours if the memory is larger than 239GB - // and dedicated queues are not available - queue = { - def maxTime = 72.h - if (task.memory >= 239.GB) { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu_h100') ? - limitTaskTime(task.time, maxTime) : task.time - return availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100' - } else { - task.time = task.time >= maxTime && !availQueues.contains('dedicated_big_gpu') ? - limitTaskTime(task.time, maxTime) : task.time - return availQueues.contains('dedicated_big_gpu') ? 
'dedicated_big_gpu' : 'gpu_a100,gpu' - } - } - - // Set clusteroptions - clusterOptions = { - // suggested to use 16 cpus per gpu + beforeScript = 'module load cluster/wice' + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + queue = { determineWiceGpuQueue(task) } + clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) - // Do same queue evaluation as above, without adjusting task.time - def queueValue = { - task.memory >= 239.GB ? - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu_h100') ? 'dedicated_big_gpu_h100' : 'gpu_h100') : - (task.time >= 72.h && availQueues.contains('dedicated_big_gpu') ? 'dedicated_big_gpu' : 'gpu_a100,gpu') - } - - // Set clusterOptions, changing account based on queue - queueValue() =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : - queueValue() =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : - "--clusters=wice --account=$tier1_project --gres=gpu:${gpus}" + def queueValue = determineWiceGpuQueue(task) + queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : + queueValue =~ /dedicated_big_gpu/ ? "--clusters=wice --account=lp_big_wice_gpu --gres=gpu:${gpus}" : + "--clusters=wice --account=$TIER2_PROJECT --gres=gpu:${gpus}" } } } @@ -234,7 +275,7 @@ profiles { params.config_profile_description = 'superdome profile for use on the genius cluster of the VSC HPC.' process { - clusterOptions = {"--clusters=genius --account=$tier1_project"} + clusterOptions = {"--clusters=genius --account=$TIER2_PROJECT"} beforeScript = 'module load cluster/genius/superdome' // 6000 - 228 so 228GB for overhead, max is 5910888MB resourceLimits = [ memory: 5772.GB, cpus: 14, time: 168.h] From 1240bbb03a7a405e1bd6d44ae69f842c862e299e Mon Sep 17 00:00:00 2001 From: Joon-Klaps Date: Thu, 9 Jan 2025 14:41:00 +0000 Subject: [PATCH 20/21] include beforescript logic to accomodate new module load logic --- conf/vsc_kul_uhasselt.config | 23 +++++++++++++---------- docs/vsc_kul_uhasselt.md | 2 +- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config index 5c841cc5..91d629f3 100644 --- a/conf/vsc_kul_uhasselt.config +++ b/conf/vsc_kul_uhasselt.config @@ -115,9 +115,10 @@ def determineGeniusGpuQueue = { task -> return task.time >= TIME_THRESHOLD ? 'gpu_v100_long' : 'gpu_v100' } if (task.time >= TIME_THRESHOLD) { - return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' : 'gpu_p100_long,amd_long' + return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' : + AVAILABLE_QUEUES.contains('amd') ? 'amd_long' : 'gpu_p100_long' } - return 'gpu_p100,amd' + return AVAILABLE_QUEUES.contains('amd') ? 'amd' : 'gpu_p100' } /* @@ -183,7 +184,7 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h ] - beforeScript = 'module load cluster/genius' + beforeScript = { 'module load cluster/genius/' + determineGeniusQueue(task).toString().split(',')[0] } queue = { determineGeniusQueue(task) } clusterOptions = { determineGeniusQueue(task) =~ /dedicated/ ? 
@@ -193,6 +194,7 @@ profiles { withLabel: '.*gpu.*'{ resourceLimits = [ memory: 703.GB, cpus: 36 , time: 168.h ] + beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] } apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' queue = { determineGeniusGpuQueue(task) } @@ -212,8 +214,8 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB resourceLimits = [ memory: 703.GB, cpus: 36, time: 168.h] - beforeScript = 'module load cluster/genius' - queue = { determineGeniusGpuQueue(task) } + beforeScript = { 'module load cluster/genius/' + determineGeniusGpuQueue(task).toString().split(',')[0] } + queue = { determineGeniusGpuQueue(task) } clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/9) as int) "--gres=gpu:${gpus} --clusters=genius --account=$TIER2_PROJECT" @@ -227,7 +229,7 @@ profiles { process { // max is 2016000 resourceLimits = [ memory: 1968.GB, cpus: 72, time: 168.h ] - beforeScript = 'module load cluster/wice' + beforeScript = { 'module load cluster/wice/' + determineWiceQueue(task).toString().split(',')[0] } queue = { determineWiceQueue(task) } clusterOptions = { determineWiceQueue(task) =~ /dedicated/ ? @@ -239,6 +241,7 @@ profiles { resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] apptainer.runOptions = '--containall --cleanenv --nv' singularity.runOptions = '--containall --cleanenv --nv' + beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] } queue = { determineWiceGpuQueue(task) } clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) @@ -258,10 +261,10 @@ profiles { process { // 768 - 65 so 65GB for overhead, max is 720000MB - beforeScript = 'module load cluster/wice' - resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] - queue = { determineWiceGpuQueue(task) } - clusterOptions = { + beforeScript = { 'module load cluster/wice/' + determineWiceGpuQueue(task).toString().split(',')[0] } + resourceLimits = [ memory: 703.GB, cpus: 64, time: 168.h ] + queue = { determineWiceGpuQueue(task) } + clusterOptions = { def gpus = task.accelerator?.request ?: Math.max(1, Math.floor((task.cpus ?:1)/16) as int) def queueValue = determineWiceGpuQueue(task) queueValue =~ /dedicated_big_gpu_h100/ ? "--clusters=wice --account=lp_big_wice_gpu_h100 --gres=gpu:${gpus}" : diff --git a/docs/vsc_kul_uhasselt.md b/docs/vsc_kul_uhasselt.md index 7e1af924..fd9c676d 100644 --- a/docs/vsc_kul_uhasselt.md +++ b/docs/vsc_kul_uhasselt.md @@ -15,7 +15,7 @@ A nextflow module is available that can be loaded `module load Nextflow` but it 2. Set up the environment variables in `~/.bashrc` or `~/.bash_profile`: :::note -If you have access to dedicated nodes, you can export these as a command separated list. These queues will only be used if specified task requirements are not available in the normal partitions but they are available in dedicated partitions. +If you have access to dedicated nodes, you can export these as a command separated list. These queues will only be used if specified task requirements are not available in the normal partitions but they are available in dedicated partitions. AMD is considered a dedicated partition. 
 :::
 ```bash
 export SLURM_ACCOUNT=""
 # Comma-separated list of available dedicated partitions (if any)
 # For example: export VSC_DEDICATED_QUEUES="dedicated_big_bigmem,dedicated_big_gpu"
 export VSC_DEDICATED_QUEUES=""

From 0208521a35391808e08d66206cbeba9e765e8b91 Mon Sep 17 00:00:00 2001
From: Joon-Klaps
Date: Fri, 10 Jan 2025 09:42:06 +0000
Subject: [PATCH 21/21] use only dedicated when in need of it + refactor for
 readability

---
 conf/vsc_kul_uhasselt.config | 62 +++++++++++++++++++++++-------------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/conf/vsc_kul_uhasselt.config b/conf/vsc_kul_uhasselt.config
index 91d629f3..c20d450f 100644
--- a/conf/vsc_kul_uhasselt.config
+++ b/conf/vsc_kul_uhasselt.config
@@ -95,13 +95,17 @@ def limitTaskTime(time, maxTime) {
  * @return Queue name based on task requirements
 */
 def determineGeniusQueue = { task ->
-    if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
-        if (task.time >= TIME_THRESHOLD) {
-            return AVAILABLE_QUEUES.contains('dedicated_big_bigmem') ? 'dedicated_big_bigmem' : 'bigmem_long'
-        }
-        return 'bigmem'
+    def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
+    def isLongRunning = task.time >= TIME_THRESHOLD
+    def hasDedicatedBigmem = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')
+
+    if (isHighMemory) {
+        return isLongRunning ?
+            (hasDedicatedBigmem ? 'dedicated_big_bigmem' : 'bigmem_long') :
+            'bigmem'
     }
-    return task.time >= TIME_THRESHOLD ? 'batch_long' : 'batch'
+
+    return isLongRunning ? 'batch_long' : 'batch'
 }
 
 /*
@@ -111,14 +115,22 @@ def determineGeniusGpuQueue = { task ->
  * @return GPU queue name based on task requirements
 */
 def determineGeniusGpuQueue = { task ->
-    if (task.memory >= MEMORY_THRESHOLD_GENIUS) {
-        return task.time >= TIME_THRESHOLD ? 'gpu_v100_long' : 'gpu_v100'
+    def isHighMemory = task.memory >= MEMORY_THRESHOLD_GENIUS
+    def isLongRunning = task.time >= TIME_THRESHOLD
+    def hasDedicatedGpu = AVAILABLE_QUEUES.contains('dedicated_rega_gpu')
+    def hasAmdGpu = AVAILABLE_QUEUES.contains('amd')
+
+    if (isHighMemory) {
+        return isLongRunning ? 'gpu_v100_long' : 'gpu_v100'
     }
-    if (task.time >= TIME_THRESHOLD) {
-        return AVAILABLE_QUEUES.contains('dedicated_rega_gpu') ? 'dedicated_rega_gpu' :
-            AVAILABLE_QUEUES.contains('amd') ? 'amd_long' : 'gpu_p100_long'
+
+    if (isLongRunning) {
+        if (hasDedicatedGpu) return 'dedicated_rega_gpu'
+        if (hasAmdGpu) return 'amd_long'
+        return 'gpu_p100_long'
     }
-    return AVAILABLE_QUEUES.contains('amd') ? 'amd' : 'gpu_p100'
+
+    return hasAmdGpu ? 'amd' : 'gpu_p100'
 }
 
 /*
@@ -128,16 +140,19 @@ def determineWiceQueue = { task ->
  * @return Queue name based on task requirements and availability
 */
 def determineWiceQueue = { task ->
-    if (task.memory >= MEMORY_THRESHOLD_WICE) {
-        if (AVAILABLE_QUEUES.contains('dedicated_big_bigmem')) {
+    def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
+    def isLongRunning = task.time >= TIME_THRESHOLD
+    def hasDedicatedQueue = AVAILABLE_QUEUES.contains('dedicated_big_bigmem')
+
+    if (isHighMemory) {
+        if (isLongRunning && hasDedicatedQueue) {
             return 'dedicated_big_bigmem'
-        } else {
-            task.time = limitTaskTime(task.time, TIME_THRESHOLD)
-            return 'bigmem,hugemem'
         }
+        task.time = limitTaskTime(task.time, TIME_THRESHOLD)
+        return 'bigmem,hugemem'
     }
 
-    return task.time >= TIME_THRESHOLD ?
+    return isLongRunning ?
         'batch_long,batch_icelake_long,batch_sapphirerapids_long' :
         'batch,batch_sapphirerapids,batch_icelake'
 }
@@ -150,19 +165,20 @@ def determineWiceGpuQueue = { task ->
 */
 def determineWiceGpuQueue = { task ->
     def isHighMemory = task.memory >= MEMORY_THRESHOLD_WICE
-    def isDedicatedQueue = isHighMemory ?
+    def isLongRunning = task.time >= TIME_THRESHOLD
+    def hasDedicatedQueue = isHighMemory ?
         AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
         AVAILABLE_QUEUES.contains('dedicated_big_gpu')
 
-    if (task.time >= TIME_THRESHOLD && !isDedicatedQueue) {
+    if (isLongRunning && !hasDedicatedQueue) {
         task.time = limitTaskTime(task.time, TIME_THRESHOLD)
     }
 
     if (isHighMemory) {
-        return isDedicatedQueue ? 'dedicated_big_gpu_h100' : 'gpu_h100'
-    } else {
-        return isDedicatedQueue ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
+        return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu_h100' : 'gpu_h100'
     }
+
+    return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
 }
 
 /*
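For readers who want to sanity-check the final queue-selection behaviour outside of Nextflow, here is a minimal, hypothetical Groovy sketch that mirrors the shape of `determineWiceGpuQueue` above. The `FakeTask` class, the plain integer GB/hour units, and the `pickWiceGpuQueue` name are stand-ins invented for illustration only; the real closures receive Nextflow task objects with `MemoryUnit` and `Duration` values.

```groovy
// Illustrative stand-ins only: plain integers replace Nextflow's MemoryUnit/Duration.
def AVAILABLE_QUEUES = (System.getenv('VSC_DEDICATED_QUEUES') ?: '').tokenize(',')

int TIME_THRESHOLD_H         = 72
int MEMORY_THRESHOLD_WICE_GB = 239

class FakeTask {
    int memoryGb   // requested memory in GB
    int timeH      // requested walltime in hours
}

// Mirrors determineWiceGpuQueue: use a dedicated partition only when it is
// listed in VSC_DEDICATED_QUEUES and the job is long-running; otherwise cap
// the walltime (like limitTaskTime) and fall back to the public partitions.
def pickWiceGpuQueue = { FakeTask task ->
    def isHighMemory      = task.memoryGb >= MEMORY_THRESHOLD_WICE_GB
    def isLongRunning     = task.timeH >= TIME_THRESHOLD_H
    def hasDedicatedQueue = isHighMemory ?
        AVAILABLE_QUEUES.contains('dedicated_big_gpu_h100') :
        AVAILABLE_QUEUES.contains('dedicated_big_gpu')

    if (isLongRunning && !hasDedicatedQueue) {
        task.timeH = TIME_THRESHOLD_H
    }
    if (isHighMemory) {
        return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu_h100' : 'gpu_h100'
    }
    return (isLongRunning && hasDedicatedQueue) ? 'dedicated_big_gpu' : 'gpu_a100,gpu'
}

// Example: a 300 GB, 100 h task without dedicated access is sent to gpu_h100
// with its walltime capped at 72 h.
def demo = new FakeTask(memoryGb: 300, timeH: 100)
println "${pickWiceGpuQueue(demo)} (walltime: ${demo.timeH} h)"
```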