simplify set-up on 1 node
adamovanja committed Aug 14, 2024
1 parent 52bc885 commit 65ff8ee
Showing 4 changed files with 127 additions and 94 deletions.
6 changes: 4 additions & 2 deletions README.md
@@ -53,11 +53,13 @@ python q2_ritme/eval_best_trial_overall.py --model_path "experiments/models"
````

### Training with slurm on HPC
To train a model with slurm, edit the file `launch_slurm_cpu.sh` and then run
To train a model with slurm on 1 node, edit the file `launch_slurm_cpu.sh` and then run
````
sbatch launch_slurm_cpu.sh
````
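In case the defaults need adjusting, these are the settings you would typically edit in `launch_slurm_cpu.sh` before submitting; the values below are copied from the version of the script in this commit and are a starting point for your own cluster rather than recommendations:
````
#SBATCH -A partition_name            # account / partition to charge
#SBATCH --nodes=1                    # single-node set-up
#SBATCH --cpus-per-task=100          # CPUs handed to Ray on this node
#SBATCH --time=119:59:59             # wall-clock limit
#SBATCH --mem-per-cpu=4096           # memory per CPU in MB

CONFIG="q2_ritme/run_config.json"    # run configuration used for training
````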
If you (or your collaborators) plan to launch multiple jobs on the same infrastructure, you should set the variable `JOB_NB` in `launch_slurm_cpu.sh` accordingly. This variable ensures that the assigned ports don't overlap (see [here](https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats)). Currently, we allow for 3 parallel ray slurm jobs to be executed.

To train a model with slurm on multiple nodes, or to run multiple ray instances on the same HPC, you can use: `sbatch launch_slurm_cpu_multi_node.sh`. If you (or your collaborators) plan to launch multiple jobs on the same infrastructure, you should set the variable `JOB_NB` in `launch_slurm_cpu_multi_node.sh` accordingly. This variable ensures that the assigned ports don't overlap (see [here](https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html#slurm-networking-caveats)). Currently, the script allows for 3 parallel ray slurm jobs to be executed; a sketch of the resulting port assignments is shown below.
**Note:** training a model with slurm on multiple nodes can be very specific to your infrastructure, so you might need to adjust this bash script to your set-up.
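As a rough illustration of what `JOB_NB` does: the launch script derives all Ray ports from it so that up to three jobs can run side by side without port clashes. The snippet below mirrors the port arithmetic further down in this commit; the echoed values are only for orientation:
````
JOB_NB=2                                    # allowed values: 1, 2, 3
port=$((6378 + JOB_NB))                     # Ray head port, here 6380
dashboard_port=$((8265 + JOB_NB))           # Ray dashboard port, here 8267
min_worker_port=$((2 + JOB_NB * 10000))     # start of worker port range, here 20002
max_worker_port=$((9999 + JOB_NB * 10000))  # end of worker port range, here 29999
echo "head=$port dashboard=$dashboard_port workers=$min_worker_port-$max_worker_port"
````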

#### Some common slurm errors:
If you are using SLURM and ...
83 changes: 2 additions & 81 deletions launch_slurm_cpu.sh
@@ -3,8 +3,8 @@
#SBATCH --job-name="run_config"
#SBATCH -A partition_name
#SBATCH --nodes=1
#SBATCH --cpus-per-task=20
#SBATCH --time=23:59:59
#SBATCH --cpus-per-task=100
#SBATCH --time=119:59:59
#SBATCH --mem-per-cpu=4096
#SBATCH --output="%x_out.txt"
#SBATCH --open-mode=append
@@ -17,91 +17,12 @@ echo "SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"
# ! USER SETTINGS HERE
# -> config file to use
CONFIG="q2_ritme/run_config.json"
# -> count of this concurrent job launched on same infrastructure
# -> only these values are allowed: 1, 2, 3 - since below ports are
# -> otherwise taken or not allowed
JOB_NB=1

# if your number of threads is limited, increase these limits as needed
ulimit -u 60000
ulimit -n 524288
# ! USER END __________

# __doc_head_address_start__
# script was edited from:
# https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html

# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi
# __doc_head_address_end__

# __doc_head_ray_start__
port=$((6378 + JOB_NB))
node_manager_port=$((6600 + JOB_NB * 100))
object_manager_port=$((6601 + JOB_NB * 100))
ray_client_server_port=$((1 + JOB_NB * 10000))
redis_shard_ports=$((6602 + JOB_NB * 100))
min_worker_port=$((2 + JOB_NB * 10000))
max_worker_port=$((9999 + JOB_NB * 10000))
dashboard_port=$((8265 + JOB_NB))

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
ray start --head --node-ip-address="$head_node_ip" \
--port=$port \
--node-manager-port=$node_manager_port \
--object-manager-port=$object_manager_port \
--ray-client-server-port=$ray_client_server_port \
--redis-shard-ports=$redis_shard_ports \
--min-worker-port=$min_worker_port \
--max-worker-port=$max_worker_port \
--dashboard-port=$dashboard_port \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK:-0}" --block &
# __doc_head_ray_end__

# __doc_worker_ray_start__
# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "Starting WORKER $i at $node_i"
srun --nodes=1 --ntasks=1 -w "$node_i" \
ray start --address "$ip_head" \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK:-0}" --block &
sleep 5
done
# __doc_worker_ray_end__

# Output the dashboard URL
dashboard_url="http://${head_node_ip}:${dashboard_port}"
export RAY_DASHBOARD_URL="$dashboard_url"
echo "Ray Dashboard URL: $RAY_DASHBOARD_URL"

# __doc_script_start__
python -u q2_ritme/run_n_eval_tune.py --config $CONFIG
sstat -j $SLURM_JOB_ID

110 changes: 110 additions & 0 deletions launch_slurm_cpu_multi_node.sh
@@ -0,0 +1,110 @@
#!/bin/bash

#SBATCH --job-name="run_config"
#SBATCH -A partition_name
#SBATCH --nodes=1
#SBATCH --cpus-per-task=100
#SBATCH --time=23:59:59
#SBATCH --mem-per-cpu=4096
#SBATCH --output="%x_out.txt"
#SBATCH --open-mode=append

set -x

echo "SLURM_CPUS_PER_TASK: $SLURM_CPUS_PER_TASK"
echo "SLURM_GPUS_PER_TASK: $SLURM_GPUS_PER_TASK"

# ! USER SETTINGS HERE
# -> config file to use
CONFIG="q2_ritme/run_config.json"
# -> count of this concurrent job launched on the same infrastructure
# -> only these values are allowed: 1, 2, 3 - since the ports below are
# -> otherwise taken or not allowed
JOB_NB=2

# if your number of threads is limited, increase these limits as needed
ulimit -u 60000
ulimit -n 524288
# ! USER END __________

# __doc_head_address_start__
# script was edited from:
# https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html

# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
nodes_array=($nodes)

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
IFS=' ' read -ra ADDR <<<"$head_node_ip"
if [[ ${#ADDR[0]} -gt 16 ]]; then
head_node_ip=${ADDR[1]}
else
head_node_ip=${ADDR[0]}
fi
echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi
# __doc_head_address_end__

# __doc_head_ray_start__
port=$((6378 + JOB_NB))
node_manager_port=$((6600 + JOB_NB * 100))
object_manager_port=$((6601 + JOB_NB * 100))
ray_client_server_port=$((1 + JOB_NB * 10000))
redis_shard_ports=$((6602 + JOB_NB * 100))
min_worker_port=$((2 + JOB_NB * 10000))
max_worker_port=$((9999 + JOB_NB * 10000))
dashboard_port=$((8265 + JOB_NB))

ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
ray start --head --node-ip-address="$head_node_ip" \
--port=$port \
--node-manager-port=$node_manager_port \
--object-manager-port=$object_manager_port \
--ray-client-server-port=$ray_client_server_port \
--redis-shard-ports=$redis_shard_ports \
--min-worker-port=$min_worker_port \
--max-worker-port=$max_worker_port \
--dashboard-port=$dashboard_port \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK:-0}" --block &
# __doc_head_ray_end__

# __doc_worker_ray_start__
# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
node_i=${nodes_array[$i]}
echo "Starting WORKER $i at $node_i"
srun --nodes=1 --ntasks=1 -w "$node_i" \
ray start --address "$ip_head" \
--num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_TASK:-0}" --block &
sleep 5
done
# __doc_worker_ray_end__

# Output the dashboard URL
dashboard_url="http://${head_node_ip}:${dashboard_port}"
export RAY_DASHBOARD_URL="$dashboard_url"
echo "Ray Dashboard URL: $RAY_DASHBOARD_URL"

# __doc_script_start__
python -u q2_ritme/run_n_eval_tune.py --config $CONFIG
sstat -j $SLURM_JOB_ID

# get elapsed time of job
echo "TIME COUNTER:"
sacct -j $SLURM_JOB_ID --format=elapsed --allocations
22 changes: 11 additions & 11 deletions q2_ritme/tune_models.py
@@ -1,17 +1,16 @@
import logging
import os
import random

import dotenv
import numpy as np
import pandas as pd
import ray
import skbio
import torch
from ray import air, init, tune
from ray.air.integrations.mlflow import MLflowLoggerCallback
from ray.air.integrations.wandb import WandbLoggerCallback
from ray.tune.schedulers import AsyncHyperBandScheduler, HyperBandScheduler
from ray.tune.search import ConcurrencyLimiter
from ray.tune.search.optuna import OptunaSearch

from q2_ritme.model_space import static_searchspace as ss
@@ -91,13 +90,14 @@ def run_trials(
# todo: configure dashboard here - see "ray dashboard set up" online once
# todo: ray (Q2>Py) is updated
context = init(
address="auto",
address="local",
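# "local" starts a fresh Ray instance on this node, whereas "auto" would
# try to attach to an already running Ray cluster (e.g. one started via slurm)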
include_dashboard=False,
ignore_reinit_error=True,
logging_level=logging.DEBUG,
log_to_driver=True,
# logging_level=logging.DEBUG,
# log_to_driver=True,
)
print(context.dashboard_url)
print(f"Ray cluster resources: {ray.cluster_resources()}")
print(f"Dashboard URL at: {context.dashboard_url}")

# note: both schedulers might decide to run more trials than allocated
if not fully_reproducible:
# AsyncHyperBand enables aggressive early stopping of bad trials.
@@ -112,9 +112,9 @@
)

search_algo = OptunaSearch(seed=seed_model)
search_algo = ConcurrencyLimiter(
search_algo, max_concurrent=max_concurrent_trials
)
# search_algo = ConcurrencyLimiter(
# search_algo, max_concurrent=max_concurrent_trials
# )
else:
# ! HyperBandScheduler slower BUT
# ! improves the reproducibility of experiments by ensuring that all trials
@@ -195,8 +195,8 @@ def run_trials(
scheduler=scheduler,
# number of trials to run - schedulers might decide to run more trials
num_samples=num_trials,
# # todo: remove below or change search_alg
# max_concurrent_trials=max_concurrent_trials,
# set max concurrent trials to launch
max_concurrent_trials=max_concurrent_trials,
# ! set seed
search_alg=search_algo,
),
