Skip to content

Commit

Permalink
ReturnnTrainingJob torchrun, use --standalone for single node (#462)
Browse files Browse the repository at this point in the history
Fix #459
  • Loading branch information
albertz authored Nov 27, 2023
1 parent 80bcfb2 commit 3d0dd86
Showing 1 changed file with 6 additions and 3 deletions.
9 changes: 6 additions & 3 deletions returnn/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,14 @@ def _get_run_cmd(self):
if self.horovod_num_processes:
if self.distributed_launch_cmd == "torchrun":
# use torchrun to launch DDP training when the backend is torch
run_cmd = [
"torchrun",
prefix = ["torchrun"]
if (self.multi_node_slots or 1) == 1:
prefix += ["--standalone"]
prefix += [
f"--nnodes={self.multi_node_slots or 1}",
f"--nproc-per-node={self.horovod_num_processes}",
] + run_cmd[1:]
]
run_cmd = prefix + run_cmd[1:]
elif self.distributed_launch_cmd == "mpirun":
# Normally, if the engine (e.g. SGE or Slurm) is configured correctly,
# it automatically provides the information on multiple nodes to mpirun,
Expand Down

0 comments on commit 3d0dd86

Please sign in to comment.