From d23693aed50e26e6ce6fdc5c0fe060171985d4a8 Mon Sep 17 00:00:00 2001 From: Dinghao Zhou Date: Mon, 6 May 2024 16:56:17 +0800 Subject: [PATCH 1/2] fix rdzv_id rdzv_id may not be equal in a multi-node setup when using $RANDOM --- distributed/ddp-tutorial-series/slurm/sbatch_run.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/distributed/ddp-tutorial-series/slurm/sbatch_run.sh b/distributed/ddp-tutorial-series/slurm/sbatch_run.sh index ac02d19e99..2b59634c74 100644 --- a/distributed/ddp-tutorial-series/slurm/sbatch_run.sh +++ b/distributed/ddp-tutorial-series/slurm/sbatch_run.sh @@ -14,10 +14,11 @@ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) echo Node IP: $head_node_ip export LOGLEVEL=INFO +job_id=2024 srun torchrun \ --nnodes 4 \ --nproc_per_node 1 \ ---rdzv_id $RANDOM \ +--rdzv_id ${jobid} \ --rdzv_backend c10d \ --rdzv_endpoint $head_node_ip:29500 \ -/shared/examples/multinode_torchrun.py 50 10 \ No newline at end of file +/shared/examples/multinode_torchrun.py 50 10 From 0646d69f74eb8f22bde122a35b78a7d30f234494 Mon Sep 17 00:00:00 2001 From: Dinghao Zhou Date: Mon, 6 May 2024 16:56:50 +0800 Subject: [PATCH 2/2] Update sbatch_run.sh --- distributed/ddp-tutorial-series/slurm/sbatch_run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/distributed/ddp-tutorial-series/slurm/sbatch_run.sh b/distributed/ddp-tutorial-series/slurm/sbatch_run.sh index 2b59634c74..2676b684ea 100644 --- a/distributed/ddp-tutorial-series/slurm/sbatch_run.sh +++ b/distributed/ddp-tutorial-series/slurm/sbatch_run.sh @@ -18,7 +18,7 @@ job_id=2024 srun torchrun \ --nnodes 4 \ --nproc_per_node 1 \ ---rdzv_id ${jobid} \ +--rdzv_id ${job_id} \ --rdzv_backend c10d \ --rdzv_endpoint $head_node_ip:29500 \ /shared/examples/multinode_torchrun.py 50 10