Training a 75B model (TP8, PP4, SFT fine-tuning on 4 A800s) always fails with torch.distributed.DistStoreError: Socket Timeout #468
Comments
The launched job is 32 nodes × 8 GPUs; please check your parameter configuration.
Hello, here is the modified configuration (full launch script below); it still fails with the error shown afterwards:
Run command:
#!/bin/bash
set -e
ENV=$1
CURRENT_DIR="$( cd "$( dirname "$0" )" && pwd )"
MEGATRON_PATH=$( dirname $( dirname ${CURRENT_DIR}))
export PYTHONPATH=${MEGATRON_PATH}:${MEGATRON_PATH}/PAI-Megatron-LM-240718:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_CONNECT_TIMEOUT=14400
export NCCL_EXEC_TIMEOUT=14400
# Here are some configs controlled by env
if [ -z ${MP_DATASET_TYPE} ];then
MP_DATASET_TYPE="idxmap"
fi
if [ -z ${MP_AC_LAYERS} ];then
MP_AC_LAYERS=1
fi
export RANK=$PET_NODE_RANK
export KUBERNETES_CONTAINER_RESOURCE_GPU=$PET_NPROC_PER_NODE
if [ $ENV = dsw ]; then # single node, multiple GPUs
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
MASTER_ADDR=localhost
MASTER_PORT=$(shuf -n 1 -i 10000-65535)
NNODES=1
NODE_RANK=0
GPUS_PER_NODE=8
elif [ $ENV = dlc ]; then # multiple nodes, multiple GPUs
NNODES=${WORLD_SIZE}
NODE_RANK=${RANK}
GPUS_PER_NODE=${KUBERNETES_CONTAINER_RESOURCE_GPU}
fi
if [ -z ${MP_VP} ]; then
vp_options=""
else
vp_options="
--num-layers-per-virtual-pipeline-stage ${MP_VP}"
fi
if [ -z ${MP_SFT_PACKING} ]; then
MP_SFT_PACKING=false
fi
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
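# Illustrative sanity check (a sketch, not part of the original launcher; assumes the
# DLC-style WORLD_SIZE/RANK/MASTER_* variables used above): in the dlc branch NNODES is
# taken directly from WORLD_SIZE, so a job submitted with 32 workers makes torchrun wait
# for 32 node agents even though TP=8 x PP=4 only needs 32 ranks, i.e. 4 nodes with 8 GPUs
# each. Printing the resolved values before launch makes a mismatch like "4/32 clients
# joined" easy to spot.
echo "NNODES=${NNODES} NODE_RANK=${NODE_RANK} GPUS_PER_NODE=${GPUS_PER_NODE} MASTER_ADDR=${MASTER_ADDR} MASTER_PORT=${MASTER_PORT}"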
# BASE CONFIG
MODEL_SIZE=$2
BATCH_SIZE=$3
GLOBAL_BATCH_SIZE=$4
LR=$5
MIN_LR=$6
SEQ_LEN=$7
PAD_LEN=$8
PR=$9
# BASE CONFIG
# PARALLEL / BOOL OPTION
TP=${10}
PP=${11}
CP=${12}
SP=${13}
DO=${14}
FL=${15}
SFT=${16}
# PARALLEL / BOOL OPTION
# OTHERS
AC=${17}
OPTIMIZER_OFFLOAD=${18}
SAVE_INTERVAL=${19}
DATASET_PATH=${20}
VALID_DATASET_PATH=${21}
PRETRAIN_CHECKPOINT_PATH=${22}
# The following two values will not be used when SFT is true
TRAIN_TOKENS=${23}
WARMUP_TOKENS=${24}
###############################
OUTPUT_BASEPATH=${25}
# OTHERS
if [ $FL = true ]; then
export NVTE_FLASH_ATTN=1 NVTE_FUSED_ATTN=0
elif [ $FL = false ]; then
export NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1
fi
if [ $MODEL_SIZE = 8B ]; then
NUM_LAYERS=24
HIDDEN_SIZE=896
NUM_ATTN_HEADS=14
INTERMEDIATE_SIZE=4864
NUM_KEY_VALUE_HEADS=2
MAX_POSITION_EMBEDDINGS=32768
EXTRA_VOCAB_SIZE=293
RMS_NORM_EPS=1e-6
gqa_options="
--group-query-attention
--num-query-groups ${NUM_KEY_VALUE_HEADS}"
tie_option=""
elif [ $MODEL_SIZE = 57B ]; then
NUM_LAYERS=64
HIDDEN_SIZE=5120
NUM_ATTN_HEADS=40
INTERMEDIATE_SIZE=27648
NUM_KEY_VALUE_HEADS=8
MAX_POSITION_EMBEDDINGS=131072
EXTRA_VOCAB_SIZE=421
RMS_NORM_EPS=1e-5
gqa_options="
--group-query-attention
--num-query-groups ${NUM_KEY_VALUE_HEADS}"
tie_option="
--untie-embeddings-and-output-weights
"
elif [ $MODEL_SIZE = 75B ]; then
NUM_LAYERS=84
HIDDEN_SIZE=8192
NUM_ATTN_HEADS=64
INTERMEDIATE_SIZE=29184
NUM_KEY_VALUE_HEADS=8
MAX_POSITION_EMBEDDINGS=8192
EXTRA_VOCAB_SIZE=488
RMS_NORM_EPS=1e-06
gqa_options="
--group-query-attention
--num-query-groups ${NUM_KEY_VALUE_HEADS}"
tie_option="
--untie-embeddings-and-output-weights
"
fi
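# Illustrative consistency check (a sketch, not part of the original launcher; TOTAL_GPUS is
# a helper variable introduced here): the model-parallel grid must fit into the launched
# world size, and NUM_LAYERS must split evenly across the pipeline stages. For the 75B
# config above, TP=8 and PP=4 need a multiple of 32 ranks (4 nodes x 8 GPUs), and 84 layers
# give 21 layers per pipeline stage.
TOTAL_GPUS=$(( GPUS_PER_NODE * NNODES ))
if [ $(( TOTAL_GPUS % (TP * PP) )) -ne 0 ] || [ $(( NUM_LAYERS % PP )) -ne 0 ]; then
echo "WARNING: TOTAL_GPUS=${TOTAL_GPUS} does not fit TP=${TP}, PP=${PP}, NUM_LAYERS=${NUM_LAYERS}"
fi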
TP_COMM_OVERLAP=$(( ($TP > 1) ? 1 : 0 ))
comm_overlap_option="
--overlap-grad-reduce
--overlap-param-gather"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
comm_overlap_option="
--tp-comm-overlap
--overlap-grad-reduce
--overlap-param-gather"
fi
if [ $AC = full ]; then
_check=$(( ($NUM_LAYERS / $PP) % ${MP_AC_LAYERS} ))
if [ $_check != 0 ]; then
echo "the num layers per pp rank must be a multiple of the recompute layers."
exit -1
fi
activation_checkpoint_options="
--recompute-method uniform
--recompute-num-layers ${MP_AC_LAYERS}
--recompute-granularity full"
elif [ $AC = sel ]; then
activation_checkpoint_options="
--recompute-activations"
elif [ $AC = none ]; then
activation_checkpoint_options="
"
elif [ $AC = offload ]; then
activation_checkpoint_options="
--cpu-offloading
--cpu-offloading-num-layers ${MP_AC_LAYERS}"
if [ $TP_COMM_OVERLAP -eq 1 ]; then
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option="
--tp-comm-overlap"
else
echo "Disable --overlap-grad-reduce and --overlap-param-gather when cpu offloading is on..."
comm_overlap_option=""
fi
fi
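# Worked example for the AC=full check above (illustrative): with NUM_LAYERS=84, PP=4 and
# MP_AC_LAYERS=1, (84 / 4) % 1 = 0, so full recompute would be accepted; the failing run in
# the log below used AC=sel, which only recomputes the attention activations.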
if [ $PR = fp16 ]; then
pr_options="
--fp16
--apply-query-key-layer-scaling"
export NVTE_APPLY_QK_LAYER_SCALING=1
elif [ $PR = bf16 ]; then
pr_options="
--bf16"
elif [ $PR = fp8 ]; then
pr_options="
--bf16
--fp8-format hybrid
--fp8-amax-compute-algo max
--fp8-amax-history-len 1024"
fi
if [ $OPTIMIZER_OFFLOAD != false ] && [ $DO = false ]; then
echo "Offload optimizer is valid only if $DO=true"
DO=true
fi
if [ $DO = true ]; then
do_options="
--use-distributed-optimizer"
elif [ $DO = false ]; then
do_options="
"
fi
te_options="
--transformer-impl transformer_engine"
if [ $SP = true ] && [ $TP -gt 1 ]; then
sp_options="
--sequence-parallel"
elif [ $SP = false ]; then
sp_options="
"
fi
if [ $PRETRAIN_CHECKPOINT_PATH != none ]; then
load_options="
--load $PRETRAIN_CHECKPOINT_PATH"
fi
if [ $OPTIMIZER_OFFLOAD = 'static' ]; then
offload_option="
--optimizer hybridadam
--optimizer-offload-policy static
--optimizer-offload-fraction 1.0"
elif [ $OPTIMIZER_OFFLOAD = 'auto' ]; then
offload_option="
--optimizer hybridadam
--optimizer-offload-policy auto"
else
offload_option=""
fi
if [ $SFT = true ]; then
TRAIN_ITERS=${23}
LR_WARMUP_ITERS=${24}
LR_DECAY_ITERS=$(( ${TRAIN_ITERS} - ${LR_WARMUP_ITERS}))
PREFIX="finetune-mcore-jiutian75B-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_option="
--eod-mask-loss
--train-mode finetune"
else
TRAIN_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_WARMUP_ITERS=$(( ${WARMUP_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
LR_DECAY_ITERS=$(( ${TRAIN_TOKENS} / ${GLOBAL_BATCH_SIZE} / ${SEQ_LEN} ))
PREFIX="pretrain-mcore-qwen2.5-${MODEL_SIZE}-lr-${LR}-minlr-${MIN_LR}-bs-${BATCH_SIZE}-gbs-${GLOBAL_BATCH_SIZE}-seqlen-${SEQ_LEN}"
sft_option="
--train-mode pretrain"
fi
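# Worked example (illustrative): in pretrain mode with GLOBAL_BATCH_SIZE=512 and SEQ_LEN=8192
# each iteration consumes 512 * 8192 = 4,194,304 tokens, so TRAIN_TOKENS of about 42B would
# give TRAIN_ITERS of roughly 10000. In SFT mode the iteration counts are taken directly from
# arguments ${23} and ${24}, which is how the run below arrives at --train-iters 10000.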
if [ ${MP_DATASET_TYPE} = "raw" ]; then
dataset_option="
--train-data-path ${DATASET_PATH}
--valid-data-path ${VALID_DATASET_PATH}
--dataloader-type cyclic
--dataset LLama-SFT-Raw"
else
dataset_option="
--data-path ${DATASET_PATH}
--split 9999,1,0
--dataset LLama-Pretrain-Idxmap"
fi
if [ ${MP_SFT_PACKING} = true ]; then
packing_options="
--reset-position-ids
--no-create-attention-mask-in-dataloader
"
else
packing_options=""
fi
# Prepare logdirs
NAME="${PREFIX}-pr-${PR}-tp-${TP}-pp-${PP}-cp-${CP}-ac-${AC}-do-${DO}-sp-${SP}-ti-${TRAIN_ITERS}-wi-${LR_WARMUP_ITERS}"
mkdir -p "${OUTPUT_BASEPATH}/tensorboard/"
mkdir -p "${OUTPUT_BASEPATH}/checkpoint/"
mkdir -p "${OUTPUT_BASEPATH}/log/"
current_time=$(date "+%Y.%m.%d-%H.%M.%S")
TENSORBOARD_DIR="${OUTPUT_BASEPATH}/tensorboard/${NAME}_${current_time}"
mkdir -p ${TENSORBOARD_DIR}
SAVED_PRETRAIN_CHECKPOINT_PATH="${OUTPUT_BASEPATH}/checkpoint/${NAME}"
mkdir -p ${SAVED_PRETRAIN_CHECKPOINT_PATH}
find -L ${PRETRAIN_CHECKPOINT_PATH} -maxdepth 1 -type f -name "*.json" -print0 | xargs -0 cp -t ${SAVED_PRETRAIN_CHECKPOINT_PATH}
megatron_options="
--save ${SAVED_PRETRAIN_CHECKPOINT_PATH}
--lr ${LR}
--min-lr ${MIN_LR}
--lr-decay-style cosine
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--clip-grad 1.0
--init-method-std 0.008
--attention-dropout 0.0
--hidden-dropout 0.0
--lr-decay-iters ${LR_DECAY_ITERS}
--lr-warmup-iters ${LR_WARMUP_ITERS}
--train-iters ${TRAIN_ITERS}
--micro-batch-size ${BATCH_SIZE}
--global-batch-size ${GLOBAL_BATCH_SIZE}
--num-layers ${NUM_LAYERS}
--hidden-size ${HIDDEN_SIZE}
--num-attention-heads ${NUM_ATTN_HEADS}
--ffn-hidden-size ${INTERMEDIATE_SIZE}
--seq-length ${SEQ_LEN}
--max-position-embeddings ${MAX_POSITION_EMBEDDINGS}
--max-padding-length ${PAD_LEN}
--log-interval 1
--log-throughput
--eval-interval 1000
--eval-iters 10
--save-interval ${SAVE_INTERVAL}
--tensorboard-queue-size 1
--tensorboard-dir ${TENSORBOARD_DIR}
--log-timers-to-tensorboard
--log-batch-size-to-tensorboard
--log-validation-ppl-to-tensorboard
--tensor-model-parallel-size ${TP}
--pipeline-model-parallel-size ${PP}
--context-parallel-size ${CP}
--no-load-optim
--no-load-rng
--num-workers 8
--extra-vocab-size ${EXTRA_VOCAB_SIZE}
--patch-tokenizer-type Qwen2Tokenizer
--swiglu
--normalization RMSNorm
--norm-epsilon ${RMS_NORM_EPS}
--use-rotary-position-embeddings
--position-embedding-type rope
--disable-bias-linear
--add-qkv-bias
--rotary-percent 1.0
--rotary-base 500000
--rotary-seq-len-interpolation-factor 1
--no-save-optim
--calculate-per-token-loss
"
run_cmd="torchrun $DISTRIBUTED_ARGS ../qwen2/pretrain_qwen.py
${megatron_options} ${dataset_option} ${pr_options} ${load_options} ${te_options} ${activation_checkpoint_options}
${do_options} ${sp_options} ${gqa_options} ${offload_option} ${comm_overlap_option} ${sft_option} ${tie_option} ${vp_options} ${packing_options}"
echo ${run_cmd}
eval ${run_cmd}
set +x
Error:
Node 0 error:
W0219 12:09:28.313000 140145133409408 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
sys.exit(load_entry_point('torch==2.4.0a0+07cecf4168.nv24.5', 'console_scripts', 'torchrun')())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
result = agent.run()
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
result = self._invoke_run(role)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
self._initialize_workers(self._worker_group)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 705, in _initialize_workers
self._rendezvous(worker_group)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 548, in _rendezvous
store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py", line 55, in next_rendezvous
self._store = TCPStore( # type: ignore[call-arg]
torch.distributed.DistStoreError: Timed out after 901 seconds waiting for clients. 4/32 clients joined.
Nodes 1, 2, and 3 error:
torchrun --nproc_per_node 8 --nnodes 32 --node_rank 1 --master_addr jiutian75testlcf-master-0 --master_port 23456 ../qwen2/pretrain_qwen.py --save /workspace/mnt/cmss-liangchunfeng/output3_mcore_jiutian75B_test_finetune/checkpoint/finetune-mcore-jiutian75B-75B-lr-1e-5-minlr-1e-6-bs-1-gbs-512-seqlen-8192-pr-bf16-tp-8-pp-4-cp-1-ac-sel-do-true-sp-true-ti-10000-wi-100 --lr 1e-5 --min-lr 1e-6 --lr-decay-style cosine --weight-decay 0.1 --adam-beta1 0.9 --adam-beta2 0.95 --clip-grad 1.0 --init-method-std 0.008 --attention-dropout 0.0 --hidden-dropout 0.0 --lr-decay-iters 9900 --lr-warmup-iters 100 --train-iters 10000 --micro-batch-size 1 --global-batch-size 512 --num-layers 84 --hidden-size 8192 --num-attention-heads 64 --ffn-hidden-size 29184 --seq-length 8192 --max-position-embeddings 8192 --max-padding-length 8192 --log-interval 1 --log-throughput --eval-interval 1000 --eval-iters 10 --save-interval 10000 --tensorboard-queue-size 1 --tensorboard-dir /workspace/mnt/cmss-liangchunfeng/output3_mcore_jiutian75B_test_finetune/tensorboard/finetune-mcore-jiutian75B-75B-lr-1e-5-minlr-1e-6-bs-1-gbs-512-seqlen-8192-pr-bf16-tp-8-pp-4-cp-1-ac-sel-do-true-sp-true-ti-10000-wi-100_2025.02.19-12.09.52 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --tensor-model-parallel-size 8 --pipeline-model-parallel-size 4 --context-parallel-size 1 --no-load-optim --no-load-rng --num-workers 8 --extra-vocab-size 488 --patch-tokenizer-type Qwen2Tokenizer --swiglu --normalization RMSNorm --norm-epsilon 1e-06 --use-rotary-position-embeddings --position-embedding-type rope --disable-bias-linear --add-qkv-bias --rotary-percent 1.0 --rotary-base 500000 --rotary-seq-len-interpolation-factor 1 --no-save-optim --calculate-per-token-loss --data-path /workspace/mnt/cmss-liangchunfeng/qwen-datasets/0218_new_mmap_cmss_sft_dataset_text_document --split 9999,1,0 --dataset LLama-Pretrain-Idxmap --bf16 --load /workspace/mnt/llm_models/jiutian/JIUTIAN-75B-8k-chat/models/jiutian-75B-hf-to-mcore-te-tp8-pp4 --transformer-impl transformer_engine --recompute-activations --use-distributed-optimizer --sequence-parallel --group-query-attention --num-query-groups 8 --tp-comm-overlap --overlap-grad-reduce --overlap-param-gather --eod-mask-loss --train-mode finetune --untie-embeddings-and-output-weights
W0219 12:09:57.549000 140165627352192 torch/distributed/run.py:757]
W0219 12:09:57.549000 140165627352192 torch/distributed/run.py:757] *****************************************
W0219 12:09:57.549000 140165627352192 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
W0219 12:09:57.549000 140165627352192 torch/distributed/run.py:757] *****************************************
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.4.0a0+07cecf4168.nv24.5', 'console_scripts', 'torchrun')())
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 879, in main
run(args)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/run.py", line 870, in run
elastic_launch(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/launcher/api.py", line 254, in launch_agent
result = agent.run()
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 733, in run
result = self._invoke_run(role)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 870, in _invoke_run
self._initialize_workers(self._worker_group)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 705, in _initialize_workers
self._rendezvous(worker_group)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 551, in _rendezvous
workers = self._assign_worker_ranks(store, group_rank, group_world_size, spec)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/metrics/api.py", line 123, in wrapper
result = f(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 638, in _assign_worker_ranks
role_infos = self._share_and_gather(store, group_rank, group_world_size, spec)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/agent/server/api.py", line 675, in _share_and_gather
role_infos_bytes = store_util.synchronize(
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize
agent_data = get_all(store, rank, key_prefix, world_size)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all
data = store.get(f"{prefix}{idx}")
torch.distributed.DistStoreError: Socket Timeout
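Given the timeout on node 0 ("4/32 clients joined") and the maintainer's note that the job was launched as 32 nodes × 8 GPUs, it may help to confirm on every node, before running the script, that the scheduler-injected rendezvous variables match the hardware actually allocated and that the master is reachable. The lines below are only a sketch and reuse the WORLD_SIZE / RANK / MASTER_ADDR / MASTER_PORT variables assumed by the script above:
echo "WORLD_SIZE=${WORLD_SIZE:-unset} RANK=${RANK:-unset} MASTER_ADDR=${MASTER_ADDR:-unset} MASTER_PORT=${MASTER_PORT:-unset}"
nvidia-smi --query-gpu=index,name --format=csv,noheader | wc -l   # number of GPUs visible on this node
# check that the rendezvous port on the master answers (5 s timeout)
timeout 5 bash -c "cat < /dev/null > /dev/tcp/${MASTER_ADDR}/${MASTER_PORT}" && echo "master reachable" || echo "cannot reach ${MASTER_ADDR}:${MASTER_PORT}"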