From 429284dc374bab79d4dfbb25053583901e6e5051 Mon Sep 17 00:00:00 2001
From: Junda Chen <32371474+GindaChen@users.noreply.github.com>
Date: Thu, 14 Mar 2024 23:25:05 -0700
Subject: [PATCH] Fix `dist.broadcast` stall without group argument (#3408)

---
 vllm/model_executor/parallel_utils/communication_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/parallel_utils/communication_op.py b/vllm/model_executor/parallel_utils/communication_op.py
index 521b6b8a383b0..6f00fd001d956 100644
--- a/vllm/model_executor/parallel_utils/communication_op.py
+++ b/vllm/model_executor/parallel_utils/communication_op.py
@@ -177,7 +177,7 @@ def broadcast_tensor_dict(
         for key, value in metadata_list:
             if isinstance(value, TensorMetadata):
                 tensor = tensor_dict[key]
-                torch.distributed.broadcast(tensor, src=src)
+                torch.distributed.broadcast(tensor, src=src, group=group)
     else:
         recv_metadata_list = [None]
         torch.distributed.broadcast_object_list(recv_metadata_list,