diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 1bb0ca79cc1da..10a2ce3e05464 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -222,10 +222,13 @@ class GroupCoordinator:
 
         for ranks in group_ranks:
             device_group = torch.distributed.new_group(
-                ranks, backend=torch_distributed_backend)
+                ranks, backend=torch_distributed_backend,
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             # a group with `gloo` backend, to allow direct coordination between
             # processes through the CPU.
-            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            cpu_group = torch.distributed.new_group(
+                ranks, backend="gloo",
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             if self.rank in ranks:
                 self.ranks = ranks
                 self.world_size = len(ranks)
@@ -965,7 +968,8 @@ def init_distributed_environment(
             backend=backend,
             init_method=distributed_init_method,
             world_size=world_size,
-            rank=rank)
+            rank=rank,
+            timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
diff --git a/vllm/envs.py b/vllm/envs.py
index 261cc7855b705..821a86291f42a 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -140,6 +140,7 @@ if TYPE_CHECKING:
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_LOOPBACK_IP: str = ""
+    VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None
 
 
 def get_default_cache_root():
@@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_IMAGE_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
 
+    # Timeout in seconds for torch.distributed process group initialization
+    "VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS":
+    lambda: maybe_convert_int(os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
+
     # Timeout for fetching videos when serving multimodal models
     # Default is 30 seconds
     "VLLM_VIDEO_FETCH_TIMEOUT":
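
Note on how the new value reaches torch.distributed: init_process_group and new_group take their timeout argument as a datetime.timedelta, while VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS is read as an optional int. Below is a minimal sketch of converting the seconds value into that type, assuming this patch is applied; the helper name distributed_init_timeout is illustrative and not part of the patch.

# Sketch only: turn the optional VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS value
# into the datetime.timedelta that torch.distributed expects; returning None
# keeps PyTorch's default process-group timeout.
from datetime import timedelta
from typing import Optional

import vllm.envs as envs


def distributed_init_timeout() -> Optional[timedelta]:
    seconds = envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS
    if seconds is None:
        return None
    return timedelta(seconds=seconds)

A call site would then read, for example, torch.distributed.new_group(ranks, backend="gloo", timeout=distributed_init_timeout()), and the variable is set like any other vLLM env var, e.g. VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS=600 for a ten-minute limit.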