Add VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2026-06-03 17:27:05 +08:00 · 2025-07-18 14:39:11 -04:00 · 2025-07-18 14:39:11 -04:00 · feeb17303d
commit feeb17303d
parent b2eb2b5ad7
2 changed files with 12 additions and 3 deletions
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@ -222,10 +222,13 @@ class GroupCoordinator:
        for ranks in group_ranks:
            device_group = torch.distributed.new_group(
-                ranks, backend=torch_distributed_backend)
+                ranks, backend=torch_distributed_backend,
                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
            # a group with `gloo` backend, to allow direct coordination between
            # processes through the CPU.
-            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            cpu_group = torch.distributed.new_group(ranks, 
                                                    backend="gloo",
                                                    timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
            if self.rank in ranks:
                self.ranks = ranks
                self.world_size = len(ranks)
@ -965,7 +968,8 @@ def init_distributed_environment(
            backend=backend,
            init_method=distributed_init_method,
            world_size=world_size,
-            rank=rank)
+            rank=rank,
            timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
    # set the local rank
    # local_rank is not available in torch ProcessGroup,
    # see https://github.com/pytorch/pytorch/issues/122816
--- a/vllm/envs.py
+++ b/vllm/envs.py
@ -140,6 +140,7 @@ if TYPE_CHECKING:
    VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
    VLLM_USE_CUDNN_PREFILL: bool = False
    VLLM_LOOPBACK_IP: str = ""
    VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None
 def get_default_cache_root():
@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_IMAGE_FETCH_TIMEOUT":
    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
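    # Timeout, in seconds, passed as `timeout=` to
    # torch.distributed.init_process_group and torch.distributed.new_group.
    # NOTE(review): torch.distributed documents `timeout` as a
    # datetime.timedelta; confirm the call sites convert this integer.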
    "VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS": 
    lambda: maybe_convert_int(
        os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
    # Timeout for fetching videos when serving multimodal models
    # Default is 30 seconds
    "VLLM_VIDEO_FETCH_TIMEOUT":