mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-03 17:27:05 +08:00
Add VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
This commit is contained in:
parent
b2eb2b5ad7
commit
feeb17303d
@ -222,10 +222,13 @@ class GroupCoordinator:
|
|||||||
|
|
||||||
for ranks in group_ranks:
|
for ranks in group_ranks:
|
||||||
device_group = torch.distributed.new_group(
|
device_group = torch.distributed.new_group(
|
||||||
ranks, backend=torch_distributed_backend)
|
ranks, backend=torch_distributed_backend,
|
||||||
|
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||||
# a group with `gloo` backend, to allow direct coordination between
|
# a group with `gloo` backend, to allow direct coordination between
|
||||||
# processes through the CPU.
|
# processes through the CPU.
|
||||||
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
|
cpu_group = torch.distributed.new_group(ranks,
|
||||||
|
backend="gloo",
|
||||||
|
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||||
if self.rank in ranks:
|
if self.rank in ranks:
|
||||||
self.ranks = ranks
|
self.ranks = ranks
|
||||||
self.world_size = len(ranks)
|
self.world_size = len(ranks)
|
||||||
@ -965,7 +968,8 @@ def init_distributed_environment(
|
|||||||
backend=backend,
|
backend=backend,
|
||||||
init_method=distributed_init_method,
|
init_method=distributed_init_method,
|
||||||
world_size=world_size,
|
world_size=world_size,
|
||||||
rank=rank)
|
rank=rank,
|
||||||
|
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||||
# set the local rank
|
# set the local rank
|
||||||
# local_rank is not available in torch ProcessGroup,
|
# local_rank is not available in torch ProcessGroup,
|
||||||
# see https://github.com/pytorch/pytorch/issues/122816
|
# see https://github.com/pytorch/pytorch/issues/122816
|
||||||
|
|||||||
@ -140,6 +140,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
||||||
VLLM_USE_CUDNN_PREFILL: bool = False
|
VLLM_USE_CUDNN_PREFILL: bool = False
|
||||||
VLLM_LOOPBACK_IP: str = ""
|
VLLM_LOOPBACK_IP: str = ""
|
||||||
|
VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
"VLLM_IMAGE_FETCH_TIMEOUT":
|
"VLLM_IMAGE_FETCH_TIMEOUT":
|
||||||
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
|
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
|
||||||
|
|
||||||
|
# Timeout, in seconds, passed to torch.distributed process-group setup
# (init_process_group / new_group). Defaults to None when unset.
|
||||||
|
"VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS":
|
||||||
|
# Read this entry's own environment variable (not VLLM_IMAGE_FETCH_TIMEOUT);
# when it is unset, os.getenv yields None, which is handed to maybe_convert_int.
lambda: maybe_convert_int(
    os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
|
||||||
|
|
||||||
# Timeout for fetching videos when serving multimodal models
|
# Timeout for fetching videos when serving multimodal models
|
||||||
# Default is 30 seconds
|
# Default is 30 seconds
|
||||||
"VLLM_VIDEO_FETCH_TIMEOUT":
|
"VLLM_VIDEO_FETCH_TIMEOUT":
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user