mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-31 00:17:09 +08:00
[Bug Fix] get_distributed_init_method should get the ip from get_ip i… (#20889)
Signed-off-by: Chen Li <lcpingping@gmail.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Russell Bryant <rbryant@redhat.com>
This commit is contained in:
parent
19c863068b
commit
10be209493
@ -139,6 +139,7 @@ if TYPE_CHECKING:
|
|||||||
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
|
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
|
||||||
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
|
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
|
||||||
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
||||||
|
VLLM_LOOPBACK_IP: str = ""
|
||||||
|
|
||||||
|
|
||||||
def get_default_cache_root():
|
def get_default_cache_root():
|
||||||
@ -964,6 +965,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
|||||||
# If set to 1, use the TRTLLM Decode Attention backend in flashinfer.
|
# If set to 1, use the TRTLLM Decode Attention backend in flashinfer.
|
||||||
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
|
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
|
||||||
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),
|
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),
|
||||||
|
|
||||||
|
# Used to force set up loopback IP
|
||||||
|
"VLLM_LOOPBACK_IP":
|
||||||
|
lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
|
||||||
}
|
}
|
||||||
|
|
||||||
# --8<-- [end:env-vars-definition]
|
# --8<-- [end:env-vars-definition]
|
||||||
|
|||||||
@ -813,6 +813,33 @@ def get_ip() -> str:
|
|||||||
return "0.0.0.0"
|
return "0.0.0.0"
|
||||||
|
|
||||||
|
|
||||||
|
def test_loopback_bind(address, family):
    """Report whether a UDP socket can be bound to ``address``.

    Args:
        address: IP address string to try binding to.
        family: Socket address family, e.g. ``socket.AF_INET``.

    Returns:
        True if the bind succeeded; False if creating, binding, or
        closing the socket raised ``OSError``.
    """
    try:
        # The context manager closes the socket on every path, so a
        # failed bind() does not leak the descriptor.
        with socket.socket(family, socket.SOCK_DGRAM) as probe:
            probe.bind((address, 0))  # Port 0 = auto assign
        return True
    except OSError:
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_loopback_ip() -> str:
    """Return the loopback address to use for local communication.

    A non-empty ``VLLM_LOOPBACK_IP`` setting is returned as-is. Otherwise
    the IPv4 loopback is probed first and the IPv6 loopback second, each
    by attempting a bind via ``test_loopback_bind``.

    Raises:
        RuntimeError: If neither loopback address can be bound.
    """
    configured_ip = envs.VLLM_LOOPBACK_IP
    if configured_ip:
        return configured_ip

    # No explicit override: fall back to probing, IPv4 before IPv6.
    if test_loopback_bind("127.0.0.1", socket.AF_INET):
        return "127.0.0.1"

    if test_loopback_bind("::1", socket.AF_INET6):
        return "::1"

    raise RuntimeError(
        "Neither 127.0.0.1 nor ::1 are bound to a local interface. "
        "Set the VLLM_LOOPBACK_IP environment variable explicitly.")
|
||||||
|
|
||||||
|
|
||||||
def is_valid_ipv6_address(address: str) -> bool:
|
def is_valid_ipv6_address(address: str) -> bool:
|
||||||
try:
|
try:
|
||||||
ipaddress.IPv6Address(address)
|
ipaddress.IPv6Address(address)
|
||||||
|
|||||||
@ -30,8 +30,8 @@ from vllm.distributed.device_communicators.shm_broadcast import (Handle,
|
|||||||
from vllm.executor.multiproc_worker_utils import (
|
from vllm.executor.multiproc_worker_utils import (
|
||||||
_add_prefix, set_multiprocessing_worker_envs)
|
_add_prefix, set_multiprocessing_worker_envs)
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.utils import (get_distributed_init_method, get_mp_context,
|
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
|
||||||
get_open_port)
|
get_mp_context, get_open_port)
|
||||||
from vllm.v1.executor.abstract import Executor, FailureCallback
|
from vllm.v1.executor.abstract import Executor, FailureCallback
|
||||||
from vllm.v1.outputs import ModelRunnerOutput
|
from vllm.v1.outputs import ModelRunnerOutput
|
||||||
from vllm.worker.worker_base import WorkerWrapperBase
|
from vllm.worker.worker_base import WorkerWrapperBase
|
||||||
@ -63,9 +63,9 @@ class MultiprocExecutor(Executor):
|
|||||||
|
|
||||||
# Multiprocessing-based executor does not support multi-node setting.
|
# Multiprocessing-based executor does not support multi-node setting.
|
||||||
# Since it only works for single node, we can use the loopback address
|
# Since it only works for single node, we can use the loopback address
|
||||||
# 127.0.0.1 for communication.
|
# returned by get_loopback_ip() for communication.
|
||||||
distributed_init_method = get_distributed_init_method(
|
distributed_init_method = get_distributed_init_method(
|
||||||
"127.0.0.1", get_open_port())
|
get_loopback_ip(), get_open_port())
|
||||||
|
|
||||||
# Initialize worker and set up message queues for SchedulerOutputs
|
# Initialize worker and set up message queues for SchedulerOutputs
|
||||||
# and ModelRunnerOutputs
|
# and ModelRunnerOutputs
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user