mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-27 02:21:48 +08:00
[misc][distributed] use 127.0.0.1 for single-node (#5619)
This commit is contained in:
parent
f758aed0e8
commit
3eea74889f
@@ -10,7 +10,7 @@ from vllm.executor.multiproc_worker_utils import (ProcessWorkerWrapper,
|
||||
from vllm.logger import init_logger
|
||||
from vllm.sequence import ExecuteModelRequest, SamplerOutput
|
||||
from vllm.utils import (cuda_device_count_stateless,
|
||||
get_distributed_init_method, get_ip, get_open_port,
|
||||
get_distributed_init_method, get_open_port,
|
||||
get_vllm_instance_id, make_async)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
@@ -37,8 +37,11 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
|
||||
assert world_size <= cuda_device_count_stateless(), (
|
||||
"please set tensor_parallel_size to less than max local gpu count")
|
||||
|
||||
# Multiprocessing-based executor does not support multi-node setting.
|
||||
# Since it only works for single node, we can use the loopback address
|
||||
# 127.0.0.1 for communication.
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
get_ip(), get_open_port())
|
||||
"127.0.0.1", get_open_port())
|
||||
|
||||
if world_size == 1:
|
||||
self.workers = []
|
||||
|
||||
@@ -161,6 +161,16 @@ class RayGPUExecutor(DistributedGPUExecutor):
|
||||
self._run_workers("update_environment_variables",
|
||||
all_args=all_args_to_update_environment_variables)
|
||||
|
||||
if len(node_gpus) == 1:
|
||||
# in single node case, we don't need to get the IP address.
|
||||
# the loopback address is sufficient
|
||||
# NOTE: a node may have several IP addresses, one for each
|
||||
# network interface. `get_ip()` might return any of them,
|
||||
# while they might not work for communication inside the node
|
||||
# if the network setup is complicated. Using the loopback address
|
||||
# solves this issue, as it always works for communication inside
|
||||
# the node.
|
||||
driver_ip = "127.0.0.1"
|
||||
distributed_init_method = get_distributed_init_method(
|
||||
driver_ip, get_open_port())
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user