mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-03 18:44:30 +08:00
infer hybrid lb mode on secondary nodes
and update some comments

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
parent
36ed9f3467
commit
82f9292b84
@ -1013,6 +1013,7 @@ class EngineArgs:
|
|||||||
def create_engine_config(
|
def create_engine_config(
|
||||||
self,
|
self,
|
||||||
usage_context: Optional[UsageContext] = None,
|
usage_context: Optional[UsageContext] = None,
|
||||||
|
headless: bool = False,
|
||||||
) -> VllmConfig:
|
) -> VllmConfig:
|
||||||
"""
|
"""
|
||||||
Create the VllmConfig.
|
Create the VllmConfig.
|
||||||
@ -1101,6 +1102,10 @@ class EngineArgs:
|
|||||||
# but we should not do this here.
|
# but we should not do this here.
|
||||||
placement_group = ray.util.get_current_placement_group()
|
placement_group = ray.util.get_current_placement_group()
|
||||||
|
|
||||||
|
assert not headless or not self.data_parallel_hybrid_lb, (
|
||||||
|
"data_parallel_hybrid_lb is not applicable in "
|
||||||
|
"headless mode")
|
||||||
|
|
||||||
data_parallel_external_lb = self.data_parallel_rank is not None
|
data_parallel_external_lb = self.data_parallel_rank is not None
|
||||||
# Local DP size = 1, use pure-external LB.
|
# Local DP size = 1, use pure-external LB.
|
||||||
if data_parallel_external_lb:
|
if data_parallel_external_lb:
|
||||||
@ -1110,24 +1115,25 @@ class EngineArgs:
|
|||||||
data_parallel_size_local = 1
|
data_parallel_size_local = 1
|
||||||
# Use full external lb if we have local_size of 1.
|
# Use full external lb if we have local_size of 1.
|
||||||
self.data_parallel_hybrid_lb = False
|
self.data_parallel_hybrid_lb = False
|
||||||
# Local DP rank > 1, use hybrid LB.
|
elif self.data_parallel_size_local is not None and (
|
||||||
elif self.data_parallel_hybrid_lb:
|
self.data_parallel_size_local != self.data_parallel_size):
|
||||||
assert self.data_parallel_start_rank is not None, (
|
data_parallel_size_local = self.data_parallel_size_local
|
||||||
"data_parallel_start_rank must be set to use "
|
|
||||||
"data_parallel_hybrid_lb.")
|
if self.data_parallel_start_rank and not headless:
|
||||||
assert self.data_parallel_size_local is not None, (
|
# Infer hybrid LB mode.
|
||||||
"data_parallel_size_local must be set to use "
|
self.data_parallel_hybrid_lb = True
|
||||||
"data_parallel_hybrid_lb.")
|
|
||||||
# Use full external lb if we have local_size of 1.
|
if self.data_parallel_hybrid_lb and data_parallel_size_local == 1:
|
||||||
if self.data_parallel_size_local == 1:
|
# Use full external lb if we have local_size of 1.
|
||||||
data_parallel_external_lb = True
|
data_parallel_external_lb = True
|
||||||
self.data_parallel_hybrid_lb = False
|
self.data_parallel_hybrid_lb = False
|
||||||
data_parallel_size_local = self.data_parallel_size_local
|
|
||||||
self.data_parallel_rank = self.data_parallel_start_rank
|
self.data_parallel_rank = self.data_parallel_start_rank or 0
|
||||||
elif self.data_parallel_size_local is not None:
|
|
||||||
data_parallel_size_local = self.data_parallel_size_local
|
|
||||||
self.data_parallel_rank = self.data_parallel_start_rank
|
|
||||||
else:
|
else:
|
||||||
|
assert self.data_parallel_hybrid_lb is None, (
|
||||||
|
"data_parallel_size_local must be set to use "
|
||||||
|
"data_parallel_hybrid_lb.")
|
||||||
|
|
||||||
# Local DP size defaults to global DP size if not set.
|
# Local DP size defaults to global DP size if not set.
|
||||||
data_parallel_size_local = self.data_parallel_size
|
data_parallel_size_local = self.data_parallel_size
|
||||||
|
|
||||||
|
|||||||
@ -81,7 +81,8 @@ def run_headless(args: argparse.Namespace):
|
|||||||
# Create the EngineConfig.
|
# Create the EngineConfig.
|
||||||
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
engine_args = vllm.AsyncEngineArgs.from_cli_args(args)
|
||||||
usage_context = UsageContext.OPENAI_API_SERVER
|
usage_context = UsageContext.OPENAI_API_SERVER
|
||||||
vllm_config = engine_args.create_engine_config(usage_context=usage_context)
|
vllm_config = engine_args.create_engine_config(usage_context=usage_context,
|
||||||
|
headless=True)
|
||||||
|
|
||||||
if not envs.VLLM_USE_V1:
|
if not envs.VLLM_USE_V1:
|
||||||
raise ValueError("Headless mode is only supported for V1")
|
raise ValueError("Headless mode is only supported for V1")
|
||||||
|
|||||||
@ -467,13 +467,14 @@ class EngineCoreProc(EngineCore):
|
|||||||
For DP>1 with internal loadbalancing this is with the shared front-end
|
For DP>1 with internal loadbalancing this is with the shared front-end
|
||||||
process which may reside on a different node.
|
process which may reside on a different node.
|
||||||
|
|
||||||
For DP>1 with external loadbalancing, two handshakes are performed:
|
For DP>1 with external or hybrid loadbalancing, two handshakes are
|
||||||
|
performed:
|
||||||
- With the rank 0 front-end process which retrieves the
|
- With the rank 0 front-end process which retrieves the
|
||||||
DP Coordinator ZMQ addresses and DP process group address.
|
DP Coordinator ZMQ addresses and DP process group address.
|
||||||
- With the colocated front-end process which retrieves the
|
- With the colocated front-end process which retrieves the
|
||||||
client input/output socket addresses.
|
client input/output socket addresses.
|
||||||
with the exception of the rank 0 engine itself which doesn't require
|
with the exception of the rank 0 and colocated engines themselves which
|
||||||
the second handshake.
|
don't require the second handshake.
|
||||||
|
|
||||||
Here, "front-end" process can mean the process containing the engine
|
Here, "front-end" process can mean the process containing the engine
|
||||||
core client (which is the API server process in the case the API
|
core client (which is the API server process in the case the API
|
||||||
@ -489,8 +490,9 @@ class EngineCoreProc(EngineCore):
|
|||||||
with handshake as addresses:
|
with handshake as addresses:
|
||||||
yield addresses
|
yield addresses
|
||||||
else:
|
else:
|
||||||
|
assert local_client
|
||||||
local_handshake = self._perform_handshake(
|
local_handshake = self._perform_handshake(
|
||||||
input_ctx, client_handshake_address, identity, local_client,
|
input_ctx, client_handshake_address, identity, True,
|
||||||
vllm_config)
|
vllm_config)
|
||||||
with handshake as addresses, local_handshake as client_addresses:
|
with handshake as addresses, local_handshake as client_addresses:
|
||||||
addresses.inputs = client_addresses.inputs
|
addresses.inputs = client_addresses.inputs
|
||||||
|
|||||||
@ -605,6 +605,8 @@ def launch_core_engines(
|
|||||||
elif dp_rank == 0:
|
elif dp_rank == 0:
|
||||||
# Rank 0 holds Coordinator, so it handshakes with all Cores
|
# Rank 0 holds Coordinator, so it handshakes with all Cores
|
||||||
# in both external dplb and internal dplb mode.
|
# in both external dplb and internal dplb mode.
|
||||||
|
# Note this also covers the case where we have zero local engines
|
||||||
|
# and rank 0 is headless.
|
||||||
engines_to_handshake = [
|
engines_to_handshake = [
|
||||||
CoreEngine(index=i, local=(i < local_engine_count))
|
CoreEngine(index=i, local=(i < local_engine_count))
|
||||||
for i in range(dp_size)
|
for i in range(dp_size)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user