[DP][ray] Support different VLLM_RAY_DP_PACK_STRATEGY (#23849)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
This commit is contained in:
Rui Qiao 2025-10-09 19:53:43 -07:00 committed by GitHub
parent c6187f55f7
commit 757fa4a4da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 86 additions and 33 deletions

View File

@ -133,6 +133,7 @@ if TYPE_CHECKING:
VLLM_DP_MASTER_PORT: int = 0 VLLM_DP_MASTER_PORT: int = 0
VLLM_MOE_DP_CHUNK_SIZE: int = 256 VLLM_MOE_DP_CHUNK_SIZE: int = 256
VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False VLLM_RANDOMIZE_DP_DUMMY_INPUTS: bool = False
VLLM_RAY_DP_PACK_STRATEGY: str = "strict"
VLLM_MARLIN_USE_ATOMIC_ADD: bool = False VLLM_MARLIN_USE_ATOMIC_ADD: bool = False
VLLM_MXFP4_USE_MARLIN: Optional[bool] = None VLLM_MXFP4_USE_MARLIN: Optional[bool] = None
VLLM_V0_USE_OUTLINES_CACHE: bool = False VLLM_V0_USE_OUTLINES_CACHE: bool = False
@ -1000,6 +1001,17 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0" "VLLM_RANDOMIZE_DP_DUMMY_INPUTS", "0"
) )
== "1", == "1",
# Strategy to pack the data parallel ranks for Ray.
# Available options:
# - "fill":
#   on the DP master node, allocate exactly data-parallel-size-local DP ranks;
#   on non-master nodes, allocate as many DP ranks as can fit.
# - "strict":
#   allocate exactly data-parallel-size-local DP ranks on each picked node.
# This environment variable is ignored if data-parallel-backend is not Ray.
"VLLM_RAY_DP_PACK_STRATEGY": lambda: os.getenv(
"VLLM_RAY_DP_PACK_STRATEGY", "strict"
),
# Whether to use S3 path for model loading in CI via RunAI Streamer # Whether to use S3 path for model loading in CI via RunAI Streamer
"VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1", "VLLM_CI_USE_S3": lambda: os.environ.get("VLLM_CI_USE_S3", "0") == "1",
# Use model_redirect to redirect the model name to a local folder. # Use model_redirect to redirect the model name to a local folder.

View File

@ -15,6 +15,7 @@ from unittest.mock import patch
import msgspec import msgspec
import zmq import zmq
from vllm import envs
from vllm.config import CacheConfig, ParallelConfig, VllmConfig from vllm.config import CacheConfig, ParallelConfig, VllmConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.platforms import current_platform from vllm.platforms import current_platform
@ -337,8 +338,8 @@ class CoreEngineActorManager:
logger.info("Creating placement groups for data parallel") logger.info("Creating placement groups for data parallel")
dp_master_ip = vllm_config.parallel_config.data_parallel_master_ip dp_master_ip = vllm_config.parallel_config.data_parallel_master_ip
num_pg_to_create = vllm_config.parallel_config.data_parallel_size dp_size = vllm_config.parallel_config.data_parallel_size
local_engine_count = vllm_config.parallel_config.data_parallel_size_local dp_size_local = vllm_config.parallel_config.data_parallel_size_local
available_resources = available_resources_per_node() available_resources = available_resources_per_node()
world_size = vllm_config.parallel_config.world_size world_size = vllm_config.parallel_config.world_size
@ -354,44 +355,84 @@ class CoreEngineActorManager:
dp_master_ip, dp_master_ip,
) )
device_str = current_platform.ray_device_key device_str = current_platform.ray_device_key
if envs.VLLM_RAY_DP_PACK_STRATEGY == "fill" and (
envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
or envs.VLLM_ALL2ALL_BACKEND == "deepep_low_latency"
):
raise ValueError(
"DeepEP kernels require EP ranks [0,7] (same for [8,15], ...) "
"to be on the same node, but VLLM_RAY_DP_PACK_STRATEGY=fill "
"does not guarantee that. "
"Please use VLLM_RAY_DP_PACK_STRATEGY=strict instead."
)
logger.info(
"Using '%s' DP packing strategy based on VLLM_RAY_DP_PACK_STRATEGY",
envs.VLLM_RAY_DP_PACK_STRATEGY,
)
strict_local_size = envs.VLLM_RAY_DP_PACK_STRATEGY == "strict"
for node_resources in nodes: for node_resources in nodes:
if device_str not in node_resources: node_ip_keys = [
continue key
for key in node_resources
if key != "node:__internal_head__" and key.startswith("node:")
]
assert len(node_ip_keys) == 1, (
"Zero or multiple node IP keys found in node resources: %s",
node_ip_keys,
)
node_ip_key = node_ip_keys[0]
node_ip = node_ip_key.split(":")[1]
# For now, each DP rank can only be assigned to one node # For now, each DP rank can only be assigned to one node
# TODO(rui): support allocating a single DP rank # TODO(rui): support allocating a single DP rank
# to multiple nodes # to multiple nodes
available_engine_count = int(node_resources[device_str]) // world_size dp_size_available = (
if dp_master_ip_key in node_resources: int(node_resources[device_str]) // world_size
assert available_engine_count >= local_engine_count, ( if device_str in node_resources
"Not enough resources to allocate DP ranks " else 0
f"on DP master node {dp_master_ip}" )
)
for i in range(local_engine_count): if node_ip == dp_master_ip:
bundles = [ if dp_size_available < dp_size_local:
{device_str: 1.0, "node:" + dp_master_ip: 0.001} raise ValueError(
] * world_size + [{"CPU": 1.0}] "Not enough resources to allocate %s DP ranks "
pg = ray.util.placement_group( "on DP master node %s, possible to fit %s DP ranks",
name=f"dp_rank_{len(placement_groups)}", dp_size_local,
strategy="STRICT_PACK", dp_master_ip,
bundles=bundles, dp_size_available,
) )
placement_groups.append(pg) dp_size_to_allocate = dp_size_local
local_dp_ranks.append(i) elif strict_local_size:
if dp_size_available < dp_size_local:
logger.info(
"Skipping node %s as %s DP ranks could not fit, "
"possible to fit %s DP ranks",
node_ip,
dp_size_local,
dp_size_available,
)
continue
dp_size_to_allocate = dp_size_local
else: else:
for i in range(available_engine_count): dp_size_to_allocate = dp_size_available
if len(placement_groups) == num_pg_to_create:
break for i in range(dp_size_to_allocate):
bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}] bundles = [{device_str: 1.0, "node:" + node_ip: 0.001}] * world_size + [
pg = ray.util.placement_group( {"CPU": 1.0}
name=f"dp_rank_{len(placement_groups)}", ]
strategy="STRICT_PACK", pg = ray.util.placement_group(
bundles=bundles, name=f"dp_rank_{len(placement_groups)}",
) strategy="STRICT_PACK",
placement_groups.append(pg) bundles=bundles,
local_dp_ranks.append(i) )
if len(placement_groups) < num_pg_to_create: placement_groups.append(pg)
local_dp_ranks.append(i)
if len(placement_groups) < dp_size:
raise ValueError( raise ValueError(
f"Not enough resources to allocate {num_pg_to_create} " f"Not enough resources to allocate {dp_size} "
"placement groups, only created " "placement groups, only created "
f"{len(placement_groups)} placement groups. " f"{len(placement_groups)} placement groups. "
"Available resources: " "Available resources: "