[DP] Create placement groups by ray_device_key (#25026)

Signed-off-by: Xinyu Chen <xinyu1.chen@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
Xinyu Chen 2025-09-17 16:57:25 +08:00 committed by GitHub
parent 0fb2551c23
commit bb58dc8c20
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -334,20 +334,22 @@ class CoreEngineActorManager:
"No nodes with resources found in Ray cluster.") "No nodes with resources found in Ray cluster.")
assert dp_master_ip_key in nodes[0], ( assert dp_master_ip_key in nodes[0], (
"The DP master node (ip: %s) is missing or dead", dp_master_ip) "The DP master node (ip: %s) is missing or dead", dp_master_ip)
device_str = current_platform.ray_device_key
for node_resources in nodes: for node_resources in nodes:
if "GPU" not in node_resources: if device_str not in node_resources:
continue continue
# For now, each DP rank can only be assigned to one node # For now, each DP rank can only be assigned to one node
# TODO(rui): support allocating a single DP rank # TODO(rui): support allocating a single DP rank
# to multiple nodes # to multiple nodes
available_engine_count = int(node_resources["GPU"]) // world_size available_engine_count = int(
node_resources[device_str]) // world_size
if dp_master_ip_key in node_resources: if dp_master_ip_key in node_resources:
assert available_engine_count >= local_engine_count, ( assert available_engine_count >= local_engine_count, (
"Not enough resources to allocate DP ranks " "Not enough resources to allocate DP ranks "
f"on DP master node {dp_master_ip}") f"on DP master node {dp_master_ip}")
for i in range(local_engine_count): for i in range(local_engine_count):
bundles = [{ bundles = [{
"GPU": 1.0, device_str: 1.0,
"node:" + dp_master_ip: 0.001 "node:" + dp_master_ip: 0.001
}] * world_size + [{ }] * world_size + [{
"CPU": 1.0 "CPU": 1.0
@ -363,7 +365,7 @@ class CoreEngineActorManager:
for i in range(available_engine_count): for i in range(available_engine_count):
if len(placement_groups) == num_pg_to_create: if len(placement_groups) == num_pg_to_create:
break break
bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
pg = ray.util.placement_group( pg = ray.util.placement_group(
name=f"dp_rank_{len(placement_groups)}", name=f"dp_rank_{len(placement_groups)}",
strategy="STRICT_PACK", strategy="STRICT_PACK",
@ -415,17 +417,18 @@ class CoreEngineActorManager:
local_dp_ranks = [] local_dp_ranks = []
num_pg_created = 0 num_pg_created = 0
device_str = current_platform.ray_device_key
for node in nodes: for node in nodes:
if num_pg_created >= num_pg_to_create: if num_pg_created >= num_pg_to_create:
break break
node_ip = node.node_ip node_ip = node.node_ip
node_id = node.node_id node_id = node.node_id
available_gpus = int(available_resources[node_id]["GPU"]) available_gpus = int(available_resources[node_id][device_str])
# Get total GPUs on this node from the node's resources # Get total GPUs on this node from the node's resources
# Ray stores node resources with node ID as key # Ray stores node resources with node ID as key
total_gpus = int(total_resources[node_id]["GPU"]) total_gpus = int(total_resources[node_id][device_str])
# Calculate used GPUs and used engines on this node # Calculate used GPUs and used engines on this node
used_gpus = max(0, total_gpus - available_gpus) used_gpus = max(0, total_gpus - available_gpus)
@ -444,13 +447,13 @@ class CoreEngineActorManager:
# Create bundles with node constraint for master node # Create bundles with node constraint for master node
if node_ip == dp_master_ip: if node_ip == dp_master_ip:
bundles = [{ bundles = [{
"GPU": 1.0, device_str: 1.0,
"node:" + dp_master_ip: 0.001 "node:" + dp_master_ip: 0.001
}] * world_size + [{ }] * world_size + [{
"CPU": 1.0 "CPU": 1.0
}] }]
else: else:
bundles = [{"GPU": 1.0}] * world_size + [{"CPU": 1.0}] bundles = [{device_str: 1.0}] * world_size + [{"CPU": 1.0}]
pg = ray.util.placement_group( pg = ray.util.placement_group(
name=f"dp_rank_{rank}", name=f"dp_rank_{rank}",