[EPLB] Optimize EPLB for Async Rearrange Experts (#22179)

Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: SunChenxiang123 <1291824390@qq.com>
WeiQing Chen 2025-11-24 22:08:29 +08:00 committed by GitHub
parent 4de87866a8
commit 2601f18a82
7 changed files with 779 additions and 78 deletions


@@ -1,13 +1,18 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import random
import pytest
import torch
import torch.distributed
from vllm.distributed.eplb.rebalance_execute import (
move_from_buffer,
rearrange_expert_weights_inplace,
transfer_layer,
)
from vllm.distributed.parallel_state import (
ensure_model_parallel_initialized,
get_tp_group,
@@ -231,6 +236,100 @@ def verify_redundant_experts_have_same_weights(
)
def _test_async_transfer_layer_without_mtp_worker(
env,
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
) -> None:
set_env_vars_and_device(env)
ensure_model_parallel_initialized(
tensor_model_parallel_size=world_size, pipeline_model_parallel_size=1
)
tp_group = get_tp_group()
ep_group = tp_group.device_group
ep_rank = torch.distributed.get_rank()
device = torch.device(f"cuda:{ep_rank}")
total_physical_experts = world_size * num_local_experts
hidden_sizes = [16, 32]
redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
old_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
redundancy_config,
)
new_redundancy_config = create_redundancy_config(
num_logical_experts,
total_physical_experts,
)
new_indices = create_expert_indices_with_redundancy(
num_layers,
num_logical_experts,
total_physical_experts,
new_redundancy_config,
)
expert_weights = create_expert_weights(
num_layers,
num_local_experts,
hidden_sizes,
ep_rank,
device,
old_indices,
)
expert_buffer = [torch.empty_like(w) for w in expert_weights[0]]
cuda_stream = torch.cuda.Stream(device=device)
for layer_idx in range(num_layers):
is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
transfer_layer(
old_global_expert_indices=old_indices,
new_global_expert_indices=new_indices,
expert_weights=expert_weights,
expert_weights_buffer=expert_buffer,
ep_group=ep_group,
layer=layer_idx,
cuda_stream=cuda_stream,
)
)
cuda_stream.synchronize()
move_from_buffer(
expert_weights=expert_weights[layer_idx],
expert_weights_buffer=expert_buffer,
is_unchanged=is_unchanged,
is_received_locally=is_received_locally,
experts_recv_loc=experts_recv_loc,
new_indices=new_indices[layer_idx].tolist(),
ep_group=ep_group,
)
verify_expert_weights_after_shuffle(
expert_weights,
new_indices,
hidden_sizes,
ep_rank,
num_local_experts,
)
verify_redundant_experts_have_same_weights(
expert_weights,
new_indices,
hidden_sizes,
world_size,
num_local_experts,
)
def _test_rearrange_expert_weights_with_redundancy(
env, world_size, num_layers, num_local_experts, num_logical_experts
) -> None:
@@ -399,6 +498,32 @@ def _test_rearrange_expert_weights_no_change(env, world_size) -> None:
)
@pytest.mark.parametrize(
"world_size,num_layers,num_local_experts,num_logical_experts",
[
(2, 2, 2, 3),
],
)
def test_async_transfer_layer_without_mtp(
world_size: int,
num_layers: int,
num_local_experts: int,
num_logical_experts: int,
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_async_transfer_layer_without_mtp_worker,
world_size,
num_layers,
num_local_experts,
num_logical_experts,
)
@pytest.mark.parametrize("world_size", [2, 4])
def test_rearrange_expert_weights_no_change(world_size):
"""


@@ -10,10 +10,11 @@ from tests.utils import large_gpu_mark
def get_model_args(
model_name: str,
spec_model_name: str | None,
spec_method: str,
tp_size: int,
model_max_len: int,
use_async: bool = False,
) -> dict:
speculative_config = {
"method": spec_method,
@@ -37,6 +38,8 @@ def get_model_args(
"enable_eplb": True,
"max_model_len": model_max_len,
}
if use_async:
model_args["eplb_config"] = {"use_async": True}
return model_args
@@ -94,3 +97,37 @@ def test_eplb_spec_decode(
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"
@large_gpu_mark(min_gb=80)
def test_eplb_spec_decode_qwen3_next_mtp_async() -> None:
"""
Ensure async EPLB works with MTP speculative decoding for Qwen3-Next.
"""
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
expected_gsm8k_value = 0.86
model_args = get_model_args(
model_name="Qwen/Qwen3-Next-80B-A3B-Instruct",
spec_model_name=None,
spec_method="mtp",
tp_size=4,
model_max_len=4096,
use_async=True,
)
results = lm_eval.simple_evaluate(
model="vllm",
model_args=model_args,
tasks=TASK,
batch_size=64,
num_fewshot=8,
)
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < expected_gsm8k_value
and measured_value + RTOL > expected_gsm8k_value
), f"Expected: {expected_gsm8k_value} | Measured: {measured_value}"


@@ -60,6 +60,10 @@ class EPLBConfig:
Log the balancedness each step of expert parallelism.
This is turned off by default since it will cause communication overhead.
"""
use_async: bool = False
"""
Whether to use non-blocking EPLB.
"""
@config
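For readers wiring this up, the flag is consumed through `eplb_config`, exactly as the spec-decode test above does. A minimal sketch of the engine arguments; only `enable_eplb` and `eplb_config` are taken from this diff, while the model name and parallel size are illustrative:

# Hedged sketch: enable non-blocking EPLB via eplb_config, mirroring the test above.
model_args = {
    "model": "Qwen/Qwen3-Next-80B-A3B-Instruct",  # illustrative model
    "tensor_parallel_size": 4,                    # illustrative parallel size
    "enable_eplb": True,
    "eplb_config": {"use_async": True},           # new flag added in this commit
}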


@@ -0,0 +1,115 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
The async worker that transfers experts in the background.
"""
import asyncio
import threading
from typing import TYPE_CHECKING
import torch
from torch.distributed import ProcessGroup
from vllm.distributed.parallel_state import get_ep_group
from vllm.logger import init_logger
from .rebalance_execute import transfer_layer
if TYPE_CHECKING:
from .eplb_state import EplbState
logger = init_logger(__name__)
def start_async_worker(
state: "EplbState",
rank_mapping: dict[int, int] | None = None,
is_profile: bool = False,
) -> threading.Thread:
ep_group = get_ep_group().device_group
rank = ep_group.rank()
device_index = state.cuda_device_index
def thread_target() -> None:
assert device_index is not None
torch.cuda.set_device(device_index)
cuda_stream = torch.cuda.Stream(device=device_index)
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(
transfer_run_periodically(
state=state,
ep_group=ep_group,
is_profile=is_profile,
rank_mapping=rank_mapping,
cuda_stream=cuda_stream,
)
)
except Exception as exc: # pragma: no cover - diagnostic path
logger.exception("async loop error (Rank %d): %s", rank, str(exc))
finally:
loop.close()
thread = threading.Thread(target=thread_target, daemon=True)
thread.start()
return thread
async def transfer_run_periodically(
state: "EplbState",
ep_group: ProcessGroup,
is_profile: bool = False,
rank_mapping: dict[int, int] | None = None,
cuda_stream: torch.cuda.Stream | None = None,
) -> None:
while True:
await asyncio.to_thread(state.rearrange_event.wait)
logger.info("async worker woke up for EPLB transfer")
for model_state in state.model_states.values():
if not model_state.is_async_enabled:
continue
current_num_layers = model_state.model.num_moe_layers
while (
model_state.rebalanced
and model_state.layer_to_transfer < current_num_layers
):
if (
not model_state.ep_buffer_ready
and model_state.rebalanced
and model_state.new_physical_to_logical_map is not None
):
await asyncio.to_thread(model_state.buffer_lock.acquire)
try:
if model_state.layer_to_transfer >= current_num_layers:
break
(
model_state.is_unchanged,
model_state.is_received_locally,
model_state.experts_recv_loc,
) = await transfer_layer(
old_global_expert_indices=model_state.physical_to_logical_map,
new_global_expert_indices=model_state.new_physical_to_logical_map,
expert_weights=model_state.model.expert_weights,
expert_weights_buffer=model_state.expert_buffer,
ep_group=ep_group,
is_profile=is_profile,
layer=model_state.layer_to_transfer,
cuda_stream=cuda_stream,
rank_mapping=rank_mapping,
)
event = torch.cuda.Event(blocking=False)
cuda_stream.record_event(event)
model_state.buffer_ready_event = event
model_state.ep_buffer_ready = 1
finally:
model_state.buffer_lock.release()
else:
if not model_state.rebalanced:
break
await asyncio.sleep(0.001)
state.rearrange_event.clear()
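In plain terms, the handshake is: the main thread computes a new placement and sets `rearrange_event`; this worker stages one layer of weights into the shared buffer on its own CUDA stream, records a CUDA event, and raises `ep_buffer_ready`; the main thread later waits on that event and drains the buffer in `move_to_workspace` before the next layer is staged. Below is a stripped-down, CPU-only sketch of that producer/consumer pattern with hypothetical stand-in names (no CUDA streams or collectives):

import threading
import time

class TinyLayerPipeline:
    """Toy stand-in for the EPLB async handshake (illustrative only)."""

    def __init__(self, num_layers: int) -> None:
        self.num_layers = num_layers
        self.rearrange_event = threading.Event()  # main -> worker: a new plan exists
        self.buffer_lock = threading.Lock()       # protects the staged "buffer"
        self.staged_layer: int | None = None      # stands in for expert_buffer + ep_buffer_ready
        self.next_layer = 0                       # stands in for layer_to_transfer

    def worker(self) -> None:
        # Background thread: stage one layer at a time once signaled.
        self.rearrange_event.wait()
        while True:
            with self.buffer_lock:
                if self.next_layer >= self.num_layers:
                    return                               # all layers transferred
                if self.staged_layer is None:
                    self.staged_layer = self.next_layer  # "fill the buffer"
            time.sleep(0.001)                            # mirror the 1 ms poll in the real worker

    def consume_one(self) -> bool:
        # Main thread, once per step: drain the buffer if the worker filled it.
        with self.buffer_lock:
            if self.staged_layer is None:
                return False
            self.staged_layer = None                     # "move_from_buffer"
            self.next_layer += 1                         # advance layer_to_transfer
            return True

pipe = TinyLayerPipeline(num_layers=3)
threading.Thread(target=pipe.worker, daemon=True).start()
pipe.rearrange_event.set()                               # like rearrange() signaling the worker
while pipe.next_layer < pipe.num_layers:
    pipe.consume_one()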


@@ -26,6 +26,7 @@ MoE layer. If we have 32 EP ranks, then each GPU will hold 288 / 32 = 9 local
physical experts.
"""
import threading
import time
from collections.abc import Sequence
from dataclasses import dataclass
@@ -43,8 +44,9 @@ from vllm.distributed.utils import StatelessProcessGroup
from vllm.logger import init_logger
from vllm.model_executor.models.interfaces import MixtureOfExperts
from .async_worker import start_async_worker
from .rebalance_algo import rebalance_experts
from .rebalance_execute import move_from_buffer, rearrange_expert_weights_inplace
logger = init_logger(__name__)
@@ -132,6 +134,74 @@ class EplbModelState:
"""
model_name: str
model: MixtureOfExperts
expert_buffer: list[torch.Tensor]
"""
The buffer to store the expert weights during transfer.
"""
buffer_lock: threading.Lock
"""
The lock to protect the expert buffer.
"""
buffer_ready_event: torch.cuda.Event | None
"""
CUDA event recorded when the async worker finishes filling the buffer.
The main thread waits on this before consuming the buffer.
"""
ep_buffer_ready: int
"""
Flag (0 or 1) indicating whether the expert buffer has been filled and is
ready to be consumed by the main thread.
"""
layer_to_transfer: int
"""
The layer index to transfer in async mode.
"""
rebalanced: bool
"""
Flag indicating whether a new expert rebalance has been computed and is
waiting to be applied.
"""
pending_global_ready_check: bool
"""
Whether the async EPLB needs to poll peers for buffer readiness.
"""
is_unchanged: list[bool]
"""
Intermediate variable between `move_to_buffer` and `move_to_workspace`.
Its length equals the number of local physical experts in the current layer.
"""
is_received_locally: list[bool]
"""
Intermediate variable between `move_to_buffer` and `move_to_workspace`.
Its length equals the number of local physical experts in the current layer.
"""
experts_recv_loc: dict[int, int]
"""
Intermediate variable between `move_to_buffer` and `move_to_workspace`.
Maps logical expert IDs to the buffer slot where their weights were received.
"""
is_async_enabled: bool
"""
The flag indicates whether the EPLB is running in async mode.
"""
cuda_device_index: int | None
"""
CUDA device index for the async EPLB worker thread.
"""
new_physical_to_logical_map: torch.Tensor | None = None
"""
Pending replacement for `physical_to_logical_map`, staged between
`move_to_buffer` and `move_to_workspace`.
"""
new_logical_to_physical_map: torch.Tensor | None = None
"""
Pending replacement for `logical_to_physical_map`, staged between
`move_to_buffer` and `move_to_workspace`.
"""
new_logical_replica_count: torch.Tensor | None = None
"""
Pending replacement for `logical_replica_count`, staged between
`move_to_buffer` and `move_to_workspace`.
"""
class EplbState:
@@ -164,12 +234,31 @@ class EplbState:
Otherwise, the rearrangement will hang at collective
communication calls.
"""
self.expert_rearrangement_step_interval: int = 0
"""
Interval for expert rearrangement steps.
This is a constant and is taken from the config.
"""
self.is_async: bool = False
"""
The flag indicates whether the EPLB is running in async mode.
"""
self.rearrange_event = threading.Event()
"""
Event to signal when a new rearrangement is needed for the async thread.
"""
self.async_worker: threading.Thread | None = None
"""
Background thread handling async transfers.
"""
self.cuda_device_index: int | None = None
"""
CUDA device index for the async EPLB worker thread.
"""
if self.device.type == "cuda":
self.cuda_device_index = self.device.index
if self.cuda_device_index is None and torch.cuda.is_available():
self.cuda_device_index = torch.cuda.current_device()
@staticmethod
def build_initial_global_physical_to_logical_map(
@@ -239,6 +328,8 @@ class EplbState:
Build the initial EPLB state.
"""
self.validate_ep_configuration(model)
self.is_async = self.parallel_config.eplb_config.use_async
physical_to_logical_map_list = (
EplbState.build_initial_global_physical_to_logical_map(
model.num_routed_experts,
@@ -368,7 +459,12 @@ class EplbState:
physical_to_logical_map = new_physical_to_logical_map.to(self.device)
logical_to_physical_map.copy_(new_logical_to_physical_map)
logical_replica_count.copy_(new_logical_replica_count)
else:
new_physical_to_logical_map = None
new_logical_to_physical_map = None
new_logical_replica_count = None
model.set_eplb_state(
expert_load_pass,
logical_to_physical_map,
@@ -385,15 +481,33 @@ class EplbState:
)
self.expert_rearrangement_step = 0
expert_buffer = [torch.empty_like(w) for w in model.expert_weights[0]]
model_state = EplbModelState(
physical_to_logical_map=physical_to_logical_map,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
expert_load_pass=expert_load_pass,
expert_load_window=expert_load_window,
model_name=model_config.model,
model=model,
expert_buffer=expert_buffer,
buffer_lock=threading.Lock(),
buffer_ready_event=None,
ep_buffer_ready=0,
layer_to_transfer=0,
rebalanced=False,
pending_global_ready_check=False,
is_unchanged=[],
is_received_locally=[],
experts_recv_loc={},
is_async_enabled=self.is_async,
cuda_device_index=self.cuda_device_index,
new_physical_to_logical_map=new_physical_to_logical_map,
new_logical_to_physical_map=new_logical_to_physical_map,
new_logical_replica_count=new_logical_replica_count,
)
self.model_states[model_config.compute_hash()] = model_state
def step(
self,
@@ -420,7 +534,7 @@ class EplbState:
- `max_tokens`: The maximum load across ranks.
- `balancedness`: The ratio of average load to maximum load.
"""
ep_group = get_ep_group().device_group
if is_profile:
self.rearrange(is_profile=True)
return
@@ -488,7 +602,49 @@ class EplbState:
# rearrangement step and perform rearrangement to ensure all ranks are
# performing collective communication.
self.expert_rearrangement_step += 1
if self.is_async:
for eplb_model_state in self.model_states.values():
if not eplb_model_state.is_async_enabled:
continue
all_ranks_buffer_ready = False
if eplb_model_state.pending_global_ready_check:
all_ranks_buffer_ready = self._all_ranks_buffer_ready(
eplb_model_state
)
if (
eplb_model_state.is_async_enabled
and eplb_model_state.ep_buffer_ready
and all_ranks_buffer_ready
):
self.move_to_workspace(
model_state=eplb_model_state,
ep_group=ep_group,
is_profile=is_profile,
)
if (
eplb_model_state.layer_to_transfer
>= eplb_model_state.model.num_moe_layers
):
self.post_eplb(eplb_model_state, is_profile)
eplb_model_state.rebalanced = False
eplb_model_state.layer_to_transfer = 0
eplb_model_state.pending_global_ready_check = False
logger.info(
"finish async transfer for model %s rank %d layer %d",
eplb_model_state.model_name,
ep_group.rank(),
eplb_model_state.model.num_moe_layers,
)
if self.expert_rearrangement_step >= self.expert_rearrangement_step_interval:
if any(
eplb_model_state.is_async_enabled and eplb_model_state.rebalanced
for eplb_model_state in self.model_states.values()
):
# Still performing asynchronous rearrangement
return
self.expert_rearrangement_step = 0
self.rearrange()
@@ -524,7 +680,11 @@ class EplbState:
if is_main_rank:
torch.cuda.synchronize()
time_start = time.perf_counter()
logger.info(
"Rearranging experts %s %s...",
"(async mode)" if self.is_async else "sync mode",
"(profile)" if is_profile else "",
)
if global_expert_loads is None:
# Map the physical expert load to global logical experts
@@ -593,6 +753,7 @@ class EplbState:
model = eplb_model_state.model
num_replicas = model.num_physical_experts
num_groups = model.num_expert_groups
if rank_mapping is not None and len(rank_mapping) == ep_group.size():
# NOTE(yongji): scale down, we need to rebalance the experts on
# remaining GPUs, transfer the experts while we haven't shutdown
@@ -608,7 +769,7 @@ class EplbState:
num_gpus = ep_group.size()
if num_gpus % num_nodes != 0:
num_nodes = 1
logger.warning_once(
f"num_gpus % num_nodes != 0, "
"not using hierarchical rearrangement algorithm.\n"
@@ -631,6 +792,7 @@ class EplbState:
num_gpus,
)
if not eplb_model_state.is_async_enabled or is_profile:
# Update expert weights
rearrange_expert_weights_inplace(
eplb_model_state.physical_to_logical_map,
@@ -672,8 +834,9 @@ class EplbState:
eplb_model_state.logical_to_physical_map.copy_(
new_logical_to_physical_map
)
eplb_model_state.logical_replica_count.copy_(
new_logical_replica_count
)
if is_main_rank:
assert time_start is not None
torch.cuda.synchronize()
@@ -683,8 +846,162 @@ class EplbState:
" (profile) " if is_profile else " ",
time_end - time_start,
)
else:
device = eplb_model_state.physical_to_logical_map.device
new_physical = new_physical_to_logical_map.to(device)
max_slots = eplb_model_state.logical_to_physical_map.shape[-1]
padded_logical = torch.nn.functional.pad(
new_logical_to_physical_map,
(0, max(0, max_slots - new_logical_to_physical_map.shape[-1])),
value=-1,
).to(eplb_model_state.logical_to_physical_map.device)
new_replica = new_logical_replica_count.to(
eplb_model_state.logical_replica_count.device
)
eplb_model_state.new_physical_to_logical_map = new_physical
eplb_model_state.new_logical_to_physical_map = padded_logical
eplb_model_state.new_logical_replica_count = new_replica
eplb_model_state.rebalanced = True
eplb_model_state.layer_to_transfer = 0
eplb_model_state.pending_global_ready_check = True
# Signal async thread to start transferring layers
if self.is_async and (not is_profile):
self.rearrange_event.set()
return None return None
def start_async_loop(
self,
rank_mapping: dict[int, int] | None = None,
is_profile: bool = False,
):
if not self.is_async:
return
if self.async_worker is None:
self.async_worker = start_async_worker(
self,
rank_mapping=rank_mapping,
is_profile=is_profile,
)
def _update_layer_mapping_from_new(
self, model_state: EplbModelState, layer: int
) -> None:
if (
model_state.new_physical_to_logical_map is None
or model_state.new_logical_to_physical_map is None
or model_state.new_logical_replica_count is None
):
return
target_device = model_state.physical_to_logical_map.device
new_physical = model_state.new_physical_to_logical_map
if model_state.physical_to_logical_map.shape[1] != new_physical.shape[1]:
model_state.physical_to_logical_map = new_physical.to(target_device)
else:
model_state.physical_to_logical_map[layer].copy_(
new_physical[layer].to(target_device)
)
logical_device = model_state.logical_to_physical_map.device
new_logical = model_state.new_logical_to_physical_map[layer].to(logical_device)
max_slots = model_state.logical_to_physical_map.shape[-1]
slot_delta = max_slots - new_logical.shape[-1]
if slot_delta > 0:
new_logical = torch.nn.functional.pad(
new_logical, (0, slot_delta), value=-1
)
model_state.logical_to_physical_map[layer].copy_(new_logical)
replica_device = model_state.logical_replica_count.device
model_state.logical_replica_count[layer].copy_(
model_state.new_logical_replica_count[layer].to(replica_device)
)
def _all_ranks_buffer_ready(self, model_state: EplbModelState) -> bool:
parallel_state = get_ep_group()
cpu_group = getattr(parallel_state, "cpu_group", None)
if cpu_group is not None and cpu_group.size() > 1:
flag = torch.tensor(
(int(model_state.ep_buffer_ready),), dtype=torch.int32, device="cpu"
)
all_reduce(flag, group=cpu_group)
return int(flag.item()) == cpu_group.size()
device_group = parallel_state.device_group
if device_group.size() <= 1:
return bool(model_state.ep_buffer_ready)
device = getattr(
parallel_state, "device", model_state.physical_to_logical_map.device
)
flag = torch.tensor(
(int(model_state.ep_buffer_ready),), dtype=torch.int32, device=device
)
all_reduce(flag, group=device_group)
return int(flag.item()) == device_group.size()
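The readiness check above is just a SUM all-reduce of per-rank 0/1 flags compared against the group size. A tiny single-process gloo sketch of that idea (real usage runs across the EP group; the address and port here are arbitrary):

import torch
import torch.distributed as dist

# Single-process illustration of the "all ranks ready" pattern used above.
dist.init_process_group(
    backend="gloo", init_method="tcp://127.0.0.1:29531", rank=0, world_size=1
)
flag = torch.tensor([1], dtype=torch.int32)  # 1 = this rank's buffer is staged
dist.all_reduce(flag)                        # default op is SUM
all_ready = int(flag.item()) == dist.get_world_size()
print(all_ready)                             # True only if every rank reported 1
dist.destroy_process_group()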
def move_to_workspace(
self,
model_state: EplbModelState,
ep_group: ProcessGroup,
is_profile: bool = False,
):
if not model_state.buffer_lock.acquire(blocking=False):
return
try:
assert model_state.new_physical_to_logical_map is not None
device_index = model_state.cuda_device_index or self.cuda_device_index
if model_state.buffer_ready_event is not None and device_index is not None:
stream = torch.cuda.current_stream(device=device_index)
stream.wait_event(model_state.buffer_ready_event)
model_state.buffer_ready_event = None
move_from_buffer(
expert_weights=model_state.model.expert_weights[
model_state.layer_to_transfer
],
expert_weights_buffer=model_state.expert_buffer,
is_unchanged=model_state.is_unchanged,
is_received_locally=model_state.is_received_locally,
experts_recv_loc=model_state.experts_recv_loc,
new_indices=model_state.new_physical_to_logical_map[
model_state.layer_to_transfer
].tolist(),
ep_group=ep_group,
)
transferred_layer = model_state.layer_to_transfer
self._update_layer_mapping_from_new(model_state, transferred_layer)
# After the main thread consumes, advance layer_to_transfer
model_state.layer_to_transfer += 1
model_state.ep_buffer_ready = 0
logger.info(
"model %s successfully move_to_workspace layer %d",
model_state.model_name,
transferred_layer,
)
finally:
try:
model_state.buffer_lock.release()
except Exception as e:
logger.error(
"Rank %d: buffer_lock release failed in move_to_workspace: %s",
ep_group.rank(),
str(e),
)
def post_eplb(self, model_state: EplbModelState, is_profile: bool = False) -> None:
assert model_state.new_physical_to_logical_map is not None
assert model_state.new_logical_to_physical_map is not None
assert model_state.new_logical_replica_count is not None
if not is_profile:
for layer_idx in range(model_state.physical_to_logical_map.shape[0]):
self._update_layer_mapping_from_new(model_state, layer_idx)
model_state.new_physical_to_logical_map = None
model_state.new_logical_to_physical_map = None
model_state.new_logical_replica_count = None
@staticmethod
def recv_state() -> tuple[list[torch.Tensor], list[torch.Tensor]]:
"""


@@ -100,18 +100,19 @@ def get_ep_ranks_with_expert(
return ranks_to_send, ranks_to_recv_actual
def move_to_buffer(
num_local_experts: int,
old_indices: Sequence[int],
new_indices: Sequence[int],
expert_weights: Iterable[torch.Tensor],
expert_weights_buffer: Sequence[torch.Tensor],
cuda_stream: torch.cuda.Stream | None,
ep_group: ProcessGroup,
) -> tuple[list[bool], list[bool], dict[int, int]]:
"""
Perform expert weights rearrangement of one layer.
"""
ep_rank = ep_group.rank()
local2global = partial(
idx_local_to_global,
local_cnt=num_local_experts,
@@ -137,7 +138,8 @@ def shuffle_layer(
if old_indices[src_global] == new_indices[dst_global]:
is_received_locally[dst] = True
for weight, buffer in zip(expert_weights, expert_weights_buffer):
with torch.cuda.stream(cuda_stream):
buffer[dst].copy_(weight[src], non_blocking=True)
p2p_ops: list[P2POp] = []
@@ -225,25 +227,115 @@ def shuffle_layer(
]
# 4. Execute the P2P operations. The real communication happens here.
if p2p_ops and cuda_stream is not None:
with torch.cuda.stream(cuda_stream):
reqs = batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
elif p2p_ops:
reqs = batch_isend_irecv(p2p_ops)
for req in reqs:
req.wait()
# wait for the communication to finish
return is_unchanged, is_received_locally, experts_recv_loc
def move_from_buffer(
expert_weights: Iterable[torch.Tensor],
expert_weights_buffer: list[torch.Tensor],
is_unchanged: list[bool],
is_received_locally: list[bool],
experts_recv_loc: dict[int, int],
new_indices: Sequence[int],
ep_group: ProcessGroup,
) -> None:
ep_rank = ep_group.rank()
num_local_experts = len(is_unchanged)
local2global = partial(
idx_local_to_global, local_cnt=num_local_experts, ep_rank=ep_rank
)
# 5. Copy the weights from the buffer back to the original weights.
for dst in range(num_local_experts):
if is_unchanged[dst]:
continue
if is_received_locally[dst]:
for weight, buffer in zip(expert_weights, expert_weights_buffer):
weight[dst].copy_(buffer[dst], non_blocking=True)
else:
expert = new_indices[local2global(dst)]
if expert == -1:
continue
src = experts_recv_loc[expert]
for weight, buffer in zip(expert_weights, expert_weights_buffer):
weight[dst].copy_(buffer[src], non_blocking=True)
async def transfer_layer(
old_global_expert_indices: torch.Tensor,
new_global_expert_indices: torch.Tensor,
expert_weights: Sequence[Iterable[torch.Tensor]],
expert_weights_buffer: Sequence[torch.Tensor],
ep_group: ProcessGroup,
is_profile: bool = False,
layer: int = 0,
cuda_stream: torch.cuda.Stream | None = None,
rank_mapping: dict[int, int] | None = None,
) -> tuple[list[bool], list[bool], dict[int, int]]:
"""
Stage one layer's expert weights into the transfer buffer according to the
new expert indices.
The values of the index arguments are logical expert indices,
while their positions correspond to physical expert slots.
Args:
old_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
new_global_expert_indices: Shape (num_moe_layers, num_physical_experts).
expert_weights: A sequence of shape (num_moe_layers)(weight_count)
of tensors of shape (num_local_physical_experts, hidden_size_i).
For example, a linear layer may have up and down projection,
so weight_count = 2. Each weight's hidden size can be different.
ep_group: The device process group for expert parallelism.
is_profile (bool): If `True`, do not perform any actual weight copy.
This is used during profile run, where we only perform dummy
communications to reserve enough memory for the buffers.
"""
ep_size = ep_group.size()
if rank_mapping is not None:
if len(rank_mapping) == ep_group.size():
# scale down
new_global_expert_indices = _map_new_expert_indices_with_rank_mapping(
new_global_expert_indices,
rank_mapping,
)
else:
# scale up
old_global_expert_indices = _map_old_expert_indices_with_rank_mapping(
old_global_expert_indices,
rank_mapping,
ep_group.size(),
)
assert old_global_expert_indices.shape[1] == new_global_expert_indices.shape[1]
num_moe_layers, num_physical_experts = old_global_expert_indices.shape
assert len(expert_weights) == num_moe_layers
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
assert num_physical_experts == ep_size * num_local_physical_experts
# A buffer to hold the expert weights in one layer during the exchange.
# NOTE: Currently we assume the same weights across different layers
# have the same shape.
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
num_local_experts=num_local_physical_experts,
old_indices=old_global_expert_indices[layer].tolist(),
new_indices=new_global_expert_indices[layer].tolist(),
expert_weights=expert_weights[layer],
expert_weights_buffer=expert_weights_buffer,
cuda_stream=cuda_stream,
ep_group=ep_group,
)
return is_unchanged, is_received_locally, experts_recv_loc
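As a usage note, the per-layer flow exercised by the new test at the top of this commit pairs `transfer_layer` with `move_from_buffer`. A sketch only: `old_indices`, `new_indices`, `expert_weights`, `expert_buffer`, `num_layers`, and `ep_group` are assumed to be set up as in that test.

import asyncio
import torch
from vllm.distributed.eplb.rebalance_execute import move_from_buffer, transfer_layer

stream = torch.cuda.Stream()
for layer in range(num_layers):
    # Phase 1: stage local copies and P2P sends/receives for this layer
    # into the buffer on `stream`.
    is_unchanged, is_received_locally, experts_recv_loc = asyncio.run(
        transfer_layer(
            old_global_expert_indices=old_indices,
            new_global_expert_indices=new_indices,
            expert_weights=expert_weights,
            expert_weights_buffer=expert_buffer,
            ep_group=ep_group,
            layer=layer,
            cuda_stream=stream,
        )
    )
    stream.synchronize()  # make sure the staged copies have finished
    # Phase 2: copy from the buffer into the live expert weights.
    move_from_buffer(
        expert_weights=expert_weights[layer],
        expert_weights_buffer=expert_buffer,
        is_unchanged=is_unchanged,
        is_received_locally=is_received_locally,
        experts_recv_loc=experts_recv_loc,
        new_indices=new_indices[layer].tolist(),
        ep_group=ep_group,
    )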
def rearrange_expert_weights_inplace(
@@ -296,7 +388,6 @@ def rearrange_expert_weights_inplace(
num_local_physical_experts = next(iter(expert_weights[0])).shape[0]
assert new_global_expert_indices.shape == (num_moe_layers, num_physical_experts)
ep_size = ep_group.size()
assert num_physical_experts == ep_size * num_local_physical_experts
@@ -329,14 +420,24 @@ def rearrange_expert_weights_inplace(
torch.cuda.synchronize()
for layer in range(num_moe_layers):
is_unchanged, is_received_locally, experts_recv_loc = move_to_buffer(
num_local_experts=num_local_physical_experts,
old_indices=old_global_expert_indices_cpu[layer].tolist(),
new_indices=new_global_expert_indices_cpu[layer].tolist(),
expert_weights=expert_weights[layer],
expert_weights_buffer=expert_weights_buffer,
cuda_stream=None,
ep_group=ep_group,
)
move_from_buffer(
expert_weights=expert_weights[layer],
expert_weights_buffer=expert_weights_buffer,
is_unchanged=is_unchanged,
is_received_locally=is_received_locally,
experts_recv_loc=experts_recv_loc,
new_indices=new_global_expert_indices[layer].tolist(),
ep_group=ep_group,
)
@@ -428,4 +529,4 @@ def _map_new_expert_indices_with_rank_mapping(
return mapped_expert_indices
__all__ = ["transfer_layer", "move_from_buffer"]


@@ -3370,6 +3370,8 @@ class GPUModelRunner(
old_global_expert_indices,
rank_mapping,
)
if self.eplb_state.is_async:
self.eplb_state.start_async_loop(rank_mapping=rank_mapping)
if (
self.vllm_config.compilation_config.mode