[Core] Simplify the Dp padding/should ubatch coordination logic (#25768)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
parent: c50901f3b9
commit: 2111b4643c
@@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import (
     split_attn_metadata,
     split_decodes_and_prefills,
 )
-from vllm.v1.worker.ubatch_splitting import create_ubatch_slices
+from vllm.v1.worker.ubatch_utils import create_ubatch_slices
 
 
 @pytest.fixture
@@ -152,6 +152,10 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
 
+    disable_nccl_for_dp_synchronization: bool = False
+    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
+    to use Gloo instead of NCCL for its all reduce"""
+
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
@@ -365,6 +365,9 @@ class EngineArgs:
    enable_dbo: bool = ParallelConfig.enable_dbo
    dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
    dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
+   disable_nccl_for_dp_synchronization: bool = (
+       ParallelConfig.disable_nccl_for_dp_synchronization
+   )
    eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
    enable_eplb: bool = ParallelConfig.enable_eplb
    expert_placement_strategy: ExpertPlacementStrategy = (
@@ -760,6 +763,10 @@ class EngineArgs:
             "--dbo-prefill-token-threshold",
             **parallel_kwargs["dbo_prefill_token_threshold"],
         )
+        parallel_group.add_argument(
+            "--disable-nccl-for-dp-synchronization",
+            **parallel_kwargs["disable_nccl_for_dp_synchronization"],
+        )
         parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"])
         parallel_group.add_argument(
@@ -1437,6 +1444,7 @@ class EngineArgs:
             enable_dbo=self.enable_dbo,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
+            disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             expert_placement_strategy=self.expert_placement_strategy,
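For illustration only (not part of the diff): a minimal sketch of turning the new knob on through `EngineArgs`. Only the `disable_nccl_for_dp_synchronization` field and the `--disable-nccl-for-dp-synchronization` flag come from this commit; the model name and DP size below are placeholder assumptions.

```python
# Hedged sketch: enable the Gloo/CPU fallback for the DP padding all-reduce.
# The model name and data_parallel_size are placeholders, not part of the diff.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",                 # placeholder model
    data_parallel_size=2,                      # the sync path only matters for DP > 1
    disable_nccl_for_dp_synchronization=True,  # use Gloo instead of NCCL for the sync
)
# CLI equivalent: --disable-nccl-for-dp-synchronization
```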
@@ -95,7 +95,6 @@ if TYPE_CHECKING:
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
-    VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False
    VLLM_DISABLE_PYNCCL: bool = False
    VLLM_USE_V1: bool = True
    VLLM_ROCM_USE_AITER: bool = False
@@ -830,12 +829,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLED_KERNELS": lambda: []
     if "VLLM_DISABLED_KERNELS" not in os.environ
     else os.environ["VLLM_DISABLED_KERNELS"].split(","),
-    # Swaps the all reduce backend that we use to coordinate the DP padding
-    # information from NCCL to gloo.
-    "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION": lambda: (
-        os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower()
-        in ("true", "1")
-    ),
     # Disable pynccl (using torch.distributed instead)
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
@@ -8,13 +8,11 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 
 import torch
-import torch.distributed as dist
 
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty
+from vllm.v1.worker.ubatch_utils import UBatchSlices
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -87,129 +85,22 @@ class DPMetadata:
     # NOTE: local_sizes should only be set by the chunked_sizes context manager
     local_sizes: Optional[list[int]] = None
 
-    @staticmethod
-    def num_tokens_across_dp(
-        num_tokens: int, dp_size: int, dp_rank: int
-    ) -> torch.Tensor:
-        """
-        Gather the num_tokens across all DP ranks and return results in a
-        CPU tensor of size dp_size.
-        """
-        from vllm.distributed.parallel_state import get_dp_group
-
-        device = current_platform.device_type
-        group = get_dp_group().device_group
-
-        # Transfering this tensor from GPU to CPU will introduce a GPU sync
-        # point that could adversely affect performance of vllm with asynch
-        # scheduling. This environment variable exists to quickly disable
-        # this optimization if we run into this case.
-        if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION:
-            logger.info_once(
-                "Using CPU all reduce to syncronize DP padding between ranks."
-            )
-            device = "cpu"
-            group = get_dp_group().cpu_group
-        num_tokens_across_dp = [0] * dp_size
-        num_tokens_across_dp[dp_rank] = num_tokens
-        num_tokens_tensor = torch.tensor(
-            num_tokens_across_dp, device=device, dtype=torch.int32
-        )
-        dist.all_reduce(num_tokens_tensor, group=group)
-        return num_tokens_tensor.cpu()
-
-    # Get the cumulative tokens across sequence parallel ranks.
-    # In this case the input to the MoEs will be distributed w.r.t both
-    # DP and TP rank.
-    # When sp_size==1, this is just the cummulative num tokens across DP.
-    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
-        num_tokens_across_sp_cpu = (
-            self.num_tokens_across_dp_cpu - 1 + sp_size
-        ) // sp_size
-        num_tokens_across_sp_cpu = num_tokens_across_sp_cpu.repeat_interleave(sp_size)
-        return torch.cumsum(num_tokens_across_sp_cpu, dim=0)
-
-    @staticmethod
-    def should_ubatch_across_dp(
-        should_ubatch: bool,
-        orig_num_tokens_per_ubatch: int,
-        padded_num_tokens_per_ubatch: int,
-        dp_size: int,
-        dp_rank: int,
-    ) -> tuple[bool, Optional[torch.Tensor]]:
-        """
-        1. Decides if each DP rank is going to microbatch. Either all ranks
-        run with microbatching or none of them do. If this function decides
-        not to run with microbatching. It will "abort" meaning that no padding
-        information will be returned to the caller. It will return (False, None)
-
-        2. Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that the run with the same number
-        of tokens
-
-        Returns: tuple[
-            should_ubatch: Are all DP ranks going to microbatch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens per-microbatch for each DP rank including padding. Will be
-            None if should_ubatch if False
-        ]
-        """
-
-        device = current_platform.device_type
-        tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
-        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
-        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
-        tensor[2][dp_rank] = 1 if should_ubatch else 0
-
-        from vllm.distributed.parallel_state import get_dp_group
-
-        dist.all_reduce(tensor, group=get_dp_group().device_group)
-
-        result: bool = bool(torch.all(tensor[2] == 1).item())
-        if not result:
-            return result, None
-
-        orig_num_tokens_tensor = tensor[0, :]
-        padded_num_tokens_tensor = tensor[1, :]
-
-        orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
-        padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
-        if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
-            logger.debug(
-                "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
-            )
-            return False, None
-        return result, padded_num_tokens_tensor.cpu()
-
     @staticmethod
     def make(
         parallel_config: ParallelConfig,
-        attn_metadata: Any,
         num_tokens: int,
-        num_tokens_across_dp_cpu: Optional[torch.Tensor] = None,
+        num_tokens_across_dp_cpu: torch.Tensor,
     ) -> "DPMetadata":
+        assert num_tokens_across_dp_cpu is not None
         assert parallel_config.data_parallel_size > 1
-        dp_size = parallel_config.data_parallel_size
         dp_rank = parallel_config.data_parallel_rank
-        if attn_metadata is not None and hasattr(attn_metadata, "num_prefill_tokens"):
-            # for v0 attention backends
-            batchsize = (
-                attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-            )
-        else:
-            # for v1 attention backends or no attn_metadata
-            batchsize = num_tokens
+        batchsize = num_tokens
 
         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        assert (
-            num_tokens_across_dp_cpu is None
-            or num_tokens_across_dp_cpu[dp_rank] == batchsize
-        ), f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
-        if num_tokens_across_dp_cpu is None:
-            num_tokens_across_dp_cpu = DPMetadata.num_tokens_across_dp(
-                batchsize, dp_size, dp_rank
-            )
+        assert num_tokens_across_dp_cpu[dp_rank] == batchsize, (
+            f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
+        )
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp_cpu)
         return DPMetadata(max_tokens_across_dp_cpu, num_tokens_across_dp_cpu)
 
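As a reading aid (not part of the diff): a hedged sketch of the simplified `DPMetadata.make` contract after this change, where the caller always supplies the per-rank token counts. The uniform tensor below is an assumption that every rank runs the same padded count.

```python
# Hedged sketch of the new DPMetadata.make calling convention; values invented.
import torch

from vllm.forward_context import DPMetadata


def build_dp_metadata(parallel_config, padded_num_tokens: int) -> DPMetadata:
    dp_size = parallel_config.data_parallel_size
    # In the real flow this tensor comes from the DP coordination all-reduce;
    # here every rank is assumed to run the same padded token count.
    num_tokens_across_dp_cpu = torch.tensor(
        [padded_num_tokens] * dp_size, device="cpu", dtype=torch.int32
    )
    return DPMetadata.make(parallel_config, padded_num_tokens, num_tokens_across_dp_cpu)
```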
@@ -376,11 +267,9 @@ def set_forward_context(
     if vllm_config.parallel_config.data_parallel_size > 1 and (
         attn_metadata is not None or num_tokens is not None
     ):
+        assert num_tokens_across_dp is not None
         dp_metadata = DPMetadata.make(
-            vllm_config.parallel_config,
-            attn_metadata,
-            num_tokens or 0,
-            num_tokens_across_dp,
+            vllm_config.parallel_config, num_tokens or 0, num_tokens_across_dp
         )
 
     forward_context = create_forward_context(
vllm/v1/worker/dp_utils.py (new file, 177 lines added)
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from vllm.config import ParallelConfig
+from vllm.distributed.parallel_state import get_dp_group
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlices,
+    check_ubatch_thresholds,
+    create_ubatch_slices,
+    is_second_ubatch_empty,
+)
+
+logger = init_logger(__name__)
+
+
+def _get_device_and_group(parallel_config: ParallelConfig):
+    device = current_platform.device_type
+    group = get_dp_group().device_group
+
+    # Transfering this tensor from GPU to CPU will introduce a GPU sync
+    # point that could adversely affect performance of vllm with asynch
+    # scheduling. This environment variable exists to quickly disable
+    # this optimization if we run into this case.
+    if parallel_config.disable_nccl_for_dp_synchronization:
+        logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.")
+        device = "cpu"
+        group = get_dp_group().cpu_group
+    return device, group
+
+
+def _run_ar(
+    should_ubatch: bool,
+    orig_num_tokens_per_ubatch: int,
+    padded_num_tokens_per_ubatch: int,
+    parallel_config: ParallelConfig,
+) -> torch.Tensor:
+    dp_size = parallel_config.data_parallel_size
+    dp_rank = parallel_config.data_parallel_rank
+    device, group = _get_device_and_group(parallel_config)
+    tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
+    tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+    tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+    tensor[2][dp_rank] = 1 if should_ubatch else 0
+    dist.all_reduce(tensor, group=group)
+    return tensor
+
+
+def _post_process_ubatch(tensor: torch.Tensor) -> bool:
+    orig_num_tokens_tensor = tensor[0, :]
+    padded_num_tokens_tensor = tensor[1, :]
+
+    # First determine if we are going to be ubatching.
+    should_ubatch: bool = bool(torch.all(tensor[2] == 1).item())
+    if not should_ubatch:
+        return False
+    # If the DP ranks are planning to ubatch, make sure that
+    # there are no "empty" second ubatches
+    orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
+    padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
+    if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
+        logger.debug(
+            "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
+        )
+        should_ubatch = False
+    return should_ubatch
+
+
+def _synchronize_dp_ranks(
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    should_attempt_ubatching: bool,
+    parallel_config: ParallelConfig,
+) -> tuple[bool, Optional[torch.Tensor]]:
+    """
+    1. Decides if each DP rank is going to microbatch. Either all ranks
+    run with microbatching or none of them do.
+
+    2. Determines the total number of tokens that each rank will run.
+    All ranks will be padded out so that the run with the same number
+    of tokens
+
+    Returns: tuple[
+        should_ubatch: Are all DP ranks going to microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding.
+    ]
+
+    """
+    assert num_tokens_padded >= num_tokens_unpadded
+
+    # First we coordinate between the DP ranks via an All Reduce
+    # to determine the total number of tokens that each rank
+    # will run and if we are using ubatching or not.
+    tensor = _run_ar(
+        should_ubatch=should_attempt_ubatching,
+        orig_num_tokens_per_ubatch=num_tokens_unpadded,
+        padded_num_tokens_per_ubatch=num_tokens_padded,
+        parallel_config=parallel_config,
+    )
+
+    # Ensure that each rank is processing the same nuber of tokens
+    num_tokens_across_dp = tensor[1, :]
+    max_num_tokens = int(num_tokens_across_dp.max().item())
+    num_tokens_after_padding = torch.tensor(
+        [max_num_tokens] * len(num_tokens_across_dp), device="cpu", dtype=torch.int32
+    )
+
+    should_ubatch = _post_process_ubatch(tensor)
+
+    return should_ubatch, num_tokens_after_padding
+
+
+def coordinate_batch_across_dp(
+    num_scheduled_tokens_per_request: np.ndarray,
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    parallel_config: ParallelConfig,
+    allow_microbatching: bool,
+    uniform_decode: bool,
+) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
+    """
+    Coordinates amongst all DP ranks to determine if and how the full batch
+    should be split into microbatches.
+
+    Returns: tuple[
+        ubatch_slices: if this is set then all DP ranks have agreed to
+        microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding.
+    ]
+
+    """
+    if parallel_config.data_parallel_size == 1:
+        # Early exit.
+        return None, None
+
+    # Check preconditions for microbatching
+    should_attempt_ubatching = check_ubatch_thresholds(
+        parallel_config,
+        num_tokens_unpadded,
+        uniform_decode=uniform_decode,
+    )
+
+    # If the caller has explicitly disabled microbatching.
+    if not allow_microbatching:
+        should_attempt_ubatching = False
+
+    (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
+        num_tokens_unpadded,
+        num_tokens_padded,
+        should_attempt_ubatching,
+        parallel_config,
+    )
+
+    # Don't microbatch unless every other DP worker is also microbatching
+    if not should_ubatch:
+        return (None, num_tokens_after_padding)
+
+    # This doesn't actually pad the ubatch slices. It just initializes the
+    # split point to the padded value so that padding can be applied
+    # to the second ubatch in pad_out_ubatch_slice after attention
+    # metadata creation
+    assert num_tokens_after_padding is not None
+    token_split_point = int(num_tokens_after_padding[0].item()) // 2
+
+    ubatch_slices = create_ubatch_slices(
+        num_scheduled_tokens_per_request, token_split_point
+    )
+
+    return (ubatch_slices, num_tokens_after_padding)
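For orientation (not part of the diff): a hedged usage sketch of `coordinate_batch_across_dp`, mirroring how the GPU model runner calls it further down in this commit. The token counts are invented and `parallel_config` is assumed to be the worker's existing `ParallelConfig`.

```python
# Hedged usage sketch; numbers are made up and parallel_config is assumed to be
# the ParallelConfig the worker already holds (data_parallel_size > 1).
import numpy as np

from vllm.v1.worker.dp_utils import coordinate_batch_across_dp

num_scheduled_tokens = np.array([256, 256, 512], dtype=np.int32)  # tokens per request
num_tokens_unpadded = int(num_scheduled_tokens.sum())             # 1024
num_tokens_padded = 1024                                          # e.g. CUDA-graph padded size

ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
    num_scheduled_tokens,
    num_tokens_unpadded,
    num_tokens_padded,
    parallel_config,   # assumed ParallelConfig instance
    True,              # allow_microbatching
    False,             # uniform_decode
)
# ubatch_slices is None unless every DP rank agreed to microbatch;
# num_tokens_across_dp holds the agreed padded token count for each rank
# (both are None when data_parallel_size == 1).
```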
@@ -41,7 +41,7 @@ from vllm.distributed.parallel_state import (
     is_global_first_rank,
     prepare_communication_buffer_for_model,
 )
-from vllm.forward_context import BatchDescriptor, DPMetadata, set_forward_context
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -131,12 +131,16 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
+from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.ubatch_splitting import check_ubatch_thresholds, ubatch_split
-from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlice,
+    UBatchSlices,
+    check_ubatch_thresholds,
+)
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 
 from .utils import (
@@ -1161,18 +1165,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         query_start_loc = self.query_start_loc.gpu[: num_reqs + 1]
 
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
-        num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
-            num_tokens_unpadded
-        )
+        num_tokens_padded = self._get_num_input_tokens(num_tokens_unpadded)
         uniform_decode = (
             max_num_scheduled_tokens == self.uniform_decode_query_len
         ) and (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
-        ubatch_slices, num_tokens_after_padding = ubatch_split(
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
             num_scheduled_tokens,
             num_tokens_unpadded,
             num_tokens_padded,
-            uniform_decode=uniform_decode,
-            vllm_config=self.vllm_config,
+            self.parallel_config,
+            True,
+            uniform_decode,
         )
 
         self.seq_lens.np[:num_reqs] = (
@@ -1405,7 +1408,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             spec_decode_common_attn_metadata,
             max_num_scheduled_tokens,
             ubatch_slices,
-            num_tokens_after_padding,
+            num_tokens_across_dp,
             use_cascade_attn,
         )
 
@@ -1986,65 +1989,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             log_stats=self.parallel_config.eplb_config.log_balancedness,
         )
 
-    def get_dp_padding(self, num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
-        """
-        Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that they run with the same number
-        of tokens
-
-        Returns: tuple[
-            num_pad_tokens: The number of tokens that will be added to the batch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens for each DP rank including padding.
-        ]
-        """
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-
-        # For DP: Don't pad when setting enforce_eager.
-        # This lets us set enforce_eager on the prefiller in a P/D setup and
-        # still use CUDA graphs (enabled by this padding) on the decoder.
-        #
-        # TODO(tms) : There are many cases where padding is enabled for
-        # prefills, causing unnecessary and excessive padding of activations.
-
-        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
-            # Early exit.
-            return 0, None
-
-        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
-            num_tokens, dp_size, dp_rank
-        )
-        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        num_tokens_after_padding = torch.tensor(
-            [max_tokens_across_dp_cpu] * dp_size, device="cpu", dtype=torch.int32
-        )
-        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
-
-    def get_local_padding(self, num_tokens_unpadded: int) -> int:
-        num_tokens_padded = num_tokens_unpadded
-
-        if (
-            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]
-        ):
-            # Use piecewise CUDA graphs.
-            # Add padding to the batch size.
-            num_tokens_padded = self.vllm_config.pad_for_cudagraph(num_tokens_unpadded)
-        else:
-            # Eager mode.
-            # Pad tokens to multiple of tensor_parallel_size when
-            # enabled collective fusion for SP
-            tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-            if (
-                self.vllm_config.compilation_config.pass_config.enable_sequence_parallelism
-                and tp_size > 1
-            ):
-                num_tokens_padded = round_up(num_tokens_unpadded, tp_size)
-
-        num_pad_tokens = num_tokens_padded - num_tokens_unpadded
-        return num_pad_tokens
-
     # This is where the second ubatch is adjusted to account for the padding.
     # Should be called after attention metadata creation. This just pads
     # the second ubatch slice out to the total number of tokens
@@ -2127,13 +2071,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _preprocess(
         self,
         scheduler_output: "SchedulerOutput",
+        num_input_tokens: int,  # Padded
         intermediate_tensors: Optional[IntermediateTensors] = None,
-        ubatch_slices: Optional[UBatchSlices] = None,
-        num_tokens_after_padding: Optional[torch.Tensor] = None,
     ) -> tuple[
         int,
-        int,
-        Optional[torch.Tensor],
         Optional[torch.Tensor],
         Optional[torch.Tensor],
         torch.Tensor,
@@ -2141,14 +2082,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dict[str, Any],
     ]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        if ubatch_slices:
-            assert num_tokens_after_padding is not None
-            num_input_tokens = int(num_tokens_after_padding[0].item() * 2)
-            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
-        elif ubatch_slices is None:
-            num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens)
-            num_pad, num_tokens_after_padding = self.get_dp_padding(num_input_tokens)
-            num_input_tokens += num_pad
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -2235,8 +2168,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return (
             num_scheduled_tokens,
-            num_input_tokens,
-            num_tokens_after_padding,
             input_ids,
             inputs_embeds,
             positions,
@@ -2506,24 +2437,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             spec_decode_common_attn_metadata,
             max_query_len,
             ubatch_slices,
-            num_tokens_after_padding,
+            num_tokens_across_dp,
             use_cascade_attn,
         ) = self._prepare_inputs(scheduler_output)
 
+        if ubatch_slices:
+            assert num_tokens_across_dp is not None
+            num_input_tokens = int(num_tokens_across_dp[0].item())
+            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
+        elif num_tokens_across_dp is not None:
+            num_input_tokens = int(num_tokens_across_dp[0].item())
+        else:
+            num_input_tokens = self._get_num_input_tokens(
+                scheduler_output.total_num_scheduled_tokens
+            )
+
         (
             num_scheduled_tokens,
-            num_input_tokens,
-            num_tokens_across_dp,
             input_ids,
             inputs_embeds,
             positions,
             intermediate_tensors,
             model_kwargs,
         ) = self._preprocess(
-            scheduler_output,
-            intermediate_tensors,
-            ubatch_slices,
-            num_tokens_after_padding,
+            scheduler_output, num_input_tokens, intermediate_tensors
         )
 
         uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
@@ -2548,11 +2485,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         ):
             cudagraph_runtime_mode = CUDAGraphMode.NONE
-
-        # This is currently to get around the assert in the DPMetadata
-        # where it wants `num_tokens_across_dp` to align with `num_tokens`
-        if ubatch_slices is not None:
-            num_input_tokens = ubatch_slices[0].num_tokens
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
         with (
@@ -3329,36 +3261,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         total_num_scheduled_tokens = int(num_scheduled_tokens.sum())
 
-        ubatch_slices = None
-        num_tokens_after_padding = None
-
         # We currently only microbatch if the number of tokens is
         # over a certain threshold.
-        if self.parallel_config.enable_dbo and allow_microbatching:
-            ubatch_slices, ubatch_num_tokens_after_padding = ubatch_split(
-                num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                uniform_decode=uniform_decode,
-                vllm_config=self.vllm_config,
-            )
-            # Currently when DBO is enabled `ubatch_split` returns
-            # the num_tokens_after_padding for a single ubatch, but we have 2
-            # TODO(sage,lucas): this is cruft that should be addressed in the
-            # padding refactor.
-            if ubatch_num_tokens_after_padding is not None:
-                num_tokens_after_padding = ubatch_num_tokens_after_padding * 2
-
-        # If we failed to microbatch, currently need to resynchronize
-        # TODO(lucas,sage): we should be able to avoid this second sync by
-        # refactoring `get_dp_padding_ubatch` and `get_dp_padding` into
-        # a single `coordinate_batch_across_dp` function.
-        if num_tokens_after_padding is None:
-            num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
-            num_tokens_after_padding = num_tokens + num_pad
-        else:
-            num_tokens_across_dp = num_tokens_after_padding
-            num_tokens_after_padding = int(num_tokens_after_padding[0].item())
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
+            num_scheduled_tokens,
+            total_num_scheduled_tokens,
+            total_num_scheduled_tokens,
+            self.vllm_config.parallel_config,
+            allow_microbatching,
+            uniform_decode,
+        )
+        num_tokens_after_padding = num_tokens
+        if num_tokens_across_dp is not None:
+            num_tokens_after_padding = int(num_tokens_across_dp[0])
 
         attn_metadata: Optional[PerLayerAttnMetadata] = None
 
@@ -13,6 +13,7 @@ from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import get_ep_group
 from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
 from vllm.forward_context import (
+    DPMetadata,
     create_forward_context,
     get_forward_context,
     override_forward_context,
@@ -409,6 +410,18 @@ class UBatchWrapper:
 
         # We shouldn't be here unless we are running with multiple DP ranks
         assert dp_metadata is not None
+        num_tokens_per_ubatch = (
+            ubatch_slices[0].token_slice.stop - ubatch_slices[0].token_slice.start
+        )
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        ubatch_num_tokens_across_dp = torch.tensor(
+            [num_tokens_per_ubatch] * dp_size, device="cpu", dtype=torch.int32
+        )
+        ubatch_dp_metadata = DPMetadata.make(
+            self.vllm_config.parallel_config,
+            num_tokens_per_ubatch,
+            ubatch_num_tokens_across_dp,
+        )
 
         if (
             num_tokens not in self.cudagraphs
@@ -422,7 +435,7 @@ class UBatchWrapper:
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
                 compute_stream=compute_stream,
-                dp_metadata=dp_metadata,
+                dp_metadata=ubatch_dp_metadata,
                 batch_descriptor=batch_descriptor,
                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
             )
@@ -1,207 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Optional
-
-import numpy as np
-import torch
-
-from vllm.config import ParallelConfig, VllmConfig
-from vllm.forward_context import DPMetadata
-from vllm.logger import init_logger
-from vllm.utils import round_up
-from vllm.v1.worker.ubatch_utils import (
-    UBatchSlice,
-    UBatchSlices,
-    is_second_ubatch_empty,
-)
-
-logger = init_logger(__name__)
-
-
-def should_ubatch_with_num_tokens(
-    should_ubatch: bool,
-    orig_num_tokens_per_ubatch: int,
-    padded_num_tokens_per_ubatch: int,
-    vllm_config: VllmConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    dp_rank = vllm_config.parallel_config.data_parallel_rank
-    return DPMetadata.should_ubatch_across_dp(
-        should_ubatch,
-        orig_num_tokens_per_ubatch,
-        padded_num_tokens_per_ubatch,
-        dp_size,
-        dp_rank,
-    )
-
-
-def check_ubatch_thresholds(
-    config: ParallelConfig, num_tokens: int, uniform_decode: bool
-) -> bool:
-    if not config.enable_dbo:
-        return False
-    if uniform_decode:
-        return num_tokens >= config.dbo_decode_token_threshold
-    else:
-        return num_tokens >= config.dbo_prefill_token_threshold
-
-
-def get_dp_padding_ubatch(
-    num_tokens_unpadded: int,
-    num_tokens_padded: int,
-    should_attempt_ubatching: bool,
-    vllm_config: VllmConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
-    """
-    1. Decides if each DP rank is going to microbatch. Either all ranks
-    run with microbatching or none of them do. If this function decides
-    not to run with microbatching. It will "abort" meaning that no padding
-    information will be returned to the caller. It will return (False, None)
-
-    2. Determines the total number of tokens that each rank will run.
-    All ranks will be padded out so that the run with the same number
-    of tokens
-
-    Returns: tuple[
-        should_ubatch: Are all DP ranks going to microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if should_ubatch if False
-    ]
-
-    """
-    assert num_tokens_padded >= num_tokens_unpadded
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    if dp_size == 1:
-        # Early exit.
-        return False, None
-
-    # If this DP rank doesn't want to attempt microbatching
-    if not should_attempt_ubatching:
-        (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-            False, 0, 0, vllm_config
-        )
-        assert should_ubatch is False
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    # Round up to the next multiple of two for even divisibility
-    num_tokens_padded = round_up(num_tokens_padded, 2)
-    num_tokens_per_ubatch = num_tokens_padded // 2
-    should_ubatch = True
-
-    # Sanity Check that the existing padding isn't giving us an empty second
-    # ubatch. Abort if so
-    if is_second_ubatch_empty(num_tokens_unpadded, num_tokens_padded):
-        logger.debug(
-            "Empty second µbatch detected: unpadded tokens: %s, padded tokens: %s",
-            num_tokens_unpadded,
-            num_tokens_padded,
-        )
-        should_ubatch = False
-
-    # Note that we compute the number of padded tokens per ubatch
-    (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-        should_ubatch, num_tokens_unpadded // 2, num_tokens_per_ubatch, vllm_config
-    )
-    if not should_ubatch:
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    assert num_tokens_across_dp is not None
-
-    max_tokens_across_dp_cpu = int(torch.max(num_tokens_across_dp).item())
-    num_tokens_after_padding = torch.tensor(
-        [max_tokens_across_dp_cpu] * dp_size, device="cpu", dtype=torch.int32
-    )
-    return should_ubatch, num_tokens_after_padding
-
-
-def create_ubatch_slices(
-    num_scheduled_tokens: np.ndarray, split_point: int
-) -> UBatchSlices:
-    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
-    # in cu_num_tokens directly (i.e. query_start_loc)
-    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
-    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
-
-    first_ubatch_token_slice = slice(0, split_point)
-    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
-
-    # Determine request slices using exclusive stop semantics
-    # First ubatch includes requests whose tokens overlap [0, split_point)
-    first_ubatch_req_stop = int(
-        np.searchsorted(cu_num_tokens, split_point, side="left")
-    )
-    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
-
-    # Second ubatch starts at the request that contains the split_point
-    # or the request starting exactly at split_point (if on boundary)
-    second_ubatch_req_start = int(
-        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
-    )
-    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
-
-    return [
-        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
-        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
-    ]
-
-
-def ubatch_split(
-    num_scheduled_tokens_per_request: np.ndarray,
-    num_tokens_unpadded: int,
-    num_tokens_padded: int,
-    uniform_decode: bool,
-    vllm_config: VllmConfig,
-) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
-    """
-    Coordinates amongst all DP ranks to determine if and how the full batch
-    should be split into microbatches.
-
-    Returns: tuple[
-        ubatch_slices: if this is set then all DP ranks have agreed to
-        microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if ubatch_slices is None
-    ]
-
-    """
-    parallel_config = vllm_config.parallel_config
-    # Don't bother with the should_ubatch handshaking unless microbatching
-    # is enabled
-    if not parallel_config.enable_dbo:
-        return (None, None)
-
-    # Check preconditions for microbatching
-    should_attempt_ubatching = check_ubatch_thresholds(
-        parallel_config,
-        num_tokens_unpadded,
-        uniform_decode=uniform_decode,
-    )
-
-    # Don't microbatch unless every other DP worker is also microbatching
-    should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
-        num_tokens_unpadded,
-        num_tokens_padded,
-        should_attempt_ubatching,
-        vllm_config,
-    )
-
-    if not should_ubatch:
-        return (None, None)
-
-    # This doesn't actually pad the ubatch slices. It just initializes the
-    # split point to the padded value so that padding can be applied
-    # to the second ubatch in pad_out_ubatch_slice after attention
-    # metadata creation
-    assert num_tokens_after_padding is not None
-    token_split_point = int(num_tokens_after_padding[0].item())
-
-    ubatch_slices = create_ubatch_slices(
-        num_scheduled_tokens_per_request, token_split_point
-    )
-
-    return (ubatch_slices, num_tokens_after_padding)
@@ -2,8 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+import numpy as np
 from typing_extensions import TypeAlias
 
+from vllm.config import ParallelConfig
+
 
 @dataclass
 class UBatchSlice:
@@ -24,7 +27,47 @@ class UBatchSlice:
 UBatchSlices: TypeAlias = list[UBatchSlice]
 
 
-def is_second_ubatch_empty(
-    orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int
-) -> bool:
-    return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch
+def is_second_ubatch_empty(orig_num_tokens: int, padded_num_tokens: int) -> bool:
+    return (padded_num_tokens // 2) >= orig_num_tokens
+
+
+def check_ubatch_thresholds(
+    config: ParallelConfig, num_tokens: int, uniform_decode: bool
+) -> bool:
+    if not config.enable_dbo:
+        return False
+    if uniform_decode:
+        return num_tokens >= config.dbo_decode_token_threshold
+    else:
+        return num_tokens >= config.dbo_prefill_token_threshold
+
+
+def create_ubatch_slices(
+    num_scheduled_tokens: np.ndarray, split_point: int
+) -> UBatchSlices:
+    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
+    # in cu_num_tokens directly (i.e. query_start_loc)
+    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
+    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
+
+    first_ubatch_token_slice = slice(0, split_point)
+    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
+
+    # Determine request slices using exclusive stop semantics
+    # First ubatch includes requests whose tokens overlap [0, split_point)
+    first_ubatch_req_stop = int(
+        np.searchsorted(cu_num_tokens, split_point, side="left")
+    )
+    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
+
+    # Second ubatch starts at the request that contains the split_point
+    # or the request starting exactly at split_point (if on boundary)
+    second_ubatch_req_start = int(
+        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
+    )
+    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
+
+    return [
+        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
+        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
+    ]
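To make the split semantics concrete (not part of the diff): a small, hedged worked example for the two helpers above, using invented token counts.

```python
# Worked example (invented numbers) for is_second_ubatch_empty / create_ubatch_slices.
import numpy as np

from vllm.v1.worker.ubatch_utils import create_ubatch_slices, is_second_ubatch_empty

# Three requests scheduled with 4, 4, and 8 tokens: 16 tokens total.
num_scheduled_tokens = np.array([4, 4, 8], dtype=np.int32)
split_point = 8  # half of the padded total, as chosen by dp_utils

first, second = create_ubatch_slices(num_scheduled_tokens, split_point)
# First ubatch: requests 0-1 and tokens [0, 8); second ubatch: request 2 and tokens [8, 16).
assert first.token_slice == slice(0, 8)
assert int(second.token_slice.stop) == 16

# The second microbatch is "empty" when padding alone would fill it,
# e.g. 7 real tokens padded out to 16, since 16 // 2 >= 7.
assert is_second_ubatch_empty(orig_num_tokens=7, padded_num_tokens=16)
assert not is_second_ubatch_empty(orig_num_tokens=9, padded_num_tokens=16)
```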