[Core] Simplify the Dp padding/should ubatch coordination logic (#25768)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
parent: c50901f3b9
commit: 2111b4643c
@@ -13,7 +13,7 @@ from vllm.v1.attention.backends.utils import (
     split_attn_metadata,
     split_decodes_and_prefills,
 )
-from vllm.v1.worker.ubatch_splitting import create_ubatch_slices
+from vllm.v1.worker.ubatch_utils import create_ubatch_slices
 
 
 @pytest.fixture
@@ -152,6 +152,10 @@ class ParallelConfig:
     threshold, microbatching will be used. Otherwise, the request will be
     processed in a single batch."""
 
+    disable_nccl_for_dp_synchronization: bool = False
+    """Forces the dp synchronization logic in vllm/v1/worker/dp_utils.py
+    to use Gloo instead of NCCL for its all reduce"""
+
     ray_workers_use_nsight: bool = False
     """Whether to profile Ray workers with nsight, see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler."""
 
@@ -365,6 +365,9 @@ class EngineArgs:
    enable_dbo: bool = ParallelConfig.enable_dbo
    dbo_decode_token_threshold: int = ParallelConfig.dbo_decode_token_threshold
    dbo_prefill_token_threshold: int = ParallelConfig.dbo_prefill_token_threshold
+   disable_nccl_for_dp_synchronization: bool = (
+       ParallelConfig.disable_nccl_for_dp_synchronization
+   )
    eplb_config: EPLBConfig = get_field(ParallelConfig, "eplb_config")
    enable_eplb: bool = ParallelConfig.enable_eplb
    expert_placement_strategy: ExpertPlacementStrategy = (
@@ -760,6 +763,10 @@ class EngineArgs:
             "--dbo-prefill-token-threshold",
             **parallel_kwargs["dbo_prefill_token_threshold"],
         )
+        parallel_group.add_argument(
+            "--disable-nccl-for-dp-synchronization",
+            **parallel_kwargs["disable_nccl_for_dp_synchronization"],
+        )
         parallel_group.add_argument("--enable-eplb", **parallel_kwargs["enable_eplb"])
         parallel_group.add_argument("--eplb-config", **parallel_kwargs["eplb_config"])
         parallel_group.add_argument(
@@ -1437,6 +1444,7 @@ class EngineArgs:
             enable_dbo=self.enable_dbo,
             dbo_decode_token_threshold=self.dbo_decode_token_threshold,
             dbo_prefill_token_threshold=self.dbo_prefill_token_threshold,
+            disable_nccl_for_dp_synchronization=self.disable_nccl_for_dp_synchronization,
             enable_eplb=self.enable_eplb,
             eplb_config=self.eplb_config,
             expert_placement_strategy=self.expert_placement_strategy,
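For illustration only (not part of the diff): a minimal sketch of turning the new knob on through `EngineArgs`. Only the `disable_nccl_for_dp_synchronization` field and the `--disable-nccl-for-dp-synchronization` flag come from this commit; the model name and DP size below are placeholder assumptions.

```python
# Hedged sketch: enable the Gloo/CPU fallback for the DP padding all-reduce.
# The model name and data_parallel_size are placeholders, not part of the diff.
from vllm.engine.arg_utils import EngineArgs

engine_args = EngineArgs(
    model="facebook/opt-125m",                 # placeholder model
    data_parallel_size=2,                      # the sync path only matters for DP > 1
    disable_nccl_for_dp_synchronization=True,  # use Gloo instead of NCCL for the sync
)
# CLI equivalent: --disable-nccl-for-dp-synchronization
```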
@@ -95,7 +95,6 @@ if TYPE_CHECKING:
     VLLM_ALLOW_RUNTIME_LORA_UPDATING: bool = False
     VLLM_SKIP_P2P_CHECK: bool = False
     VLLM_DISABLED_KERNELS: list[str] = []
-    VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION: bool = False
    VLLM_DISABLE_PYNCCL: bool = False
    VLLM_USE_V1: bool = True
    VLLM_ROCM_USE_AITER: bool = False
@@ -830,12 +829,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_DISABLED_KERNELS": lambda: []
     if "VLLM_DISABLED_KERNELS" not in os.environ
     else os.environ["VLLM_DISABLED_KERNELS"].split(","),
-    # Swaps the all reduce backend that we use to coordinate the DP padding
-    # information from NCCL to gloo.
-    "VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION": lambda: (
-        os.getenv("VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION", "False").lower()
-        in ("true", "1")
-    ),
     # Disable pynccl (using torch.distributed instead)
     "VLLM_DISABLE_PYNCCL": lambda: (
         os.getenv("VLLM_DISABLE_PYNCCL", "False").lower() in ("true", "1")
@@ -8,13 +8,11 @@ from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Union
 
 import torch
-import torch.distributed as dist
 
 import vllm.envs as envs
 from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig
 from vllm.logger import init_logger
-from vllm.platforms import current_platform
-from vllm.v1.worker.ubatch_utils import UBatchSlices, is_second_ubatch_empty
+from vllm.v1.worker.ubatch_utils import UBatchSlices
 
 if TYPE_CHECKING:
     from vllm.attention.backends.abstract import AttentionMetadata
@@ -87,129 +85,22 @@ class DPMetadata:
     # NOTE: local_sizes should only be set by the chunked_sizes context manager
     local_sizes: Optional[list[int]] = None
 
-    @staticmethod
-    def num_tokens_across_dp(
-        num_tokens: int, dp_size: int, dp_rank: int
-    ) -> torch.Tensor:
-        """
-        Gather the num_tokens across all DP ranks and return results in a
-        CPU tensor of size dp_size.
-        """
-        from vllm.distributed.parallel_state import get_dp_group
-
-        device = current_platform.device_type
-        group = get_dp_group().device_group
-
-        # Transfering this tensor from GPU to CPU will introduce a GPU sync
-        # point that could adversely affect performance of vllm with asynch
-        # scheduling. This environment variable exists to quickly disable
-        # this optimization if we run into this case.
-        if envs.VLLM_DISABLE_NCCL_FOR_DP_SYNCHRONIZATION:
-            logger.info_once(
-                "Using CPU all reduce to syncronize DP padding between ranks."
-            )
-            device = "cpu"
-            group = get_dp_group().cpu_group
-        num_tokens_across_dp = [0] * dp_size
-        num_tokens_across_dp[dp_rank] = num_tokens
-        num_tokens_tensor = torch.tensor(
-            num_tokens_across_dp, device=device, dtype=torch.int32
-        )
-        dist.all_reduce(num_tokens_tensor, group=group)
-        return num_tokens_tensor.cpu()
-
-    # Get the cumulative tokens across sequence parallel ranks.
-    # In this case the input to the MoEs will be distributed w.r.t both
-    # DP and TP rank.
-    # When sp_size==1, this is just the cummulative num tokens across DP.
-    def cu_tokens_across_sp(self, sp_size: int) -> torch.Tensor:
-        num_tokens_across_sp_cpu = (
-            self.num_tokens_across_dp_cpu - 1 + sp_size
-        ) // sp_size
-        num_tokens_across_sp_cpu = num_tokens_across_sp_cpu.repeat_interleave(sp_size)
-        return torch.cumsum(num_tokens_across_sp_cpu, dim=0)
-
-    @staticmethod
-    def should_ubatch_across_dp(
-        should_ubatch: bool,
-        orig_num_tokens_per_ubatch: int,
-        padded_num_tokens_per_ubatch: int,
-        dp_size: int,
-        dp_rank: int,
-    ) -> tuple[bool, Optional[torch.Tensor]]:
-        """
-        1. Decides if each DP rank is going to microbatch. Either all ranks
-        run with microbatching or none of them do. If this function decides
-        not to run with microbatching. It will "abort" meaning that no padding
-        information will be returned to the caller. It will return (False, None)
-
-        2. Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that the run with the same number
-        of tokens
-
-        Returns: tuple[
-            should_ubatch: Are all DP ranks going to microbatch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens per-microbatch for each DP rank including padding. Will be
-            None if should_ubatch if False
-        ]
-        """
-
-        device = current_platform.device_type
-        tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
-        tensor[0][dp_rank] = orig_num_tokens_per_ubatch
-        tensor[1][dp_rank] = padded_num_tokens_per_ubatch
-        tensor[2][dp_rank] = 1 if should_ubatch else 0
-
-        from vllm.distributed.parallel_state import get_dp_group
-
-        dist.all_reduce(tensor, group=get_dp_group().device_group)
-
-        result: bool = bool(torch.all(tensor[2] == 1).item())
-        if not result:
-            return result, None
-
-        orig_num_tokens_tensor = tensor[0, :]
-        padded_num_tokens_tensor = tensor[1, :]
-
-        orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
-        padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
-        if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
-            logger.debug(
-                "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
-            )
-            return False, None
-        return result, padded_num_tokens_tensor.cpu()
-
     @staticmethod
     def make(
         parallel_config: ParallelConfig,
-        attn_metadata: Any,
         num_tokens: int,
-        num_tokens_across_dp_cpu: Optional[torch.Tensor] = None,
+        num_tokens_across_dp_cpu: torch.Tensor,
     ) -> "DPMetadata":
+        assert num_tokens_across_dp_cpu is not None
         assert parallel_config.data_parallel_size > 1
-        dp_size = parallel_config.data_parallel_size
         dp_rank = parallel_config.data_parallel_rank
-        if attn_metadata is not None and hasattr(attn_metadata, "num_prefill_tokens"):
-            # for v0 attention backends
-            batchsize = (
-                attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
-            )
-        else:
-            # for v1 attention backends or no attn_metadata
-            batchsize = num_tokens
+        batchsize = num_tokens
 
         # If num_tokens_across_dp is None, it will be computed by all_reduce
         # Otherwise, num_tokens_across_dp[dp_rank] should be equal to batchsize
-        assert (
-            num_tokens_across_dp_cpu is None
-            or num_tokens_across_dp_cpu[dp_rank] == batchsize
-        ), f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
-        if num_tokens_across_dp_cpu is None:
-            num_tokens_across_dp_cpu = DPMetadata.num_tokens_across_dp(
-                batchsize, dp_size, dp_rank
-            )
+        assert num_tokens_across_dp_cpu[dp_rank] == batchsize, (
+            f"{num_tokens_across_dp_cpu[dp_rank]} {batchsize}"
+        )
         max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp_cpu)
         return DPMetadata(max_tokens_across_dp_cpu, num_tokens_across_dp_cpu)
 
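As a reading aid (not part of the diff): a hedged sketch of the simplified `DPMetadata.make` contract after this change, where the caller always supplies the per-rank token counts. The uniform tensor below is an assumption that every rank runs the same padded count.

```python
# Hedged sketch of the new DPMetadata.make calling convention; values invented.
import torch

from vllm.forward_context import DPMetadata


def build_dp_metadata(parallel_config, padded_num_tokens: int) -> DPMetadata:
    dp_size = parallel_config.data_parallel_size
    # In the real flow this tensor comes from the DP coordination all-reduce;
    # here every rank is assumed to run the same padded token count.
    num_tokens_across_dp_cpu = torch.tensor(
        [padded_num_tokens] * dp_size, device="cpu", dtype=torch.int32
    )
    return DPMetadata.make(parallel_config, padded_num_tokens, num_tokens_across_dp_cpu)
```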
@@ -376,11 +267,9 @@ def set_forward_context(
     if vllm_config.parallel_config.data_parallel_size > 1 and (
         attn_metadata is not None or num_tokens is not None
     ):
+        assert num_tokens_across_dp is not None
         dp_metadata = DPMetadata.make(
-            vllm_config.parallel_config,
-            attn_metadata,
-            num_tokens or 0,
-            num_tokens_across_dp,
+            vllm_config.parallel_config, num_tokens or 0, num_tokens_across_dp
         )
 
     forward_context = create_forward_context(
vllm/v1/worker/dp_utils.py (new file, 177 lines added)
@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from vllm.config import ParallelConfig
+from vllm.distributed.parallel_state import get_dp_group
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlices,
+    check_ubatch_thresholds,
+    create_ubatch_slices,
+    is_second_ubatch_empty,
+)
+
+logger = init_logger(__name__)
+
+
+def _get_device_and_group(parallel_config: ParallelConfig):
+    device = current_platform.device_type
+    group = get_dp_group().device_group
+
+    # Transfering this tensor from GPU to CPU will introduce a GPU sync
+    # point that could adversely affect performance of vllm with asynch
+    # scheduling. This environment variable exists to quickly disable
+    # this optimization if we run into this case.
+    if parallel_config.disable_nccl_for_dp_synchronization:
+        logger.info_once("Using CPU all reduce to syncronize DP padding between ranks.")
+        device = "cpu"
+        group = get_dp_group().cpu_group
+    return device, group
+
+
+def _run_ar(
+    should_ubatch: bool,
+    orig_num_tokens_per_ubatch: int,
+    padded_num_tokens_per_ubatch: int,
+    parallel_config: ParallelConfig,
+) -> torch.Tensor:
+    dp_size = parallel_config.data_parallel_size
+    dp_rank = parallel_config.data_parallel_rank
+    device, group = _get_device_and_group(parallel_config)
+    tensor = torch.zeros(3, dp_size, device=device, dtype=torch.int32)
+    tensor[0][dp_rank] = orig_num_tokens_per_ubatch
+    tensor[1][dp_rank] = padded_num_tokens_per_ubatch
+    tensor[2][dp_rank] = 1 if should_ubatch else 0
+    dist.all_reduce(tensor, group=group)
+    return tensor
+
+
+def _post_process_ubatch(tensor: torch.Tensor) -> bool:
+    orig_num_tokens_tensor = tensor[0, :]
+    padded_num_tokens_tensor = tensor[1, :]
+
+    # First determine if we are going to be ubatching.
+    should_ubatch: bool = bool(torch.all(tensor[2] == 1).item())
+    if not should_ubatch:
+        return False
+    # If the DP ranks are planning to ubatch, make sure that
+    # there are no "empty" second ubatches
+    orig_min_num_tokens = int(orig_num_tokens_tensor.min().item())
+    padded_max_num_tokens = int(padded_num_tokens_tensor.max().item())
+    if is_second_ubatch_empty(orig_min_num_tokens, padded_max_num_tokens):
+        logger.debug(
+            "Aborting ubatching %s %s", orig_min_num_tokens, padded_max_num_tokens
+        )
+        should_ubatch = False
+    return should_ubatch
+
+
+def _synchronize_dp_ranks(
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    should_attempt_ubatching: bool,
+    parallel_config: ParallelConfig,
+) -> tuple[bool, Optional[torch.Tensor]]:
+    """
+    1. Decides if each DP rank is going to microbatch. Either all ranks
+    run with microbatching or none of them do.
+
+    2. Determines the total number of tokens that each rank will run.
+    All ranks will be padded out so that the run with the same number
+    of tokens
+
+    Returns: tuple[
+        should_ubatch: Are all DP ranks going to microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding.
+    ]
+
+    """
+    assert num_tokens_padded >= num_tokens_unpadded
+
+    # First we coordinate between the DP ranks via an All Reduce
+    # to determine the total number of tokens that each rank
+    # will run and if we are using ubatching or not.
+    tensor = _run_ar(
+        should_ubatch=should_attempt_ubatching,
+        orig_num_tokens_per_ubatch=num_tokens_unpadded,
+        padded_num_tokens_per_ubatch=num_tokens_padded,
+        parallel_config=parallel_config,
+    )
+
+    # Ensure that each rank is processing the same nuber of tokens
+    num_tokens_across_dp = tensor[1, :]
+    max_num_tokens = int(num_tokens_across_dp.max().item())
+    num_tokens_after_padding = torch.tensor(
+        [max_num_tokens] * len(num_tokens_across_dp), device="cpu", dtype=torch.int32
+    )
+
+    should_ubatch = _post_process_ubatch(tensor)
+
+    return should_ubatch, num_tokens_after_padding
+
+
+def coordinate_batch_across_dp(
+    num_scheduled_tokens_per_request: np.ndarray,
+    num_tokens_unpadded: int,
+    num_tokens_padded: int,
+    parallel_config: ParallelConfig,
+    allow_microbatching: bool,
+    uniform_decode: bool,
+) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
+    """
+    Coordinates amongst all DP ranks to determine if and how the full batch
+    should be split into microbatches.
+
+    Returns: tuple[
+        ubatch_slices: if this is set then all DP ranks have agreed to
+        microbatch
+        num_tokens_after_padding: A tensor containing the total number of
+        tokens per-microbatch for each DP rank including padding.
+    ]
+
+    """
+    if parallel_config.data_parallel_size == 1:
+        # Early exit.
+        return None, None
+
+    # Check preconditions for microbatching
+    should_attempt_ubatching = check_ubatch_thresholds(
+        parallel_config,
+        num_tokens_unpadded,
+        uniform_decode=uniform_decode,
+    )
+
+    # If the caller has explicitly disabled microbatching.
+    if not allow_microbatching:
+        should_attempt_ubatching = False
+
+    (should_ubatch, num_tokens_after_padding) = _synchronize_dp_ranks(
+        num_tokens_unpadded,
+        num_tokens_padded,
+        should_attempt_ubatching,
+        parallel_config,
+    )
+
+    # Don't microbatch unless every other DP worker is also microbatching
+    if not should_ubatch:
+        return (None, num_tokens_after_padding)
+
+    # This doesn't actually pad the ubatch slices. It just initializes the
+    # split point to the padded value so that padding can be applied
+    # to the second ubatch in pad_out_ubatch_slice after attention
+    # metadata creation
+    assert num_tokens_after_padding is not None
+    token_split_point = int(num_tokens_after_padding[0].item()) // 2
+
+    ubatch_slices = create_ubatch_slices(
+        num_scheduled_tokens_per_request, token_split_point
+    )
+
+    return (ubatch_slices, num_tokens_after_padding)
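For orientation (not part of the diff): a hedged usage sketch of `coordinate_batch_across_dp`, mirroring how the GPU model runner calls it further down in this commit. The token counts are invented and `parallel_config` is assumed to be the worker's existing `ParallelConfig`.

```python
# Hedged usage sketch; numbers are made up and parallel_config is assumed to be
# the ParallelConfig the worker already holds (data_parallel_size > 1).
import numpy as np

from vllm.v1.worker.dp_utils import coordinate_batch_across_dp

num_scheduled_tokens = np.array([256, 256, 512], dtype=np.int32)  # tokens per request
num_tokens_unpadded = int(num_scheduled_tokens.sum())             # 1024
num_tokens_padded = 1024                                          # e.g. CUDA-graph padded size

ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
    num_scheduled_tokens,
    num_tokens_unpadded,
    num_tokens_padded,
    parallel_config,   # assumed ParallelConfig instance
    True,              # allow_microbatching
    False,             # uniform_decode
)
# ubatch_slices is None unless every DP rank agreed to microbatch;
# num_tokens_across_dp holds the agreed padded token count for each rank
# (both are None when data_parallel_size == 1).
```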
@@ -41,7 +41,7 @@ from vllm.distributed.parallel_state import (
     is_global_first_rank,
     prepare_communication_buffer_for_model,
 )
-from vllm.forward_context import BatchDescriptor, DPMetadata, set_forward_context
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.mamba.abstract import MambaBase
@@ -131,12 +131,16 @@ from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.structured_output.utils import apply_grammar_bitmask
 from vllm.v1.utils import CpuGpuBuffer, record_function_or_nullcontext
+from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
 from vllm.v1.worker.gpu_ubatch_wrapper import UBatchWrapper
 from vllm.v1.worker.kv_connector_model_runner_mixin import KVConnectorModelRunnerMixin
 from vllm.v1.worker.lora_model_runner_mixin import LoRAModelRunnerMixin
-from vllm.v1.worker.ubatch_splitting import check_ubatch_thresholds, ubatch_split
-from vllm.v1.worker.ubatch_utils import UBatchSlice, UBatchSlices
+from vllm.v1.worker.ubatch_utils import (
+    UBatchSlice,
+    UBatchSlices,
+    check_ubatch_thresholds,
+)
 from vllm.v1.worker.utils import is_residual_scattered_for_sp
 
 from .utils import (
@@ -1161,18 +1165,17 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         query_start_loc = self.query_start_loc.gpu[: num_reqs + 1]
 
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
-        num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
-            num_tokens_unpadded
-        )
+        num_tokens_padded = self._get_num_input_tokens(num_tokens_unpadded)
         uniform_decode = (
             max_num_scheduled_tokens == self.uniform_decode_query_len
         ) and (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
-        ubatch_slices, num_tokens_after_padding = ubatch_split(
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
             num_scheduled_tokens,
             num_tokens_unpadded,
             num_tokens_padded,
-            uniform_decode=uniform_decode,
-            vllm_config=self.vllm_config,
+            self.parallel_config,
+            True,
+            uniform_decode,
         )
 
         self.seq_lens.np[:num_reqs] = (
@@ -1405,7 +1408,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             spec_decode_common_attn_metadata,
             max_num_scheduled_tokens,
             ubatch_slices,
-            num_tokens_after_padding,
+            num_tokens_across_dp,
             use_cascade_attn,
         )
 
@@ -1986,65 +1989,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             log_stats=self.parallel_config.eplb_config.log_balancedness,
         )
 
-    def get_dp_padding(self, num_tokens: int) -> tuple[int, Optional[torch.Tensor]]:
-        """
-        Determines the total number of tokens that each rank will run.
-        All ranks will be padded out so that they run with the same number
-        of tokens
-
-        Returns: tuple[
-            num_pad_tokens: The number of tokens that will be added to the batch
-            num_tokens_after_padding: A tensor containing the total number of
-            tokens for each DP rank including padding.
-        ]
-        """
-        dp_size = self.vllm_config.parallel_config.data_parallel_size
-        dp_rank = self.vllm_config.parallel_config.data_parallel_rank
-
-        # For DP: Don't pad when setting enforce_eager.
-        # This lets us set enforce_eager on the prefiller in a P/D setup and
-        # still use CUDA graphs (enabled by this padding) on the decoder.
-        #
-        # TODO(tms) : There are many cases where padding is enabled for
-        # prefills, causing unnecessary and excessive padding of activations.
-
-        if dp_size == 1 or self.vllm_config.model_config.enforce_eager:
-            # Early exit.
-            return 0, None
-
-        num_tokens_across_dp = DPMetadata.num_tokens_across_dp(
-            num_tokens, dp_size, dp_rank
-        )
-        max_tokens_across_dp_cpu = torch.max(num_tokens_across_dp).item()
-        num_tokens_after_padding = torch.tensor(
-            [max_tokens_across_dp_cpu] * dp_size, device="cpu", dtype=torch.int32
-        )
-        return max_tokens_across_dp_cpu - num_tokens, num_tokens_after_padding
-
-    def get_local_padding(self, num_tokens_unpadded: int) -> int:
-        num_tokens_padded = num_tokens_unpadded
-
-        if (
-            self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
-            and num_tokens_unpadded <= self.cudagraph_batch_sizes[-1]
-        ):
-            # Use piecewise CUDA graphs.
-            # Add padding to the batch size.
-            num_tokens_padded = self.vllm_config.pad_for_cudagraph(num_tokens_unpadded)
-        else:
-            # Eager mode.
-            # Pad tokens to multiple of tensor_parallel_size when
-            # enabled collective fusion for SP
-            tp_size = self.vllm_config.parallel_config.tensor_parallel_size
-            if (
-                self.vllm_config.compilation_config.pass_config.enable_sequence_parallelism
-                and tp_size > 1
-            ):
-                num_tokens_padded = round_up(num_tokens_unpadded, tp_size)
-
-        num_pad_tokens = num_tokens_padded - num_tokens_unpadded
-        return num_pad_tokens
-
     # This is where the second ubatch is adjusted to account for the padding.
     # Should be called after attention metadata creation. This just pads
     # the second ubatch slice out to the total number of tokens
@@ -2127,13 +2071,10 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
     def _preprocess(
         self,
         scheduler_output: "SchedulerOutput",
+        num_input_tokens: int,  # Padded
         intermediate_tensors: Optional[IntermediateTensors] = None,
-        ubatch_slices: Optional[UBatchSlices] = None,
-        num_tokens_after_padding: Optional[torch.Tensor] = None,
     ) -> tuple[
         int,
-        int,
-        Optional[torch.Tensor],
         Optional[torch.Tensor],
         Optional[torch.Tensor],
         torch.Tensor,
@@ -2141,14 +2082,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         dict[str, Any],
     ]:
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
-        if ubatch_slices:
-            assert num_tokens_after_padding is not None
-            num_input_tokens = int(num_tokens_after_padding[0].item() * 2)
-            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
-        elif ubatch_slices is None:
-            num_input_tokens = self._get_num_input_tokens(num_scheduled_tokens)
-            num_pad, num_tokens_after_padding = self.get_dp_padding(num_input_tokens)
-            num_input_tokens += num_pad
 
         # _prepare_inputs may reorder the batch, so we must gather multi
         # modal outputs after that to ensure the correct order
@@ -2235,8 +2168,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         return (
             num_scheduled_tokens,
-            num_input_tokens,
-            num_tokens_after_padding,
             input_ids,
             inputs_embeds,
             positions,
@@ -2506,24 +2437,30 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             spec_decode_common_attn_metadata,
             max_query_len,
             ubatch_slices,
-            num_tokens_after_padding,
+            num_tokens_across_dp,
             use_cascade_attn,
         ) = self._prepare_inputs(scheduler_output)
 
+        if ubatch_slices:
+            assert num_tokens_across_dp is not None
+            num_input_tokens = int(num_tokens_across_dp[0].item())
+            self.pad_out_ubatch_slice(ubatch_slices, num_input_tokens)
+        elif num_tokens_across_dp is not None:
+            num_input_tokens = int(num_tokens_across_dp[0].item())
+        else:
+            num_input_tokens = self._get_num_input_tokens(
+                scheduler_output.total_num_scheduled_tokens
+            )
+
         (
             num_scheduled_tokens,
-            num_input_tokens,
-            num_tokens_across_dp,
             input_ids,
             inputs_embeds,
             positions,
             intermediate_tensors,
             model_kwargs,
         ) = self._preprocess(
-            scheduler_output,
-            intermediate_tensors,
-            ubatch_slices,
-            num_tokens_after_padding,
+            scheduler_output, num_input_tokens, intermediate_tensors
         )
 
         uniform_decode = (max_query_len == self.uniform_decode_query_len) and (
@@ -2548,11 +2485,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         ):
             cudagraph_runtime_mode = CUDAGraphMode.NONE
-
-        # This is currently to get around the assert in the DPMetadata
-        # where it wants `num_tokens_across_dp` to align with `num_tokens`
-        if ubatch_slices is not None:
-            num_input_tokens = ubatch_slices[0].num_tokens
 
         # Run the model.
         # Use persistent buffers for CUDA graphs.
         with (
@@ -3329,36 +3261,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_scheduled_tokens = np.array(num_scheduled_tokens_list, dtype=np.int32)
         total_num_scheduled_tokens = int(num_scheduled_tokens.sum())
 
-        ubatch_slices = None
-        num_tokens_after_padding = None
-
         # We currently only microbatch if the number of tokens is
         # over a certain threshold.
-        if self.parallel_config.enable_dbo and allow_microbatching:
-            ubatch_slices, ubatch_num_tokens_after_padding = ubatch_split(
-                num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                total_num_scheduled_tokens,
-                uniform_decode=uniform_decode,
-                vllm_config=self.vllm_config,
-            )
-            # Currently when DBO is enabled `ubatch_split` returns
-            # the num_tokens_after_padding for a single ubatch, but we have 2
-            # TODO(sage,lucas): this is cruft that should be addressed in the
-            # padding refactor.
-            if ubatch_num_tokens_after_padding is not None:
-                num_tokens_after_padding = ubatch_num_tokens_after_padding * 2
-
-        # If we failed to microbatch, currently need to resynchronize
-        # TODO(lucas,sage): we should be able to avoid this second sync by
-        # refactoring `get_dp_padding_ubatch` and `get_dp_padding` into
-        # a single `coordinate_batch_across_dp` function.
-        if num_tokens_after_padding is None:
-            num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
-            num_tokens_after_padding = num_tokens + num_pad
-        else:
-            num_tokens_across_dp = num_tokens_after_padding
-            num_tokens_after_padding = int(num_tokens_after_padding[0].item())
+        ubatch_slices, num_tokens_across_dp = coordinate_batch_across_dp(
+            num_scheduled_tokens,
+            total_num_scheduled_tokens,
+            total_num_scheduled_tokens,
+            self.vllm_config.parallel_config,
+            allow_microbatching,
+            uniform_decode,
+        )
+        num_tokens_after_padding = num_tokens
+        if num_tokens_across_dp is not None:
+            num_tokens_after_padding = int(num_tokens_across_dp[0])
 
         attn_metadata: Optional[PerLayerAttnMetadata] = None
 
@@ -13,6 +13,7 @@ from vllm.config import CUDAGraphMode, VllmConfig
 from vllm.distributed import get_ep_group
 from vllm.distributed.device_communicators.pynccl_allocator import set_graph_pool_id
 from vllm.forward_context import (
+    DPMetadata,
     create_forward_context,
     get_forward_context,
     override_forward_context,
@@ -409,6 +410,18 @@ class UBatchWrapper:
 
         # We shouldn't be here unless we are running with multiple DP ranks
         assert dp_metadata is not None
+        num_tokens_per_ubatch = (
+            ubatch_slices[0].token_slice.stop - ubatch_slices[0].token_slice.start
+        )
+        dp_size = self.vllm_config.parallel_config.data_parallel_size
+        ubatch_num_tokens_across_dp = torch.tensor(
+            [num_tokens_per_ubatch] * dp_size, device="cpu", dtype=torch.int32
+        )
+        ubatch_dp_metadata = DPMetadata.make(
+            self.vllm_config.parallel_config,
+            num_tokens_per_ubatch,
+            ubatch_num_tokens_across_dp,
+        )
 
         if (
             num_tokens not in self.cudagraphs
@@ -422,7 +435,7 @@ class UBatchWrapper:
                 intermediate_tensors=intermediate_tensors,
                 inputs_embeds=inputs_embeds,
                 compute_stream=compute_stream,
-                dp_metadata=dp_metadata,
+                dp_metadata=ubatch_dp_metadata,
                 batch_descriptor=batch_descriptor,
                 cudagraph_runtime_mode=CUDAGraphMode.NONE,
             )
@@ -1,207 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from typing import Optional
-
-import numpy as np
-import torch
-
-from vllm.config import ParallelConfig, VllmConfig
-from vllm.forward_context import DPMetadata
-from vllm.logger import init_logger
-from vllm.utils import round_up
-from vllm.v1.worker.ubatch_utils import (
-    UBatchSlice,
-    UBatchSlices,
-    is_second_ubatch_empty,
-)
-
-logger = init_logger(__name__)
-
-
-def should_ubatch_with_num_tokens(
-    should_ubatch: bool,
-    orig_num_tokens_per_ubatch: int,
-    padded_num_tokens_per_ubatch: int,
-    vllm_config: VllmConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    dp_rank = vllm_config.parallel_config.data_parallel_rank
-    return DPMetadata.should_ubatch_across_dp(
-        should_ubatch,
-        orig_num_tokens_per_ubatch,
-        padded_num_tokens_per_ubatch,
-        dp_size,
-        dp_rank,
-    )
-
-
-def check_ubatch_thresholds(
-    config: ParallelConfig, num_tokens: int, uniform_decode: bool
-) -> bool:
-    if not config.enable_dbo:
-        return False
-    if uniform_decode:
-        return num_tokens >= config.dbo_decode_token_threshold
-    else:
-        return num_tokens >= config.dbo_prefill_token_threshold
-
-
-def get_dp_padding_ubatch(
-    num_tokens_unpadded: int,
-    num_tokens_padded: int,
-    should_attempt_ubatching: bool,
-    vllm_config: VllmConfig,
-) -> tuple[bool, Optional[torch.Tensor]]:
-    """
-    1. Decides if each DP rank is going to microbatch. Either all ranks
-    run with microbatching or none of them do. If this function decides
-    not to run with microbatching. It will "abort" meaning that no padding
-    information will be returned to the caller. It will return (False, None)
-
-    2. Determines the total number of tokens that each rank will run.
-    All ranks will be padded out so that the run with the same number
-    of tokens
-
-    Returns: tuple[
-        should_ubatch: Are all DP ranks going to microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if should_ubatch if False
-    ]
-
-    """
-    assert num_tokens_padded >= num_tokens_unpadded
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    if dp_size == 1:
-        # Early exit.
-        return False, None
-
-    # If this DP rank doesn't want to attempt microbatching
-    if not should_attempt_ubatching:
-        (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-            False, 0, 0, vllm_config
-        )
-        assert should_ubatch is False
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    # Round up to the next multiple of two for even divisibility
-    num_tokens_padded = round_up(num_tokens_padded, 2)
-    num_tokens_per_ubatch = num_tokens_padded // 2
-    should_ubatch = True
-
-    # Sanity Check that the existing padding isn't giving us an empty second
-    # ubatch. Abort if so
-    if is_second_ubatch_empty(num_tokens_unpadded, num_tokens_padded):
-        logger.debug(
-            "Empty second µbatch detected: unpadded tokens: %s, padded tokens: %s",
-            num_tokens_unpadded,
-            num_tokens_padded,
-        )
-        should_ubatch = False
-
-    # Note that we compute the number of padded tokens per ubatch
-    (should_ubatch, num_tokens_across_dp) = should_ubatch_with_num_tokens(
-        should_ubatch, num_tokens_unpadded // 2, num_tokens_per_ubatch, vllm_config
-    )
-    if not should_ubatch:
-        assert num_tokens_across_dp is None
-        return should_ubatch, num_tokens_across_dp
-
-    assert num_tokens_across_dp is not None
-
-    max_tokens_across_dp_cpu = int(torch.max(num_tokens_across_dp).item())
-    num_tokens_after_padding = torch.tensor(
-        [max_tokens_across_dp_cpu] * dp_size, device="cpu", dtype=torch.int32
-    )
-    return should_ubatch, num_tokens_after_padding
-
-
-def create_ubatch_slices(
-    num_scheduled_tokens: np.ndarray, split_point: int
-) -> UBatchSlices:
-    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
-    # in cu_num_tokens directly (i.e. query_start_loc)
-    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
-    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
-
-    first_ubatch_token_slice = slice(0, split_point)
-    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
-
-    # Determine request slices using exclusive stop semantics
-    # First ubatch includes requests whose tokens overlap [0, split_point)
-    first_ubatch_req_stop = int(
-        np.searchsorted(cu_num_tokens, split_point, side="left")
-    )
-    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
-
-    # Second ubatch starts at the request that contains the split_point
-    # or the request starting exactly at split_point (if on boundary)
-    second_ubatch_req_start = int(
-        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
-    )
-    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
-
-    return [
-        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
-        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
-    ]
-
-
-def ubatch_split(
-    num_scheduled_tokens_per_request: np.ndarray,
-    num_tokens_unpadded: int,
-    num_tokens_padded: int,
-    uniform_decode: bool,
-    vllm_config: VllmConfig,
-) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
-    """
-    Coordinates amongst all DP ranks to determine if and how the full batch
-    should be split into microbatches.
-
-    Returns: tuple[
-        ubatch_slices: if this is set then all DP ranks have agreed to
-        microbatch
-        num_tokens_after_padding: A tensor containing the total number of
-        tokens per-microbatch for each DP rank including padding. Will be
-        None if ubatch_slices is None
-    ]
-
-    """
-    parallel_config = vllm_config.parallel_config
-    # Don't bother with the should_ubatch handshaking unless microbatching
-    # is enabled
-    if not parallel_config.enable_dbo:
-        return (None, None)
-
-    # Check preconditions for microbatching
-    should_attempt_ubatching = check_ubatch_thresholds(
-        parallel_config,
-        num_tokens_unpadded,
-        uniform_decode=uniform_decode,
-    )
-
-    # Don't microbatch unless every other DP worker is also microbatching
-    should_ubatch, num_tokens_after_padding = get_dp_padding_ubatch(
-        num_tokens_unpadded,
-        num_tokens_padded,
-        should_attempt_ubatching,
-        vllm_config,
-    )
-
-    if not should_ubatch:
-        return (None, None)
-
-    # This doesn't actually pad the ubatch slices. It just initializes the
-    # split point to the padded value so that padding can be applied
-    # to the second ubatch in pad_out_ubatch_slice after attention
-    # metadata creation
-    assert num_tokens_after_padding is not None
-    token_split_point = int(num_tokens_after_padding[0].item())
-
-    ubatch_slices = create_ubatch_slices(
-        num_scheduled_tokens_per_request, token_split_point
-    )
-
-    return (ubatch_slices, num_tokens_after_padding)
@@ -2,8 +2,11 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass
 
+import numpy as np
 from typing_extensions import TypeAlias
 
+from vllm.config import ParallelConfig
+
 
 @dataclass
 class UBatchSlice:
@@ -24,7 +27,47 @@ class UBatchSlice:
 UBatchSlices: TypeAlias = list[UBatchSlice]
 
 
-def is_second_ubatch_empty(
-    orig_num_tokens_per_ubatch: int, padded_num_tokens_per_ubatch: int
-) -> bool:
-    return padded_num_tokens_per_ubatch >= 2 * orig_num_tokens_per_ubatch
+def is_second_ubatch_empty(orig_num_tokens: int, padded_num_tokens: int) -> bool:
+    return (padded_num_tokens // 2) >= orig_num_tokens
+
+
+def check_ubatch_thresholds(
+    config: ParallelConfig, num_tokens: int, uniform_decode: bool
+) -> bool:
+    if not config.enable_dbo:
+        return False
+    if uniform_decode:
+        return num_tokens >= config.dbo_decode_token_threshold
+    else:
+        return num_tokens >= config.dbo_prefill_token_threshold
+
+
+def create_ubatch_slices(
+    num_scheduled_tokens: np.ndarray, split_point: int
+) -> UBatchSlices:
+    # TODO(lucas): Refactor the gpu_model_runner.py so we can pass
+    # in cu_num_tokens directly (i.e. query_start_loc)
+    cu_num_tokens = np.zeros(len(num_scheduled_tokens) + 1, dtype=np.int32)
+    np.cumsum(num_scheduled_tokens, dtype=np.int32, out=cu_num_tokens[1:])
+
+    first_ubatch_token_slice = slice(0, split_point)
+    second_ubatch_token_slice = slice(split_point, cu_num_tokens[-1])
+
+    # Determine request slices using exclusive stop semantics
+    # First ubatch includes requests whose tokens overlap [0, split_point)
+    first_ubatch_req_stop = int(
+        np.searchsorted(cu_num_tokens, split_point, side="left")
+    )
+    first_ubatch_req_slice = slice(0, first_ubatch_req_stop)
+
+    # Second ubatch starts at the request that contains the split_point
+    # or the request starting exactly at split_point (if on boundary)
+    second_ubatch_req_start = int(
+        np.searchsorted(cu_num_tokens, split_point, side="right") - 1
+    )
+    second_ubatch_req_slice = slice(second_ubatch_req_start, len(cu_num_tokens) - 1)
+
+    return [
+        UBatchSlice(first_ubatch_req_slice, first_ubatch_token_slice),
+        UBatchSlice(second_ubatch_req_slice, second_ubatch_token_slice),
+    ]
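To make the split semantics concrete (not part of the diff): a small, hedged worked example for the two helpers above, using invented token counts.

```python
# Worked example (invented numbers) for is_second_ubatch_empty / create_ubatch_slices.
import numpy as np

from vllm.v1.worker.ubatch_utils import create_ubatch_slices, is_second_ubatch_empty

# Three requests scheduled with 4, 4, and 8 tokens: 16 tokens total.
num_scheduled_tokens = np.array([4, 4, 8], dtype=np.int32)
split_point = 8  # half of the padded total, as chosen by dp_utils

first, second = create_ubatch_slices(num_scheduled_tokens, split_point)
# First ubatch: requests 0-1 and tokens [0, 8); second ubatch: request 2 and tokens [8, 16).
assert first.token_slice == slice(0, 8)
assert int(second.token_slice.stop) == 16

# The second microbatch is "empty" when padding alone would fill it,
# e.g. 7 real tokens padded out to 16, since 16 // 2 >= 7.
assert is_second_ubatch_empty(orig_num_tokens=7, padded_num_tokens=16)
assert not is_second_ubatch_empty(orig_num_tokens=9, padded_num_tokens=16)
```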