From 6ca8d9753c70fb02f70d65430d992b14407b5358 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Fri, 26 Sep 2025 12:22:49 -0400
Subject: [PATCH] [BugFix] Fix using `dbo_decode_token_threshold` always (and
 ignoring `dbo_prefill_token_threshold`) (#25622)

Signed-off-by: Lucas Wilkinson
Signed-off-by: yewentao256
---
 vllm/v1/worker/gpu_model_runner.py | 9 +++++++--
 vllm/v1/worker/ubatch_splitting.py | 3 ++-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index cbf439aa697b2..f199dbd991f48 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1045,11 +1045,15 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_tokens_unpadded = scheduler_output.total_num_scheduled_tokens
         num_tokens_padded = num_tokens_unpadded + self.get_local_padding(
             num_tokens_unpadded)
+        uniform_decode = \
+            (max_num_scheduled_tokens == self.uniform_decode_query_len) and \
+            (total_num_scheduled_tokens == num_reqs * max_num_scheduled_tokens)
         ubatch_slices, num_tokens_after_padding = \
             ubatch_split(num_scheduled_tokens,
                          num_tokens_unpadded,
                          num_tokens_padded,
-                         self.vllm_config)
+                         uniform_decode=uniform_decode,
+                         vllm_config=self.vllm_config)

         self.seq_lens.np[:num_reqs] = (
             self.input_batch.num_computed_tokens_cpu[:num_reqs] +
@@ -2989,7 +2993,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                 num_scheduled_tokens,
                 total_num_scheduled_tokens,
                 total_num_scheduled_tokens,
-                self.vllm_config,
+                uniform_decode=uniform_decode,
+                vllm_config=self.vllm_config,
             )

             # If we failed to microbatch, currently need to resynchronize
diff --git a/vllm/v1/worker/ubatch_splitting.py b/vllm/v1/worker/ubatch_splitting.py
index 30acb14ff58a7..7767750aa6048 100644
--- a/vllm/v1/worker/ubatch_splitting.py
+++ b/vllm/v1/worker/ubatch_splitting.py
@@ -139,6 +139,7 @@ def ubatch_split(
     num_scheduled_tokens_per_request: np.ndarray,
     num_tokens_unpadded: int,
     num_tokens_padded: int,
+    uniform_decode: bool,
     vllm_config: VllmConfig,
 ) -> tuple[Optional[UBatchSlices], Optional[torch.Tensor]]:
     """
@@ -164,7 +165,7 @@ def ubatch_split(
     should_attempt_ubatching = check_ubatch_thresholds(
         parallel_config,
         num_tokens_unpadded,
-        vllm_config,
+        uniform_decode=uniform_decode,
     )

     # Don't microbatch unless every other DP worker is also microbatching
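
Note (not part of the patch): the body of check_ubatch_thresholds is not shown above, so the following is only a minimal sketch of how the uniform_decode flag plumbed through by this change can select between the two thresholds named in the subject line. The stub config class, its default values, and the function body are assumptions for illustration, not vLLM's actual implementation.

    # Illustrative sketch only; names marked "stub"/"sketch" are hypothetical.
    from dataclasses import dataclass


    @dataclass
    class ParallelConfigStub:
        # Field names taken from the patch subject; the default values are made up.
        dbo_decode_token_threshold: int = 32
        dbo_prefill_token_threshold: int = 512


    def check_ubatch_thresholds_sketch(config: ParallelConfigStub,
                                       num_tokens: int,
                                       uniform_decode: bool) -> bool:
        """Use the decode threshold only for uniform-decode batches."""
        if uniform_decode:
            return num_tokens >= config.dbo_decode_token_threshold
        return num_tokens >= config.dbo_prefill_token_threshold


    # With a flag like this, a 100-token prefill batch is judged against the
    # prefill threshold, while a 100-token uniform-decode batch is judged
    # against the decode threshold.
    print(check_ubatch_thresholds_sketch(ParallelConfigStub(), 100, uniform_decode=False))  # False
    print(check_ubatch_thresholds_sketch(ParallelConfigStub(), 100, uniform_decode=True))   # True

One plausible reading of the diff: the old call site passed vllm_config positionally in the slot now occupied by uniform_decode, so under a check of this shape the always-truthy config object would make every batch take the decode-threshold branch, which matches the behavior described in the subject.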