From 23194d83e8f2a6783b0d8c275f5f8a22faab9aec Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson
Date: Tue, 30 Sep 2025 00:18:59 -0400
Subject: [PATCH] [BugFix] Fix DP/EP hang (#25906)

Signed-off-by: Lucas Wilkinson
---
 vllm/v1/worker/gpu_model_runner.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 9e7d6eb0387bc..98e00f6d98a97 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3075,13 +3075,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # We currently only microbatch if the number of tokens is
         # over a certain threshold.
         if self.parallel_config.enable_dbo and allow_microbatching:
-            ubatch_slices, num_tokens_after_padding = ubatch_split(
+            ubatch_slices, ubatch_num_tokens_after_padding = ubatch_split(
                 num_scheduled_tokens,
                 total_num_scheduled_tokens,
                 total_num_scheduled_tokens,
                 uniform_decode=uniform_decode,
                 vllm_config=self.vllm_config,
             )
+            # Currently when DBO is enabled `ubatch_split` returns
+            # the num_tokens_after_padding for a single ubatch, but we have 2
+            # TODO(sage,lucas): this is cruft that should be addressed in the
+            # padding refactor.
+            if ubatch_num_tokens_after_padding is not None:
+                num_tokens_after_padding = ubatch_num_tokens_after_padding * 2
 
         # If we failed to microbatch, currently need to resynchronize
         # TODO(lucas,sage): we should be able to avoid this second sync by
@@ -3198,7 +3204,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
 
         # filter out the valid batch descriptor
         _cg_mode, batch_descriptor = self.cudagraph_dispatcher.dispatch(
-            BatchDescriptor(num_tokens=num_tokens,
+            BatchDescriptor(num_tokens=num_tokens_after_padding,
                             uniform_decode=uniform_decode)) \
             if not is_profile else (CUDAGraphMode.NONE, None)
         if cudagraph_runtime_mode is not None:
@@ -3212,7 +3218,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             cudagraph_runtime_mode = _cg_mode
 
         if ubatch_slices is not None:
-            num_tokens = num_tokens // 2
+            # Adjust values to reflect a single ubatch.
+            # TODO(sage,lucas): this is cruft that should be addressed in
+            # the padding refactor.
+            num_tokens_after_padding = ubatch_slices[0].num_tokens
+            if num_tokens_across_dp is not None:
+                num_tokens_across_dp[:] = num_tokens_after_padding
+
         with self.maybe_randomize_inputs(input_ids), set_forward_context(
             attn_metadata,
             self.vllm_config,