From e6e3407b8d0a86f8b12eaeedf07d452a95d4e241 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Sun, 8 Jun 2025 16:56:24 +0000 Subject: [PATCH] fix ubatch padding to account for the case where the padding would result in an empty second ubatch Signed-off-by: Sage Moore --- vllm/v1/worker/gpu_model_runner.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index fd34451d815d5..676a46dda85b6 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -670,7 +670,23 @@ class GPUModelRunner(LoRAModelRunnerMixin): assert should_ubatch num_pad_tokens, num_tokens_after_padding = self.get_dp_padding_ubatch(ubatch_slices) if num_pad_tokens > 0: - self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens) + if num_pad_tokens < scheduler_output.total_num_scheduled_tokens: + self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens) + else: + # We bail out of ubatching here. This accounts for the case where + # the padding would result in an "empty" second ubatch. + # TODO: just make the second ubatch a dummy ubatch + ubatch_slices = None + + # This AR is only necessary in the case described above where + # the second ubatch ends up being empty. 
Note: if you delete this, also delete + # the second should_ubatch call in _dummy_run + should_ubatch = self.should_ubatch(True if ubatch_slices else False) + if not should_ubatch: + num_pad_tokens = 0 + num_tokens_after_padding = None + ubatch_slices = None + @@ -1273,8 +1289,8 @@ class GPUModelRunner(LoRAModelRunnerMixin): device="cpu", dtype=torch.int32) - assert max_tokens_across_dp <= 2 * max_tokens_per_ubatch_local, \ - f"max_tokens_across_dp: {max_tokens_across_dp} max_tokens_per_ubatch{max_tokens_per_ubatch_local}" + # assert max_tokens_across_dp <= 2 * max_tokens_per_ubatch_local, \ + # f"max_tokens_across_dp: {max_tokens_across_dp} max_tokens_per_ubatch{max_tokens_per_ubatch_local}" num_pad_tokens = (max_tokens_across_dp * 2) - \ (first_ubatch_num_tokens + second_ubatch_num_tokens) return num_pad_tokens, num_tokens_after_padding @@ -2182,6 +2198,7 @@ class GPUModelRunner(LoRAModelRunnerMixin): # _dummy_run doesn't go through _prepare_inputs so # we synchronize with other DP ranks here self.should_ubatch(should_microbatch) + self.should_ubatch(should_microbatch) with self.maybe_dummy_run_with_lora(self.lora_config, num_scheduled_tokens):