From b74c731342b6e2509e6fbbdb9305253354996e3e Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Thu, 12 Jun 2025 20:36:13 +0000
Subject: [PATCH] more hacking

Signed-off-by: Sage Moore <sage@neuralmagic.com>
---
 vllm/forward_context.py                       | 11 +++-
 .../layers/fused_moe/pplx_prepare_finalize.py |  8 +--
 vllm/v1/worker/gpu_model_runner.py            | 52 ++++++++++++++-----
 3 files changed, 51 insertions(+), 20 deletions(-)

diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index ead1d86d8a42a..e964a3badb4eb 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -44,9 +44,9 @@ class DPMetadata:
                                          device="cpu",
                                          dtype=torch.int32)
         from vllm.distributed.parallel_state import get_dp_group
-        # print("STARTING AR num_tokens_across_dp")
+        # logger.info("STARTING AR num_tokens_across_dp")
         dist.all_reduce(num_tokens_tensor, group=get_dp_group().cpu_group)
-        # print("finishing num_tokens_across_dp")
+        # logger.info("finishing num_tokens_across_dp")
         return num_tokens_tensor
 
     @staticmethod
@@ -57,7 +57,14 @@ class DPMetadata:
                                          device="cpu",
                                          dtype=torch.int32)
         from vllm.distributed.parallel_state import get_dp_group
+        # logger.info(f"should_ubatch_tensor before ar {should_ubatch_tensor}")
         dist.all_reduce(should_ubatch_tensor, group=get_dp_group().cpu_group)
+        # logger.info(f"should_ubatch_tensor after ar {should_ubatch_tensor}")
+
+        # If there's an incorrect ordering of ARs across DP ranks, this tensor 
+        # can end up containing the number of padded tokens for a DP rank
+        assert torch.all(should_ubatch_tensor <= 1)
+
         result: bool = bool(torch.all(should_ubatch_tensor == 1).item())
         # print(f"FINISHING AR should_ubatch_across_dp {result} {should_ubatch_tensor}")
         return result
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index cd5ec2f4cbf2d..d823706b63e03 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -134,14 +134,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
 
-        yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
         dispatch(True)  # Send
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
         dispatch(False)  # Recv
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER RECV SYNC", flush=True)
-        yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
         # torch.cuda.synchronize()
         if expert_x_scale is not None:
             expert_x_scale = expert_x_scale[:, :, 0:1]
@@ -185,11 +185,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
 
-        yield_and_switch_from_compute_to_comm_impl(schedule="default")
+        # yield_and_switch_from_compute_to_comm_impl(schedule="default")
         combine(True)
         # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True)
         combine(False)
         # print(f"{ubatch_id} AFTER COMBINE RECV SYNC", flush=True)
-        yield_and_switch_from_comm_to_compute_impl(schedule="default")
+        # yield_and_switch_from_comm_to_compute_impl(schedule="default")
         # torch.cuda.synchronize()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 311656f364674..c3a9f96f57f61 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -686,34 +686,48 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             self.query_start_loc_np, max_num_scheduled_tokens,
             scheduler_output)
         should_ubatch = self.should_ubatch(True if ubatch_slices else False)
-        if should_ubatch:
-            assert ubatch_slices
         # Don't attempt to microbatch unless every other DP worker is also microbatching
-        if not should_ubatch and ubatch_slices:
+        if not should_ubatch:
             ubatch_slices = None
 
         num_pad_tokens = 0
         num_tokens_after_padding = None
+        ubatch_bailout = False
         if ubatch_slices:
+            # logger.info(f"ATTEMPTING TO PAD UBATCH {should_ubatch}")
             assert should_ubatch
             num_pad_tokens, num_tokens_after_padding = self.get_dp_padding_ubatch(ubatch_slices)
+            # logger.info("UBATCH PADDING DONE")
             if num_pad_tokens > 0:
                 if num_pad_tokens < scheduler_output.total_num_scheduled_tokens:
                     self.pad_out_ubatch_first_stage(ubatch_slices, num_pad_tokens)
                 else:
+                    assert False
                     # We bail out of ubatching here. This accounts for the case where 
                     # the padding would result in an "empty" second ubatch.
                     # TODO: just make the second ubatch a dummy ubatch
-                    ubatch_slices = None
+                    logger.info("FALLING BACK AND DISABLING UBATCHING")
+                    ubatch_bailout = True
+        
+        # Note that if we are attempting to ubatch by this point then we know that no 
+        # DP ranks are doing dummy runs
+        # if ubatch_slices:
+        #     should_ubatch = self.should_ubatch(False if ubatch_bailout else True)
+        #     if not should_ubatch:
+        #         logger.info("SUCCESSFULLY BAILED OUT")
+        #         num_pad_tokens = 0
+        #         num_tokens_after_padding = None
+        #         ubatch_slices = None
+
         
         # This AR is only necessary in the case described above where 
         # the second ubatch ends up being empty. NOte if you delete this go delete 
         # the second should_ubatch call in  _dummy_run
-        should_ubatch = self.should_ubatch(True if ubatch_slices else False)
-        if not should_ubatch:
-            num_pad_tokens = 0
-            num_tokens_after_padding = None
-            ubatch_slices = None
+        # should_ubatch = self.should_ubatch(True if ubatch_slices else False)
+        # if not should_ubatch:
+        #     num_pad_tokens = 0
+        #     num_tokens_after_padding = None
+        #     ubatch_slices = None
 
 
         
@@ -1643,9 +1657,12 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             num_input_tokens += num_pad_tokens
             self.pad_out_ubatch_second_stage(ubatch_slices, num_input_tokens)
         elif ubatch_slices is None:
+            # logger.info("ATTEMPTING TO PAD NORMAL BATCH")
             num_pad, num_tokens_after_padding = self.get_padding(num_input_tokens)
+            # logger.info("NORMAL BATCH DONE")
             num_input_tokens += num_pad
 
+        # logger.info("RUNNING MODEL")
         # Run the decoder.
         # Use persistent buffers for CUDA graphs.
         self.maybe_setup_kv_connector(scheduler_output)
@@ -2135,9 +2152,16 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         allow_microbatching: bool = False,
     ) -> torch.Tensor:
 
+        should_microbatch = False
+        # _dummy_run doesn't go through _prepare_inputs so 
+        # we synchronize with other DP ranks here
+        self.should_ubatch(should_microbatch)
         # Padding for DP
+        # logger.info("PADDING DUMMY")
         num_pad, num_tokens_across_dp = self.get_dp_padding(num_tokens)
+        # logger.info("PADDING DUMMY DONE")
         num_tokens += num_pad
+        # num_tokens_across_dp = None
 
         # Set num_scheduled_tokens based on num_tokens and max_num_seqs
         # for dummy run with LoRA so that the num_reqs collectively
@@ -2187,11 +2211,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
         #     and self.vllm_config.parallel_config.always_microbatch_if_enabled)
         # dummy_microbatches = [(slice(0, 0), slice(0, 0)),
         #                       (slice(0, 0), slice(0, 0))]
-        should_microbatch = False
-        # _dummy_run doesn't go through _prepare_inputs so 
-        # we synchronize with other DP ranks here
-        self.should_ubatch(should_microbatch)
-        self.should_ubatch(should_microbatch)
 
         with self.maybe_dummy_run_with_lora(self.lora_config,
                                             num_scheduled_tokens):
@@ -2212,6 +2231,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 self.drafter.dummy_run(num_tokens)
 
         logit_indices = np.cumsum(num_scheduled_tokens) - 1
+        # logger.info("DUMMY RUN RETURNING HIDDEN STATES")
         return hidden_states[logit_indices]
 
     @torch.inference_mode()
@@ -2360,7 +2380,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
             # Cache the dummy encoder outputs.
             self.encoder_cache["tmp"] = dict(enumerate(dummy_encoder_outputs))
 
+        # logger.info("STARTING HIDDEN STATES")
         hidden_states = self._dummy_run(self.max_num_tokens)
+        # logger.info("HIDDEN STATES")
         if get_pp_group().is_last_rank:
             sampler_output = self._dummy_sampler_run(hidden_states)
         else:
@@ -2389,7 +2411,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                 for _ in range(self.vllm_config.compilation_config.
                                cudagraph_num_of_warmups):
                     self._dummy_run(num_tokens, skip_attn=skip_attn)
+                # print("CUDAGRAPH CAPTURE START")
                 self._dummy_run(num_tokens, skip_attn=skip_attn)
+                # print("CUDAGRAPH CAPTURE END")
 
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]