From 5b0249b86ef44e99e458537c40b0d4de410821c2 Mon Sep 17 00:00:00 2001
From: Sage Moore <sage@neuralmagic.com>
Date: Fri, 30 May 2025 14:19:12 +0000
Subject: [PATCH] various fixes

---
 .../layers/fused_moe/fused_batched_moe.py     |  2 +-
 .../layers/fused_moe/pplx_prepare_finalize.py |  4 ++--
 vllm/v1/worker/ubatching.py                   | 20 +++++++++----------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index b28c441769204..006c2b504541d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -661,7 +661,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                 f"Hidden size mismatch {hidden_states.size(-1)} "
                 f"!= {w1.size(2)}")
 
-        print("in batched triton experts", hidden_states.shape, expert_num_tokens)
+        # print("in batched triton experts", hidden_states.shape, expert_num_tokens)
 
         assert hidden_states.is_contiguous(
         ), "Hidden_states must be contiguous"
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index 87d2745c20eb4..6811786147d1a 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -129,7 +129,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
         ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1
         yield_and_switch_from_compute_to_comm_impl(schedule="default")
         dispatch(True) # Send
-        torch.cuda.synchronize()
+        # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER SEND SYNC", flush=True)
         dispatch(False) # Recv
         # torch.cuda.synchronize()
@@ -176,7 +176,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
             )
         yield_and_switch_from_compute_to_comm_impl(schedule="default")
         combine(True)
-        torch.cuda.synchronize()
+        # torch.cuda.synchronize()
         # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True)
         combine(False)
         # torch.cuda.synchronize()
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index e17d57ee6aeeb..07ba978032314 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -113,29 +113,29 @@ class UBatchContext:
 
     def yield_and_switch_from_compute_to_comm(self):
         assert current_stream() == self.compute_stream
-        dp_rank = get_dp_group().rank_in_group 
-        print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True)
+        # dp_rank = get_dp_group().rank_in_group 
+        # print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True)
         self.ctx_valid_state()
-        # self._signal_compute_done()
+        self._signal_compute_done()
         self._cpu_yield()
         self.ctx_valid_state()
         assert self.current_stream == self.compute_stream
         self.update_stream(self.comm_stream)
-        print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True)
-        # self._wait_compute_done()
+        # print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True)
+        self._wait_compute_done()
 
     def yield_and_switch_from_comm_to_compute(self):
         assert current_stream() == self.comm_stream
-        dp_rank = get_dp_group().rank_in_group 
-        print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True)
+        # dp_rank = get_dp_group().rank_in_group 
+        # print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True)
         self.ctx_valid_state()
-        # self._signal_comm_done()
+        self._signal_comm_done()
         self._cpu_yield()
         self.ctx_valid_state()
         assert self.current_stream == self.comm_stream
         self.update_stream(self.compute_stream)
-        print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True)
-        # self._wait_comm_done()
+        # print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True)
+        self._wait_comm_done()
         
 
 _CURRENT_CONTEXT: dict = {}