From 952f3c5c1e2b968e475430ec38582fcbe7e3aa84 Mon Sep 17 00:00:00 2001
From: Lucas Wilkinson <lwilkins@redhat.com>
Date: Fri, 23 May 2025 18:18:05 +0000
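Subject: [PATCH] tone down prints

Comment out the per-ubatch debug prints in the fused MoE layer, the
modular MoE kernel, the pplx prepare/finalize path, the DeepSeek-V2
model and the ubatching helpers.

Note that this patch is not print-only. It also makes two functional
changes in vllm/v1/worker/ubatching.py:

  * in UBatchContext, the `if self.gpu_wait_on_launch:` check and its
    `self.gpu_stream_wait()` call just before `return self` are
    commented out;
  * in make_ubatch_context_chain, the GPU events are now created with
    `torch.cuda.Event(blocking=True)` instead of `torch.cuda.Event()`.

Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>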
---
 vllm/model_executor/layers/fused_moe/layer.py        |  8 ++++----
 .../layers/fused_moe/modular_kernel.py               | 12 ++++++------
 .../layers/fused_moe/pplx_prepare_finalize.py        | 12 ++++++------
 vllm/model_executor/models/deepseek_v2.py            |  8 ++++----
 vllm/v1/worker/ubatching.py                          | 12 ++++++------
 5 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py
index 4eea5714e1be3..cd7b1a2c1cfa6 100644
--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -1300,8 +1300,8 @@ class FusedMoE(torch.nn.Module):
         max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu
         moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE
 
-        if (ubatch_ctdx := get_current_ubatch_context()) is not None:
-            print("in fused moe, ubatch:", ubatch_ctdx.id, "chunk size:", max_tokens_across_dp, "moe_dp_chunk_size_per_rank", moe_dp_chunk_size_per_rank)
+        # if (ubatch_ctdx := get_current_ubatch_context()) is not None:
+        #     print("in fused moe, ubatch:", ubatch_ctdx.id, "chunk size:", max_tokens_across_dp, "moe_dp_chunk_size_per_rank", moe_dp_chunk_size_per_rank)
 
         num_tokens = full_hidden_states.size(0)
         for chunk_start_ in range(0, max_tokens_across_dp,
@@ -1401,8 +1401,8 @@ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor,
     forward_context: ForwardContext = get_forward_context()
     self = forward_context.no_compile_layers[layer_name]
     assert self.quant_method is not None
-    if (ubatch_ctx := get_current_ubatch_context()) is not None:
-        print("in fused moe, ubatch:", ubatch_ctx.id, self)
+    # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+    #     print("in fused moe, ubatch:", ubatch_ctx.id, self)
 
     return self.forward_impl(hidden_states, router_logits)
 
diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py
index 56317f6ee6adc..47d0880ee8071 100644
--- a/vllm/model_executor/layers/fused_moe/modular_kernel.py
+++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py
@@ -336,15 +336,15 @@ class FusedMoEModularKernel(torch.nn.Module):
                                  device=a1.device,
                                  dtype=workspace_dtype)
 
-        if (ubatch_ctx := get_current_ubatch_context()) is not None:
-            print("in modular moe, ubatch:", ubatch_ctx.id)
+        # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+        #     print("in modular moe, ubatch:", ubatch_ctx.id)
 
         a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare(
             a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts,
             expert_map, apply_router_weight_on_input)
 
-        if (ubatch_ctx := get_current_ubatch_context()) is not None:
-            print("in modular moe2, ubatch:", ubatch_ctx.id, self.fused_experts)
+        # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+        #     print("in modular moe2, ubatch:", ubatch_ctx.id, self.fused_experts)
 
         print("pre synchronize")
         torch.cuda.synchronize(a1.device)
@@ -369,8 +369,8 @@ class FusedMoEModularKernel(torch.nn.Module):
             expert_num_tokens=expert_num_tokens,
         )
 
-        if (ubatch_ctx := get_current_ubatch_context()) is not None:
-            print("in modular moe3, ubatch:", ubatch_ctx.id, self.fused_experts)
+        # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+        #     print("in modular moe3, ubatch:", ubatch_ctx.id, self.fused_experts)
 
         self.prepare_finalize.finalize(output, fused_out, topk_weights,
                                        topk_ids, apply_router_weight_on_input)
diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
index a179a4d6d6f2b..f5276637326b0 100644
--- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
+++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py
@@ -119,14 +119,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
         
-        print("Dispatch pre-wait")
+        #print("Dispatch pre-wait")
         if (ubatch_ctx := get_current_ubatch_context()) is not None:
             ubatch_ctx.gpu_stream_wait()
-        print("Dispatch launched")
+        #print("Dispatch launched")
         dispatch(True) # Send
         yield_impl(gpu_wait=False)
         dispatch(False) # Recv
-        print("Finished dispatch")
+        #print("Finished dispatch")
 
         return expert_x, expert_x_scale, expert_num_tokens
 
@@ -164,11 +164,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
                 do_recv=not send,
             )
             
-        print("Combine pre-wait")
+        #print("Combine pre-wait")
         if (ubatch_ctx := get_current_ubatch_context()) is not None:
             ubatch_ctx.gpu_stream_wait()
         combine(True)
-        print("Combine launched")
+        #print("Combine launched")
         yield_impl(gpu_wait=False)
         combine(False)
-        print("Finished combine")
+        #print("Finished combine")
diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py
index eb1eec53fcbd0..6b153a6161eff 100644
--- a/vllm/model_executor/models/deepseek_v2.py
+++ b/vllm/model_executor/models/deepseek_v2.py
@@ -564,8 +564,8 @@ class DeepseekV2DecoderLayer(nn.Module):
         hidden_states: torch.Tensor,
         residual: Optional[torch.Tensor],
     ) -> torch.Tensor:
-        if (ubatch_ctx := get_current_ubatch_context()) is not None:
-            print("in decoder, ubatch:", ubatch_ctx.id)
+        # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+        #     print("in decoder, ubatch:", ubatch_ctx.id)
         # Self Attention
         if residual is None:
             residual = hidden_states
@@ -659,8 +659,8 @@ class DeepseekV2Model(nn.Module):
         intermediate_tensors: Optional[IntermediateTensors],
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, IntermediateTensors]:
-        if (ubatch_ctx := get_current_ubatch_context()) is not None:
-            print("in forward, ubatch:", ubatch_ctx.id)
+        # if (ubatch_ctx := get_current_ubatch_context()) is not None:
+        #     print("in forward, ubatch:", ubatch_ctx.id)
         
         if get_pp_group().is_first_rank:
             if inputs_embeds is not None:
diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py
index d24435f227cc1..c4026f7eae014 100644
--- a/vllm/v1/worker/ubatching.py
+++ b/vllm/v1/worker/ubatching.py
@@ -44,8 +44,8 @@ class UBatchContext:
         self.original_stream.record_event(start_event)
         self.stream.wait_event(start_event)
         print("Starting ubatch %d" % self.id)
-        if self.gpu_wait_on_launch:
-            self.gpu_stream_wait()
+        # if self.gpu_wait_on_launch:
+        #     self.gpu_stream_wait()
         return self
 
     def __exit__(self, exc_type, exc_val, exc_tb):
@@ -84,10 +84,10 @@ class UBatchContext:
         self.stream.wait_event(self.gpu_wait_event)
 
     def _yield(self, gpu_wait: bool = True):
-        print("Yielding ubatch %d" % self.id)
+        #print("Yielding ubatch %d" % self.id)
         self._signal()
         self._cpu_wait()
-        print("Resuming ubatch %d" % self.id)
+        #print("Resuming ubatch %d" % self.id)
         if gpu_wait:
             self.gpu_stream_wait()
 
@@ -115,7 +115,7 @@ def get_current_ubatch_context() -> Optional[UBatchContext]:
 def yield_impl(schedule="default", gpu_wait: bool = True):
     # Perform the barrier if a context exists for this thread
     ctx = get_current_ubatch_context() 
-    print("you are in yield_impl", ctx)
+    #print("you are in yield_impl", ctx)
     if ctx is not None:
         ctx._yield(gpu_wait=gpu_wait)
 
@@ -146,7 +146,7 @@ def make_ubatch_context_chain(
     Create a context manager for micro-batching synchronization.
     """
     cpu_events = [threading.Event() for _ in range(num_micro_batches)]
-    gpu_events = [torch.cuda.Event() for _ in range(num_micro_batches)]
+    gpu_events = [torch.cuda.Event(blocking=True) for _ in range(num_micro_batches)]
     device = device or torch.cuda.current_device()
     
     ctxs = []