From 952f3c5c1e2b968e475430ec38582fcbe7e3aa84 Mon Sep 17 00:00:00 2001 From: Lucas Wilkinson Date: Fri, 23 May 2025 18:18:05 +0000 Subject: [PATCH] tone down prints Signed-off-by: Lucas Wilkinson --- vllm/model_executor/layers/fused_moe/layer.py | 8 ++++---- .../layers/fused_moe/modular_kernel.py | 12 ++++++------ .../layers/fused_moe/pplx_prepare_finalize.py | 12 ++++++------ vllm/model_executor/models/deepseek_v2.py | 8 ++++---- vllm/v1/worker/ubatching.py | 12 ++++++------ 5 files changed, 26 insertions(+), 26 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index 4eea5714e1be3..cd7b1a2c1cfa6 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -1300,8 +1300,8 @@ class FusedMoE(torch.nn.Module): max_tokens_across_dp = ctx.dp_metadata.max_tokens_across_dp_cpu moe_dp_chunk_size_per_rank = MOE_DP_CHUNK_SIZE - if (ubatch_ctdx := get_current_ubatch_context()) is not None: - print("in fused moe, ubatch:", ubatch_ctdx.id, "chunk size:", max_tokens_across_dp, "moe_dp_chunk_size_per_rank", moe_dp_chunk_size_per_rank) + # if (ubatch_ctdx := get_current_ubatch_context()) is not None: + # print("in fused moe, ubatch:", ubatch_ctdx.id, "chunk size:", max_tokens_across_dp, "moe_dp_chunk_size_per_rank", moe_dp_chunk_size_per_rank) num_tokens = full_hidden_states.size(0) for chunk_start_ in range(0, max_tokens_across_dp, @@ -1401,8 +1401,8 @@ def moe_forward(hidden_states: torch.Tensor, router_logits: torch.Tensor, forward_context: ForwardContext = get_forward_context() self = forward_context.no_compile_layers[layer_name] assert self.quant_method is not None - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in fused moe, ubatch:", ubatch_ctx.id, self) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in fused moe, ubatch:", ubatch_ctx.id, self) return self.forward_impl(hidden_states, router_logits) diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 56317f6ee6adc..47d0880ee8071 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -336,15 +336,15 @@ class FusedMoEModularKernel(torch.nn.Module): device=a1.device, dtype=workspace_dtype) - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in modular moe, ubatch:", ubatch_ctx.id) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in modular moe, ubatch:", ubatch_ctx.id) a1q, a1q_scale, expert_num_tokens = self.prepare_finalize.prepare( a1, a1_scale, a2_scale, topk_weights, topk_ids, global_num_experts, expert_map, apply_router_weight_on_input) - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in modular moe2, ubatch:", ubatch_ctx.id, self.fused_experts) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in modular moe2, ubatch:", ubatch_ctx.id, self.fused_experts) print("pre synchronize") torch.cuda.synchronize(a1.device) @@ -369,8 +369,8 @@ class FusedMoEModularKernel(torch.nn.Module): expert_num_tokens=expert_num_tokens, ) - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in modular moe3, ubatch:", ubatch_ctx.id, self.fused_experts) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in modular moe3, ubatch:", ubatch_ctx.id, self.fused_experts) self.prepare_finalize.finalize(output, fused_out, topk_weights, topk_ids, apply_router_weight_on_input) diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index a179a4d6d6f2b..f5276637326b0 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -119,14 +119,14 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=not send, ) - print("Dispatch pre-wait") + #print("Dispatch pre-wait") if (ubatch_ctx := get_current_ubatch_context()) is not None: ubatch_ctx.gpu_stream_wait() - print("Dispatch launched") + #print("Dispatch launched") dispatch(True) # Send yield_impl(gpu_wait=False) dispatch(False) # Recv - print("Finished dispatch") + #print("Finished dispatch") return expert_x, expert_x_scale, expert_num_tokens @@ -164,11 +164,11 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): do_recv=not send, ) - print("Combine pre-wait") + #print("Combine pre-wait") if (ubatch_ctx := get_current_ubatch_context()) is not None: ubatch_ctx.gpu_stream_wait() combine(True) - print("Combine launched") + #print("Combine launched") yield_impl(gpu_wait=False) combine(False) - print("Finished combine") + #print("Finished combine") diff --git a/vllm/model_executor/models/deepseek_v2.py b/vllm/model_executor/models/deepseek_v2.py index eb1eec53fcbd0..6b153a6161eff 100644 --- a/vllm/model_executor/models/deepseek_v2.py +++ b/vllm/model_executor/models/deepseek_v2.py @@ -564,8 +564,8 @@ class DeepseekV2DecoderLayer(nn.Module): hidden_states: torch.Tensor, residual: Optional[torch.Tensor], ) -> torch.Tensor: - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in decoder, ubatch:", ubatch_ctx.id) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in decoder, ubatch:", ubatch_ctx.id) # Self Attention if residual is None: residual = hidden_states @@ -659,8 +659,8 @@ class DeepseekV2Model(nn.Module): intermediate_tensors: Optional[IntermediateTensors], inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: - if (ubatch_ctx := get_current_ubatch_context()) is not None: - print("in forward, ubatch:", ubatch_ctx.id) + # if (ubatch_ctx := get_current_ubatch_context()) is not None: + # print("in forward, ubatch:", ubatch_ctx.id) if get_pp_group().is_first_rank: if inputs_embeds is not None: diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index d24435f227cc1..c4026f7eae014 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -44,8 +44,8 @@ class UBatchContext: self.original_stream.record_event(start_event) self.stream.wait_event(start_event) print("Starting ubatch %d" % self.id) - if self.gpu_wait_on_launch: - self.gpu_stream_wait() + # if self.gpu_wait_on_launch: + # self.gpu_stream_wait() return self def __exit__(self, exc_type, exc_val, exc_tb): @@ -84,10 +84,10 @@ class UBatchContext: self.stream.wait_event(self.gpu_wait_event) def _yield(self, gpu_wait: bool = True): - print("Yielding ubatch %d" % self.id) + #print("Yielding ubatch %d" % self.id) self._signal() self._cpu_wait() - print("Resuming ubatch %d" % self.id) + #print("Resuming ubatch %d" % self.id) if gpu_wait: self.gpu_stream_wait() @@ -115,7 +115,7 @@ def get_current_ubatch_context() -> Optional[UBatchContext]: def yield_impl(schedule="default", gpu_wait: bool = True): # Perform the barrier if a context exists for this thread ctx = get_current_ubatch_context() - print("you are in yield_impl", ctx) + #print("you are in yield_impl", ctx) if ctx is not None: ctx._yield(gpu_wait=gpu_wait) @@ -146,7 +146,7 @@ def make_ubatch_context_chain( Create a context manager for micro-batching synchronization. """ cpu_events = [threading.Event() for _ in range(num_micro_batches)] - gpu_events = [torch.cuda.Event() for _ in range(num_micro_batches)] + gpu_events = [torch.cuda.Event(blocking=True) for _ in range(num_micro_batches)] device = device or torch.cuda.current_device() ctxs = []