From 5b0249b86ef44e99e458537c40b0d4de410821c2 Mon Sep 17 00:00:00 2001 From: Sage Moore Date: Fri, 30 May 2025 14:19:12 +0000 Subject: [PATCH] various fixes --- .../layers/fused_moe/fused_batched_moe.py | 2 +- .../layers/fused_moe/pplx_prepare_finalize.py | 4 ++-- vllm/v1/worker/ubatching.py | 20 +++++++++---------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index b28c441769204..006c2b504541d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -661,7 +661,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute): f"Hidden size mismatch {hidden_states.size(-1)} " f"!= {w1.size(2)}") - print("in batched triton experts", hidden_states.shape, expert_num_tokens) + # print("in batched triton experts", hidden_states.shape, expert_num_tokens) assert hidden_states.is_contiguous( ), "Hidden_states must be contiguous" diff --git a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py index 87d2745c20eb4..6811786147d1a 100644 --- a/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py @@ -129,7 +129,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ubatch_id = ubatch_ctx.id if ubatch_ctx is not None else -1 yield_and_switch_from_compute_to_comm_impl(schedule="default") dispatch(True) # Send - torch.cuda.synchronize() + # torch.cuda.synchronize() # print(f"{ubatch_id} AFTER SEND SYNC", flush=True) dispatch(False) # Recv # torch.cuda.synchronize() @@ -176,7 +176,7 @@ class PplxPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize): ) yield_and_switch_from_compute_to_comm_impl(schedule="default") combine(True) - torch.cuda.synchronize() + # torch.cuda.synchronize() # print(f"{ubatch_id} AFTER COMBINE SEND SYNC", flush=True) combine(False) # torch.cuda.synchronize() diff --git a/vllm/v1/worker/ubatching.py b/vllm/v1/worker/ubatching.py index e17d57ee6aeeb..07ba978032314 100644 --- a/vllm/v1/worker/ubatching.py +++ b/vllm/v1/worker/ubatching.py @@ -113,29 +113,29 @@ class UBatchContext: def yield_and_switch_from_compute_to_comm(self): assert current_stream() == self.compute_stream - dp_rank = get_dp_group().rank_in_group - print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True) + # dp_rank = get_dp_group().rank_in_group + # print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True) self.ctx_valid_state() - # self._signal_compute_done() + self._signal_compute_done() self._cpu_yield() self.ctx_valid_state() assert self.current_stream == self.compute_stream self.update_stream(self.comm_stream) - print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True) - # self._wait_compute_done() + # print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True) + self._wait_compute_done() def yield_and_switch_from_comm_to_compute(self): assert current_stream() == self.comm_stream - dp_rank = get_dp_group().rank_in_group - print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True) + # dp_rank = get_dp_group().rank_in_group + # print(f"DP: {dp_rank} UB: {self.id} Yield and switch from {self.stream_string()}", flush=True) self.ctx_valid_state() - # self._signal_comm_done() + self._signal_comm_done() self._cpu_yield() self.ctx_valid_state() assert self.current_stream == self.comm_stream self.update_stream(self.compute_stream) - print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True) - # self._wait_comm_done() + # print(f"DP: {dp_rank} UB: {self.id} Resuming on stream {self.stream_string()}", flush=True) + self._wait_comm_done() _CURRENT_CONTEXT: dict = {}