From 95c40f9b09d71ab02f74c731e36b771a9f2c5f52 Mon Sep 17 00:00:00 2001 From: Bill Nell Date: Fri, 30 May 2025 02:33:58 +0000 Subject: [PATCH] hacks Signed-off-by: Bill Nell --- vllm/model_executor/layers/fused_moe/fused_batched_moe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py index a5a12732c5a83..313e7ec38814d 100644 --- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py @@ -635,7 +635,8 @@ def batched_moe_kernel_quantize_input( per_channel_quant: bool, block_shape: Optional[list[int]] = None, ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: - if (torch.compiler.is_compiling() + if (True or + torch.compiler.is_compiling() or torch.cuda.is_current_stream_capturing()): # Note: this does a bunch of extra work because expert_num_tokens is ignored # but it does support torch.compile + cudagraphs.