diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
index a5a12732c5a83..313e7ec38814d 100644
--- a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -635,7 +635,11 @@ def batched_moe_kernel_quantize_input(
     per_channel_quant: bool,
     block_shape: Optional[list[int]] = None,
 ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
-    if (torch.compiler.is_compiling()
+    # NOTE(review): `True or ...` forces this branch unconditionally, making the
+    # two checks below dead code. Presumably intentional (the capture-safe path
+    # is also correct in eager mode — see Note below), but it reads like a debug
+    # leftover. TODO: either delete the dead conditions or drop the `True`.
+    if (True or
+        torch.compiler.is_compiling()
         or torch.cuda.is_current_stream_capturing()):
         # Note: this does a bunch of extra work because expert_num_tokens is ignored
         # but it does support torch.compile + cudagraphs.