From d1007767c5f5dd58752f87f39ca4e7847ab69432 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Thu, 2 Oct 2025 19:30:37 -0400
Subject: [PATCH] [Bugfix] Disable cascade attention with FlashInfer (#26130)

Signed-off-by: mgoin
Signed-off-by: Michael Goin
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: simon-mo
---
 vllm/v1/attention/backends/flashinfer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 688e681f0591..15a252734d4d 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -29,7 +29,6 @@ from vllm.utils.flashinfer import (can_use_trtllm_attention,
                                    flashinfer_disable_q_quantization,
                                    supports_trtllm_attention,
                                    use_trtllm_attention)
-from vllm.v1.attention.backends.flash_attn import use_cascade_attention
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.v1.attention.backends.utils import (AttentionCGSupport,
@@ -677,7 +676,9 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             # TODO: The cascade wrapper currently does not support setting
             # kv cache dtype to something different from query dtype.
             return False
-        return use_cascade_attention(*args, **kwargs)
+        # TODO: Cascade attention doesn't work, disable it for now
+        # return use_cascade_attention(*args, **kwargs)
+        return False
 
 
 class FlashInferImpl(AttentionImpl):
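
Net effect of the change: the use_cascade_attention hook on FlashInferMetadataBuilder
now returns False unconditionally, so the engine never selects the cascade
(shared-prefix) attention path when the FlashInfer backend is in use, and the
now-unused import from the flash_attn backend is dropped. Below is a minimal
sketch of how the patched hook reads; the method signature and the dtype check
are inferred from the hunk's context lines rather than shown in full in the diff:

    # Sketch only: `...` stands in for the pre-existing check that the
    # kv cache dtype differs from the query dtype (see the hunk context).
    def use_cascade_attention(self, *args, **kwargs) -> bool:
        if ...:
            # TODO: The cascade wrapper currently does not support setting
            # kv cache dtype to something different from query dtype.
            return False
        # TODO: Cascade attention doesn't work, disable it for now
        # return use_cascade_attention(*args, **kwargs)
        return False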