From 2ad6985c49cb2cfffe22af944c4fe06c461852fc Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Fri, 15 Aug 2025 14:24:50 -0700
Subject: [PATCH] [FlashInfer] Skip sliding-window trimming when blocks fit in window

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
---
 vllm/v1/attention/backends/flashinfer.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index a221ed2530445..909abe6d4324d 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -493,7 +493,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         arange = self.block_table_arange[:max_num_blocks].unsqueeze(0)
         mask = arange < block_table_bounds_cpu.unsqueeze(1)
         if (self.sliding_window is not None and not use_cascade
-                and num_decodes > 0):
+                and num_decodes > 0 and
+                max_num_blocks > self.sliding_window // page_size):
             # NOTE(woosuk): Since FlashInfer's decode kernel doesn't skip the kv
             # outside the sliding window and only do masking, we manually
             # manipulate the seq_lens and block table for skipping.