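opt: skip FlashInfer sliding-window seq_lens/block-table manipulation unless max_num_blocks exceeds sliding_window // page_size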

Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2026-07-06 10:27:12 +08:00 · 2025-08-15 14:24:50 -07:00 · 2025-08-15 14:24:50 -07:00 · 2ad6985c49
commit 2ad6985c49
parent da03cb8f0b
1 changed files with 2 additions and 1 deletions
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -493,7 +493,8 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
        arange = self.block_table_arange[:max_num_blocks].unsqueeze(0)
        mask = arange < block_table_bounds_cpu.unsqueeze(1)
        if (self.sliding_window is not None and not use_cascade
-                and num_decodes > 0):
+                and num_decodes > 0 and
+                max_num_blocks > self.sliding_window // page_size):
            # NOTE(woosuk): Since FlashInfer's decode kernel doesn't skip the kv
            # outside the sliding window and only do masking, we manually
            # manipulate the seq_lens and block table for skipping.