[Bugfix] lookahead block table with cuda graph max capture (#8340)

[Bugfix] Ensure multistep lookahead allocation is compatible with cuda graph max capture (#8340)
2025-12-15 13:45:28 +08:00 · 2024-09-10 19:00:35 -04:00 · 2024-09-10 19:00:35 -04:00 · 22f3a4bc6c
commit 22f3a4bc6c
parent b1f3e18958
1 changed files with 11 additions and 1 deletions
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@ -471,9 +471,19 @@ class FlashAttentionMetadataBuilder(
            # The shape of graph_block_tables is
            # [max batch size, max context len // block size].
            input_block_tables = self.runner.graph_block_tables[:batch_size]
            max_blocks = input_block_tables.shape[1]
            for i, block_table in enumerate(self.block_tables):
                if block_table:
-                    input_block_tables[i, :len(block_table)] = block_table
+                    num_blocks = len(block_table)
                    if num_blocks <= max_blocks:
                        input_block_tables[i, :num_blocks] = block_table
                    else:
                        # It may be possible to have more blocks allocated due
                        # to lookahead slots of multi-step, however, they are
                        # not used anyway, so can be safely ignored.
                        input_block_tables[
                            i, :max_blocks] = block_table[:max_blocks]
            block_tables = torch.from_numpy(input_block_tables).to(
                device=device, non_blocking=True)
        else: