mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 13:45:28 +08:00
[Bugfix] lookahead block table with cuda graph max capture (#8340)
[Bugfix] Ensure multistep lookahead allocation is compatible with cuda graph max capture (#8340)
This commit is contained in:
parent
b1f3e18958
commit
22f3a4bc6c
@ -471,9 +471,19 @@ class FlashAttentionMetadataBuilder(
|
|||||||
# The shape of graph_block_tables is
|
# The shape of graph_block_tables is
|
||||||
# [max batch size, max context len // block size].
|
# [max batch size, max context len // block size].
|
||||||
input_block_tables = self.runner.graph_block_tables[:batch_size]
|
input_block_tables = self.runner.graph_block_tables[:batch_size]
|
||||||
|
max_blocks = input_block_tables.shape[1]
|
||||||
for i, block_table in enumerate(self.block_tables):
|
for i, block_table in enumerate(self.block_tables):
|
||||||
if block_table:
|
if block_table:
|
||||||
input_block_tables[i, :len(block_table)] = block_table
|
num_blocks = len(block_table)
|
||||||
|
if num_blocks <= max_blocks:
|
||||||
|
input_block_tables[i, :num_blocks] = block_table
|
||||||
|
else:
|
||||||
|
# It may be possible to have more blocks allocated due
|
||||||
|
# to lookahead slots of multi-step; however, they are
|
||||||
|
# not used anyway, so they can be safely ignored.
|
||||||
|
input_block_tables[
|
||||||
|
i, :max_blocks] = block_table[:max_blocks]
|
||||||
|
|
||||||
block_tables = torch.from_numpy(input_block_tables).to(
|
block_tables = torch.from_numpy(input_block_tables).to(
|
||||||
device=device, non_blocking=True)
|
device=device, non_blocking=True)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user