From c88481913599daff6ca293bf778a620213e467c9 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon
Date: Mon, 8 Jan 2024 10:11:06 -0800
Subject: [PATCH] Fix eager mode performance (#2377)

---
 vllm/worker/model_runner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index be2803089f51b..bbadc0e0186a5 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -235,9 +235,11 @@ class ModelRunner:
                     input_block_tables[i, :len(block_table)] = block_table
             block_tables = torch.tensor(input_block_tables, device="cuda")
         else:
+            max_block_table_len = (max_context_len + self.block_size -
+                                   1) // self.block_size
             block_tables = _make_tensor_with_pad(
                 block_tables,
-                max_len=max_context_len,
+                max_len=max_block_table_len,
                 pad=0,
                 dtype=torch.int,
                 device="cuda",