[Bugfix][V1] Re-compute an entire block when fully cache hit (#11186)

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Cody Yu 2024-12-13 17:08:23 -08:00 committed by GitHub
parent 4b5b8a6a3b
commit 9855aea21b


@@ -199,9 +199,13 @@ class Scheduler:
         if num_new_tokens == 0:
             # This happens when prompt length is divisible by the block
             # size and all blocks are cached. Now we force to recompute
-            # the last token.
-            num_computed_tokens -= 1
-            num_new_tokens = 1
+            # the last block. Note that we have to re-compute an entire
+            # block because allocate_slots() assumes num_computed_tokens
+            # is always a multiple of the block size. This limitation
+            # can potentially be removed in the future to slightly
+            # improve the performance.
+            num_computed_tokens -= self.block_size
+            num_new_tokens = self.block_size
+            computed_blocks.pop()
         num_new_tokens = min(num_new_tokens, token_budget)
         assert num_new_tokens > 0
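The adjustment above can be sketched in isolation. The function below is a hypothetical, self-contained illustration (not the actual vLLM `Scheduler` method): when every prompt token is a prefix-cache hit, it rolls `num_computed_tokens` back by one full block, so the scheduler still has at least one token to compute and `num_computed_tokens` stays a multiple of the block size, matching the assumption attributed here to `allocate_slots()`.

```python
def adjust_for_full_cache_hit(num_prompt_tokens: int,
                              block_size: int,
                              token_budget: int) -> tuple[int, int]:
    """Return (num_computed_tokens, num_new_tokens) after the fix.

    Hypothetical sketch: assumes the prompt length is a multiple of
    block_size and that prefix caching matched every block.
    """
    # All prompt tokens were found in the prefix cache.
    num_computed_tokens = num_prompt_tokens
    num_new_tokens = num_prompt_tokens - num_computed_tokens  # == 0

    if num_new_tokens == 0:
        # Re-compute the last *block*, not just the last token, so that
        # num_computed_tokens remains a multiple of the block size.
        num_computed_tokens -= block_size
        num_new_tokens = block_size

    # Clamp to the scheduling budget, as in the diff above.
    num_new_tokens = min(num_new_tokens, token_budget)
    assert num_new_tokens > 0
    assert num_computed_tokens % block_size == 0
    return num_computed_tokens, num_new_tokens
```

For example, with a 32-token prompt, block size 16, and a large budget, the result is `(16, 16)`: the last block is scheduled for recomputation while the computed-token count stays block-aligned.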