From 3d5f1c864051f3d8462537a1b108e51946e7c2a1 Mon Sep 17 00:00:00 2001 From: Chen Zhang Date: Thu, 2 Oct 2025 11:48:31 -0700 Subject: [PATCH] [Mamba][KVCacheManager] Simplify kv cache manage logic for mamba + MTP (#25119) Signed-off-by: Chen Zhang --- vllm/v1/core/single_type_kv_cache_manager.py | 29 +++----------------- 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py index 4ecd9c8157e2..27ea1c4db2a5 100644 --- a/vllm/v1/core/single_type_kv_cache_manager.py +++ b/vllm/v1/core/single_type_kv_cache_manager.py @@ -565,35 +565,14 @@ class MambaManager(SingleTypeKVCacheManager): def get_num_blocks_to_allocate( self, request_id: str, num_tokens: int, new_computed_blocks: list[KVCacheBlock]) -> int: - """ - Get the number of blocks needed to be allocated for the request. - - Args: - request_id: The request ID. - num_tokens: The total number of tokens that need a slot (including - tokens that are already allocated). - new_computed_blocks: The new computed blocks just hitting the - prefix caching. - - Returns: - The number of blocks - """ - + # Allocate extra `num_speculative_blocks` blocks for + # speculative decoding (MTP/EAGLE) with linear attention. assert isinstance(self.kv_cache_spec, MambaSpec) if self.kv_cache_spec.num_speculative_blocks > 0: num_tokens += (self.kv_cache_spec.block_size * self.kv_cache_spec.num_speculative_blocks) - num_required_blocks = cdiv(num_tokens, self.block_size) - num_new_blocks = (num_required_blocks - len(new_computed_blocks) - - len(self.req_to_blocks[request_id])) - # If a computed block of a request is an eviction candidate (in the - # free queue and ref_cnt == 0), it will be changed from a free block - # to a computed block when the request is allocated, so we also count - # it as needed to be allocated. - num_evictable_computed_blocks = sum( - blk.ref_cnt == 0 and not blk.is_null - for blk in new_computed_blocks) - return num_new_blocks + num_evictable_computed_blocks + return super().get_num_blocks_to_allocate(request_id, num_tokens, + new_computed_blocks) def allocate_new_blocks(self, request_id: str, num_tokens: int) -> list[KVCacheBlock]: