From 5d91d2b292be9b1d6b121d36d242d5077a031e4b Mon Sep 17 00:00:00 2001 From: maang-h <55082429+maang-h@users.noreply.github.com> Date: Wed, 3 Dec 2025 07:23:09 +0800 Subject: [PATCH] [Doc] Add allocate_slots parameter docs (#29777) Signed-off-by: maang Signed-off-by: maang-h <55082429+maang-h@users.noreply.github.com> Co-authored-by: Chen Zhang --- vllm/v1/core/kv_cache_manager.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vllm/v1/core/kv_cache_manager.py b/vllm/v1/core/kv_cache_manager.py index 3823384881cd3..33e8c81514c5f 100644 --- a/vllm/v1/core/kv_cache_manager.py +++ b/vllm/v1/core/kv_cache_manager.py @@ -230,6 +230,9 @@ class KVCacheManager: delay_cache_blocks: Whether to skip caching the blocks. This is used by P/D when allocating blocks used in a KV transfer which will complete in a future step. + num_encoder_tokens: The number of encoder tokens to allocate for + cross-attention in encoder-decoder models (e.g., Whisper). + For decoder-only models, this should be 0. Blocks layout: ```