From f3a21e9c6835f04ecb5da1b953cb650986bf2209 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Wed, 23 Apr 2025 16:50:05 +0100
Subject: [PATCH] `CacheConfig.block_size` should always be `int` when used
 (#17052)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 vllm/config.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index 641b221f5d347..f403654be1652 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1261,11 +1261,14 @@ PrefixCachingHashAlgo = Literal["builtin", "sha256"]
 class CacheConfig:
     """Configuration for the KV cache."""
-    block_size: Optional[BlockSize] = None
+    block_size: BlockSize = None  # type: ignore
     """Size of a contiguous cache block in number of tokens. This is ignored
     on neuron devices and set to `--max-model-len`. On CUDA devices, only
     block sizes up to 32 are supported. On HPU devices, block size defaults
     to 128.
-    """
+
+    This config has no static default. If left unspecified by the user, it will
+    be set in `Platform.check_and_update_configs()` based on the current
+    platform."""
     gpu_memory_utilization: float = 0.9
     """The fraction of GPU memory to be used for the model executor, which can
     range from 0 to 1. For example, a value of 0.5 would imply 50% GPU memory
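
Note: the new docstring describes a "no static default" pattern in which the
field is annotated as a plain type for downstream users but initialized to
`None`, then filled in by a platform hook before anything reads it. Below is a
minimal, self-contained sketch of that pattern. Only the method name
`check_and_update_configs()` comes from the docstring in the patch; the
`Platform` subclass, the `default_block_size` attribute, and the concrete
values are illustrative assumptions, not vLLM's actual implementation.

# Minimal sketch of the "no static default, set by the platform" pattern.
# Only `check_and_update_configs()` is named in the patch's docstring; the
# classes and default values below are illustrative assumptions, not vLLM's
# actual implementation.
from dataclasses import dataclass


@dataclass
class CacheConfig:
    # Annotated as `int` so downstream code can rely on an integer, even
    # though the value starts as `None` until a platform hook fills it in.
    block_size: int = None  # type: ignore


class Platform:
    # Hypothetical per-platform default block size (in tokens).
    default_block_size: int = 16

    def check_and_update_configs(self, cache_config: CacheConfig) -> None:
        # After this hook runs, `block_size` is always a real `int`, so
        # callers never have to handle `Optional[int]`.
        if cache_config.block_size is None:
            cache_config.block_size = self.default_block_size


class HpuPlatform(Platform):
    # The docstring above says block size defaults to 128 on HPU devices.
    default_block_size = 128


config = CacheConfig()
HpuPlatform().check_and_update_configs(config)
assert config.block_size == 128  # guaranteed to be an `int` from here on

The payoff of the `# type: ignore` at the declaration site is that every
downstream read of `block_size` type-checks as an `int`, so consumers need no
`Optional` narrowing; the type escape hatch is confined to the one line where
the `None` sentinel lives.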