From 7adeb4bfa8630773c84f9ad9e97830becd540896 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 24 Dec 2025 19:15:27 +0800
Subject: [PATCH] [Bugfix] Fix `max_model_len="auto"` handling (#31260)

Signed-off-by: DarkLight1337
---
 vllm/config/model.py           |   2 +-
 vllm/engine/arg_utils.py       |  41 +++++++-----
 vllm/v1/core/kv_cache_utils.py | 118 +++++++++++++++------------
 3 files changed, 79 insertions(+), 82 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index ce554b136cef3..a730aa8ad1b9c 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -164,7 +164,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = Field(default=None, gt=0)
+    max_model_len: int = Field(default=None, ge=-1)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a524a1611f008..1442c83a1504a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
         elif contains_type(type_hints, set):
             kwargs[name].update(collection_to_kwargs(type_hints, set))
         elif contains_type(type_hints, int):
-            kwargs[name]["type"] = int
-            # Special case for large integers
-            human_readable_ints = {
-                "max_model_len",
-                "max_num_batched_tokens",
-                "kv_cache_memory_bytes",
-            }
-            if name in human_readable_ints:
+            if name == "max_model_len":
+                kwargs[name]["type"] = human_readable_int_or_auto
+                kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
+            elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
                 kwargs[name]["type"] = human_readable_int
                 kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
+            else:
+                kwargs[name]["type"] = int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
         elif contains_type(type_hints, dict) and (
@@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str):
     raise NotImplementedError(msg)
 
 
-def human_readable_int(value):
+def human_readable_int(value: str) -> int:
     """Parse human-readable integers like '1k', '2M', etc.
     Including decimal values with decimal multipliers.
-    Also accepts -1 or 'auto' as a special value for auto-detection.
 
     Examples:
     - '1k' -> 1,000
     - '1K' -> 1,024
     - '25.6k' -> 25,600
-    - '-1' or 'auto' -> -1 (special value for auto-detection)
     """
     value = value.strip()
 
-    # Handle -1 or 'auto' as a special value for auto-detection
-    if value == "-1" or value.lower() == "auto":
-        return -1
-
     match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
     if match:
         decimal_multiplier = {
@@ -2092,3 +2084,22 @@ def human_readable_int(value):
 
     # Regular plain number.
     return int(value)
+
+
+def human_readable_int_or_auto(value: str) -> int:
+    """Parse human-readable integers like '1k', '2M', etc.
+    Including decimal values with decimal multipliers.
+    Also accepts -1 or 'auto' as a special value for auto-detection.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+    - '-1' or 'auto' -> -1 (special value for auto-detection)
+    """
+    value = value.strip()
+
+    if value == "-1" or value.lower() == "auto":
+        return -1
+
+    return human_readable_int(value)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 85afff38e486a..1480a1f798ea0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -606,6 +606,43 @@ def get_request_block_hasher(
     return request_block_hasher
 
 
+def _check_enough_kv_cache_memory(
+    available_memory: int,
+    get_needed_memory: Callable[[], int],
+    max_model_len: int,
+    estimate_max_model_len: Callable[[int], int],
+):
+    if available_memory <= 0:
+        raise ValueError(
+            "No available memory for the cache blocks. "
+            "Try increasing `gpu_memory_utilization` when initializing the engine. "
+            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            "for more details."
+        )
+
+    needed_memory = get_needed_memory()
+
+    if needed_memory > available_memory:
+        estimated_max_len = estimate_max_model_len(available_memory)
+        estimated_msg = ""
+        if estimated_max_len > 0:
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}. "
+            )
+
+        raise ValueError(
+            f"To serve at least one request with the model's max seq len "
+            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
+            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
+            f"when initializing the engine. "
+            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            f"for more details."
+        )
+
+
 def max_memory_usage_bytes(
     vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
 ) -> int:
@@ -688,43 +725,12 @@ def check_enough_kv_cache_memory(
     """
 
     # No need to check for available memory if the kv_cache_spec is empty
-    if not kv_cache_spec:
-        return
-
-    if available_memory <= 0:
-        raise ValueError(
-            "No available memory for the cache blocks. "
-            "Try increasing `gpu_memory_utilization` when "
-            "initializing the engine. "
-            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            "for more details."
-        )
-
-    max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
-
-    if needed_memory > available_memory:
-        # Estimate the maximum model length that can fit in the available memory
-        estimated_max_len = estimate_max_model_len(
-            vllm_config, kv_cache_spec, available_memory
-        )
-        estimated_msg = ""
-        if estimated_max_len > 0:
-            estimated_msg = (
-                "Based on the available memory, "
-                f"the estimated maximum model length is {estimated_max_len}."
-            )
-
-        raise ValueError(
-            f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-            f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
-            f"{estimated_msg} "
-            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
-            f"when initializing the engine. "
-            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            f"for more details."
+    if kv_cache_spec:
+        _check_enough_kv_cache_memory(
+            available_memory,
+            lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
+            vllm_config.model_config.max_model_len,
+            lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
         )
 
 
@@ -1505,36 +1511,16 @@ def get_kv_cache_configs(
     # Check if the available memory is enough (using min across all workers).
     # We use the global groups to correctly account for padding.
     if global_kv_cache_groups:
-        min_available_memory = min(available_memory)
-        if min_available_memory <= 0:
-            raise ValueError(
-                "No available memory for the cache blocks. "
-                "Try increasing `gpu_memory_utilization` when "
-                "initializing the engine."
-            )
-        max_model_len = vllm_config.model_config.max_model_len
-        needed_memory = _max_memory_usage_bytes_from_groups(
-            vllm_config, global_kv_cache_groups
+        _check_enough_kv_cache_memory(
+            min(available_memory),
+            lambda: _max_memory_usage_bytes_from_groups(
+                vllm_config, global_kv_cache_groups
+            ),
+            vllm_config.model_config.max_model_len,
+            lambda am: _estimate_max_model_len_from_groups(
+                vllm_config, global_kv_cache_groups, am
+            ),
         )
-        if needed_memory > min_available_memory:
-            estimated_max_len = _estimate_max_model_len_from_groups(
-                vllm_config, global_kv_cache_groups, min_available_memory
-            )
-            estimated_msg = ""
-            if estimated_max_len > 0:
-                estimated_msg = (
-                    f"Based on the available memory, the estimated maximum "
-                    f"model length is {estimated_max_len}. "
-                )
-            raise ValueError(
-                f"To serve at least one request with the models's max seq len "
-                f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-                f"cache is needed, which is larger than the available KV cache "
-                f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
-                f"{estimated_msg}"
-                f"Try increasing `gpu_memory_utilization` or decreasing "
-                f"`max_model_len` when initializing the engine."
-            )
 
     kv_cache_configs: list[KVCacheConfig] = []
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
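
For reference, below is a minimal sketch of the CLI behaviour the arg_utils.py change above is aiming for. The two parsers are simplified stand-ins for `human_readable_int` and `human_readable_int_or_auto` (the real helpers carry extra validation and error reporting), and the argparse wiring is illustrative only, not vLLM's actual `EngineArgs` plumbing:

import argparse
import re


def human_readable_int(value: str) -> int:
    # Simplified stand-in: lowercase suffixes are decimal, uppercase are binary.
    value = value.strip()
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal = {"k": 10**3, "m": 10**6, "g": 10**9, "t": 10**12}
        binary = {"K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}
        number, suffix = match.groups()
        multiplier = decimal.get(suffix) or binary[suffix]
        return int(float(number) * multiplier)
    return int(value)  # plain integer


def human_readable_int_or_auto(value: str) -> int:
    # After the patch, only --max-model-len goes through this wrapper.
    if value.strip() == "-1" or value.strip().lower() == "auto":
        return -1
    return human_readable_int(value)


parser = argparse.ArgumentParser()
parser.add_argument("--max-model-len", type=human_readable_int_or_auto)
parser.add_argument("--max-num-batched-tokens", type=human_readable_int)

args = parser.parse_args(["--max-model-len", "auto", "--max-num-batched-tokens", "8k"])
assert args.max_model_len == -1             # "auto" (or "-1") maps to -1: derive the length from the model config
assert args.max_num_batched_tokens == 8000  # '8k' -> 8,000; '1K' would give 1,024

The point of the split is that 'auto'/-1 keeps working for `--max-model-len` (where `Field(ge=-1)` now admits it), while `--max-num-batched-tokens` and `--kv-cache-memory-bytes` reject it with a normal parse error instead of silently mapping to -1.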
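
And a toy walkthrough of the control flow the new shared `_check_enough_kv_cache_memory` helper gives both call sites (the helper name `check_kv_cache_fits`, the GiB figures, and the estimator below are invented for illustration): each caller plugs in its own accounting as callables, and the max-length estimate is only computed once the check has already failed:

GiB = 1 << 30


def check_kv_cache_fits(available_memory, get_needed_memory, max_model_len, estimate_max_model_len):
    # Mirrors _check_enough_kv_cache_memory above, with shortened messages.
    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks.")
    needed_memory = get_needed_memory()
    if needed_memory > available_memory:
        estimated_max_len = estimate_max_model_len(available_memory)
        hint = f" Estimated maximum model length: {estimated_max_len}." if estimated_max_len > 0 else ""
        raise ValueError(
            f"max_model_len={max_model_len} needs {needed_memory / GiB:.2f} GiB of KV cache "
            f"but only {available_memory / GiB:.2f} GiB is available.{hint}"
        )


try:
    check_kv_cache_fits(
        available_memory=4 * GiB,
        # check_enough_kv_cache_memory passes per-spec accounting here,
        # get_kv_cache_configs passes the per-group variant.
        get_needed_memory=lambda: 6 * GiB,
        max_model_len=8192,
        # Stand-in for the estimate_max_model_len / _estimate_max_model_len_from_groups lambdas.
        estimate_max_model_len=lambda mem: 8192 * mem // (6 * GiB),
    )
except ValueError as exc:
    print(exc)  # ... needs 6.00 GiB ... only 4.00 GiB is available. Estimated maximum model length: 5461.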