From 7adeb4bfa8630773c84f9ad9e97830becd540896 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Wed, 24 Dec 2025 19:15:27 +0800
Subject: [PATCH] [Bugfix] Fix `max_model_len="auto"` handling (#31260)

Signed-off-by: DarkLight1337
---
 vllm/config/model.py           |   2 +-
 vllm/engine/arg_utils.py       |  41 +++++++-----
 vllm/v1/core/kv_cache_utils.py | 118 +++++++++++++++------------
 3 files changed, 79 insertions(+), 82 deletions(-)

diff --git a/vllm/config/model.py b/vllm/config/model.py
index ce554b136cef3..a730aa8ad1b9c 100644
--- a/vllm/config/model.py
+++ b/vllm/config/model.py
@@ -164,7 +164,7 @@ class ModelConfig:
     """The specific revision to use for the tokenizer on the Hugging Face Hub.
     It can be a branch name, a tag name, or a commit id. If unspecified, will
     use the default version."""
-    max_model_len: int = Field(default=None, gt=0)
+    max_model_len: int = Field(default=None, ge=-1)
     """Model context length (prompt and output). If unspecified, will be
     automatically derived from the model config.
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a524a1611f008..1442c83a1504a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -297,16 +297,14 @@ def _compute_kwargs(cls: ConfigType) -> dict[str, dict[str, Any]]:
         elif contains_type(type_hints, set):
             kwargs[name].update(collection_to_kwargs(type_hints, set))
         elif contains_type(type_hints, int):
-            kwargs[name]["type"] = int
-            # Special case for large integers
-            human_readable_ints = {
-                "max_model_len",
-                "max_num_batched_tokens",
-                "kv_cache_memory_bytes",
-            }
-            if name in human_readable_ints:
+            if name == "max_model_len":
+                kwargs[name]["type"] = human_readable_int_or_auto
+                kwargs[name]["help"] += f"\n\n{human_readable_int_or_auto.__doc__}"
+            elif name in ("max_num_batched_tokens", "kv_cache_memory_bytes"):
                 kwargs[name]["type"] = human_readable_int
                 kwargs[name]["help"] += f"\n\n{human_readable_int.__doc__}"
+            else:
+                kwargs[name]["type"] = int
         elif contains_type(type_hints, float):
             kwargs[name]["type"] = float
         elif contains_type(type_hints, dict) and (
@@ -2042,23 +2040,17 @@ def _raise_unsupported_error(feature_name: str):
     raise NotImplementedError(msg)
 
 
-def human_readable_int(value):
+def human_readable_int(value: str) -> int:
     """Parse human-readable integers like '1k', '2M', etc.
     Including decimal values with decimal multipliers.
-    Also accepts -1 or 'auto' as a special value for auto-detection.
 
     Examples:
     - '1k' -> 1,000
     - '1K' -> 1,024
     - '25.6k' -> 25,600
-    - '-1' or 'auto' -> -1 (special value for auto-detection)
     """
     value = value.strip()
 
-    # Handle -1 or 'auto' as a special value for auto-detection
-    if value == "-1" or value.lower() == "auto":
-        return -1
-
     match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
     if match:
         decimal_multiplier = {
@@ -2092,3 +2084,22 @@ def human_readable_int(value):
 
     # Regular plain number.
     return int(value)
+
+
+def human_readable_int_or_auto(value: str) -> int:
+    """Parse human-readable integers like '1k', '2M', etc.
+    Including decimal values with decimal multipliers.
+    Also accepts -1 or 'auto' as a special value for auto-detection.
+
+    Examples:
+    - '1k' -> 1,000
+    - '1K' -> 1,024
+    - '25.6k' -> 25,600
+    - '-1' or 'auto' -> -1 (special value for auto-detection)
+    """
+    value = value.strip()
+
+    if value == "-1" or value.lower() == "auto":
+        return -1
+
+    return human_readable_int(value)
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 85afff38e486a..1480a1f798ea0 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -606,6 +606,43 @@ def get_request_block_hasher(
     return request_block_hasher
 
 
+def _check_enough_kv_cache_memory(
+    available_memory: int,
+    get_needed_memory: Callable[[], int],
+    max_model_len: int,
+    estimate_max_model_len: Callable[[int], int],
+):
+    if available_memory <= 0:
+        raise ValueError(
+            "No available memory for the cache blocks. "
+            "Try increasing `gpu_memory_utilization` when initializing the engine. "
+            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            "for more details."
+        )
+
+    needed_memory = get_needed_memory()
+
+    if needed_memory > available_memory:
+        estimated_max_len = estimate_max_model_len(available_memory)
+        estimated_msg = ""
+        if estimated_max_len > 0:
+            estimated_msg = (
+                "Based on the available memory, "
+                f"the estimated maximum model length is {estimated_max_len}. "
+            )
+
+        raise ValueError(
+            f"To serve at least one request with the model's max seq len "
+            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
+            f"cache is needed, which is larger than the available KV cache "
+            f"memory ({available_memory / GiB_bytes:.2f} GiB). {estimated_msg}"
+            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
+            f"when initializing the engine. "
+            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
+            f"for more details."
+        )
+
+
 def max_memory_usage_bytes(
     vllm_config: VllmConfig, kv_cache_specs: Iterable[KVCacheSpec]
 ) -> int:
@@ -688,43 +725,12 @@ def check_enough_kv_cache_memory(
     """
 
     # No need to check for available memory if the kv_cache_spec is empty
-    if not kv_cache_spec:
-        return
-
-    if available_memory <= 0:
-        raise ValueError(
-            "No available memory for the cache blocks. "
-            "Try increasing `gpu_memory_utilization` when "
-            "initializing the engine. "
-            "See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            "for more details."
-        )
-
-    max_model_len = vllm_config.model_config.max_model_len
-    needed_memory = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
-
-    if needed_memory > available_memory:
-        # Estimate the maximum model length that can fit in the available memory
-        estimated_max_len = estimate_max_model_len(
-            vllm_config, kv_cache_spec, available_memory
-        )
-        estimated_msg = ""
-        if estimated_max_len > 0:
-            estimated_msg = (
-                "Based on the available memory, "
-                f"the estimated maximum model length is {estimated_max_len}."
-            )
-
-        raise ValueError(
-            f"To serve at least one request with the models's max seq len "
-            f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-            f"cache is needed, which is larger than the available KV cache "
-            f"memory ({available_memory / GiB_bytes:.2f} GiB). "
-            f"{estimated_msg} "
-            f"Try increasing `gpu_memory_utilization` or decreasing `max_model_len` "
-            f"when initializing the engine. "
-            f"See https://docs.vllm.ai/en/latest/configuration/conserving_memory/ "
-            f"for more details."
+    if kv_cache_spec:
+        _check_enough_kv_cache_memory(
+            available_memory,
+            lambda: max_memory_usage_bytes(vllm_config, kv_cache_spec.values()),
+            vllm_config.model_config.max_model_len,
+            lambda am: estimate_max_model_len(vllm_config, kv_cache_spec, am),
         )
 
 
@@ -1505,36 +1511,16 @@ def get_kv_cache_configs(
     # Check if the available memory is enough (using min across all workers).
     # We use the global groups to correctly account for padding.
     if global_kv_cache_groups:
-        min_available_memory = min(available_memory)
-        if min_available_memory <= 0:
-            raise ValueError(
-                "No available memory for the cache blocks. "
-                "Try increasing `gpu_memory_utilization` when "
-                "initializing the engine."
-            )
-        max_model_len = vllm_config.model_config.max_model_len
-        needed_memory = _max_memory_usage_bytes_from_groups(
-            vllm_config, global_kv_cache_groups
+        _check_enough_kv_cache_memory(
+            min(available_memory),
+            lambda: _max_memory_usage_bytes_from_groups(
+                vllm_config, global_kv_cache_groups
+            ),
+            vllm_config.model_config.max_model_len,
+            lambda am: _estimate_max_model_len_from_groups(
+                vllm_config, global_kv_cache_groups, am
+            ),
         )
-        if needed_memory > min_available_memory:
-            estimated_max_len = _estimate_max_model_len_from_groups(
-                vllm_config, global_kv_cache_groups, min_available_memory
-            )
-            estimated_msg = ""
-            if estimated_max_len > 0:
-                estimated_msg = (
-                    f"Based on the available memory, the estimated maximum "
-                    f"model length is {estimated_max_len}. "
-                )
-            raise ValueError(
-                f"To serve at least one request with the models's max seq len "
-                f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
-                f"cache is needed, which is larger than the available KV cache "
-                f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
-                f"{estimated_msg}"
-                f"Try increasing `gpu_memory_utilization` or decreasing "
-                f"`max_model_len` when initializing the engine."
-            )
 
     kv_cache_configs: list[KVCacheConfig] = []
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
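
For reference, below is a minimal sketch of the CLI behaviour the arg_utils.py change above is aiming for. The two parsers are simplified stand-ins for `human_readable_int` and `human_readable_int_or_auto` (the real helpers carry extra validation and error reporting), and the argparse wiring is illustrative only, not vLLM's actual `EngineArgs` plumbing:

import argparse
import re


def human_readable_int(value: str) -> int:
    # Simplified stand-in: lowercase suffixes are decimal, uppercase are binary.
    value = value.strip()
    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal = {"k": 10**3, "m": 10**6, "g": 10**9, "t": 10**12}
        binary = {"K": 2**10, "M": 2**20, "G": 2**30, "T": 2**40}
        number, suffix = match.groups()
        multiplier = decimal.get(suffix) or binary[suffix]
        return int(float(number) * multiplier)
    return int(value)  # plain integer


def human_readable_int_or_auto(value: str) -> int:
    # After the patch, only --max-model-len goes through this wrapper.
    if value.strip() == "-1" or value.strip().lower() == "auto":
        return -1
    return human_readable_int(value)


parser = argparse.ArgumentParser()
parser.add_argument("--max-model-len", type=human_readable_int_or_auto)
parser.add_argument("--max-num-batched-tokens", type=human_readable_int)

args = parser.parse_args(["--max-model-len", "auto", "--max-num-batched-tokens", "8k"])
assert args.max_model_len == -1             # "auto" (or "-1") maps to -1: derive the length from the model config
assert args.max_num_batched_tokens == 8000  # '8k' -> 8,000; '1K' would give 1,024

The point of the split is that 'auto'/-1 keeps working for `--max-model-len` (where `Field(ge=-1)` now admits it), while `--max-num-batched-tokens` and `--kv-cache-memory-bytes` reject it with a normal parse error instead of silently mapping to -1.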
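
And a toy walkthrough of the control flow the new shared `_check_enough_kv_cache_memory` helper gives both call sites (the helper name `check_kv_cache_fits`, the GiB figures, and the estimator below are invented for illustration): each caller plugs in its own accounting as callables, and the max-length estimate is only computed once the check has already failed:

GiB = 1 << 30


def check_kv_cache_fits(available_memory, get_needed_memory, max_model_len, estimate_max_model_len):
    # Mirrors _check_enough_kv_cache_memory above, with shortened messages.
    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks.")
    needed_memory = get_needed_memory()
    if needed_memory > available_memory:
        estimated_max_len = estimate_max_model_len(available_memory)
        hint = f" Estimated maximum model length: {estimated_max_len}." if estimated_max_len > 0 else ""
        raise ValueError(
            f"max_model_len={max_model_len} needs {needed_memory / GiB:.2f} GiB of KV cache "
            f"but only {available_memory / GiB:.2f} GiB is available.{hint}"
        )


try:
    check_kv_cache_fits(
        available_memory=4 * GiB,
        # check_enough_kv_cache_memory passes per-spec accounting here,
        # get_kv_cache_configs passes the per-group variant.
        get_needed_memory=lambda: 6 * GiB,
        max_model_len=8192,
        # Stand-in for the estimate_max_model_len / _estimate_max_model_len_from_groups lambdas.
        estimate_max_model_len=lambda mem: 8192 * mem // (6 * GiB),
    )
except ValueError as exc:
    print(exc)  # ... needs 6.00 GiB ... only 4.00 GiB is available. Estimated maximum model length: 5461.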