Add --max-model-len auto to auto-fit context to available memory (#29431)

Signed-off-by: mgoin <mgoin64@gmail.com>
Michael Goin 2025-12-24 00:37:14 -05:00 committed by GitHub
parent d7e05ac743
commit 8ee90c83f8
7 changed files with 313 additions and 31 deletions


@@ -511,6 +511,16 @@ def test_human_readable_model_len():
    args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
    assert args.max_model_len == 10212345123456

    # Special value -1 for auto-fit to GPU memory
    args = parser.parse_args(["--max-model-len", "-1"])
    assert args.max_model_len == -1

    # 'auto' is an alias for -1
    args = parser.parse_args(["--max-model-len", "auto"])
    assert args.max_model_len == -1
    args = parser.parse_args(["--max-model-len", "AUTO"])
    assert args.max_model_len == -1

    # Invalid (do not allow decimals with binary multipliers)
    for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
        with pytest.raises(ArgumentError):
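
In practice the new value is passed on the command line. A minimal usage sketch (the model name is only a placeholder; both spellings go through the alias handling added in this commit):

    vllm serve <your-model> --max-model-len auto
    vllm serve <your-model> --max-model-len -1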


@@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
        )
    )
    assert block_hashes[1] == expected_hash2


def test_auto_fit_max_model_len():
    """Test that max_model_len=-1 auto-fits to available GPU memory."""
    # Create config with original_max_model_len=-1 to trigger auto-fit
    model_config = ModelConfig(max_model_len=1024)
    # Simulate the user passing -1 by setting original_max_model_len
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)

    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # With enough memory, max_model_len stays at the derived max
    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [large_available_memory]
    )
    assert vllm_config.model_config.max_model_len == 1024

    # Reset for next test
    model_config = ModelConfig(max_model_len=1024)
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)

    # With limited memory, max_model_len should be reduced
    # Need memory for at least max_model_len tokens
    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
    limited_memory = mem_per_block_per_layer * 2 * 32
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [limited_memory]
    )
    # Should be reduced to fit in memory
    assert vllm_config.model_config.max_model_len < 1024
    assert vllm_config.model_config.max_model_len > 0


def test_auto_fit_max_model_len_not_triggered():
    """Test that auto-fit is not triggered when original_max_model_len is not -1."""
    model_config = ModelConfig(max_model_len=16)
    # original_max_model_len should be None by default, not -1
    vllm_config = VllmConfig(model_config=model_config)

    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # This should work normally without auto-fit
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
    )
    assert vllm_config.model_config.max_model_len == 16
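
For readers checking the numbers in the limited-memory case, a small worked sketch of the arithmetic the test relies on (all values are copied from the test; the 16-token block size comes from the test's own comment):

    # Worked numbers for the limited-memory case (illustrative only).
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2      # 16_384 bytes = 16 KiB
    limited_memory = mem_per_block_per_layer * 2 * 32  # 2 layers * 32 blocks = 1 MiB
    tokens_that_fit = 32 * 16                          # 32 blocks * 16 tokens/block
    assert limited_memory == 1024 * 1024
    assert tokens_that_fit == 512  # so auto-fit must land at or below 512, well under 1024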


@@ -172,7 +172,10 @@ class ModelConfig:
    format. Examples:\n
    - 1k -> 1000\n
    - 1K -> 1024\n
    - 25.6k -> 25,600"""
    - 25.6k -> 25,600\n
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
      GPU memory. This will use the model's maximum context length if it fits,
      otherwise it will find the largest length that can be accommodated."""
    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
    quantization: QuantizationMethods | str | None = None
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
    if encoder_config and "max_seq_length" in encoder_config:
        derived_max_model_len = encoder_config["max_seq_length"]

    # If the user didn't specify `max_model_len`, then use that derived from
    # the model config as a default value.
    if max_model_len is None:
    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
    # then use that derived from the model config as a default value.
    # When -1 is specified, the engine will later auto-fit to available memory.
    if max_model_len is None or max_model_len == -1:
        # For LongRoPE, default to original_max_position_embeddings to avoid
        # performance degradation for shorter sequences
        if rope_parameters is not None and any(


@@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
def human_readable_int(value):
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Also accepts -1 or 'auto' as a special value for auto-detection.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    """
    value = value.strip()

    # Handle -1 or 'auto' as a special value for auto-detection
    if value == "-1" or value.lower() == "auto":
        return -1

    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal_multiplier = {
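
A quick sketch of how the updated helper behaves (hedged: the import path is an assumption about where the helper lives; the expected values follow its docstring):

    # Illustrative expectations for the updated helper (not an exhaustive test).
    from vllm.engine.arg_utils import human_readable_int  # assumed module path

    assert human_readable_int("25.6k") == 25_600  # decimal multiplier
    assert human_readable_int("1K") == 1_024      # binary multiplier
    assert human_readable_int("auto") == -1       # new case-insensitive alias
    assert human_readable_int("-1") == -1         # sentinel passed through unchanged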


@@ -624,6 +624,9 @@ def estimate_max_model_len(
    Estimates the maximum model length that can fit in the available memory
    using binary search.

    This function temporarily modifies max_model_len during estimation but
    restores the original value before returning, ensuring no side effects.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model
@@ -632,33 +635,38 @@ def estimate_max_model_len(
    Returns:
        The estimated maximum model length that can fit in the available memory.
    """
    # Save the original max_model_len to restore after estimation
    original_max_model_len = vllm_config.model_config.max_model_len

    # Define a function to check if a given model length fits in memory
    def fits_in_memory(model_len: int) -> bool:
        # Modify the max_model_len for this calculation
        # Temporarily modify the max_model_len for this calculation
        vllm_config.model_config.max_model_len = model_len
        # Calculate memory needed for the given model length
        memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
        return memory_needed <= available_memory

    # Binary search for the maximum model length
    current_max = vllm_config.model_config.max_model_len
    left, right = 1, current_max
    try:
        # Binary search for the maximum model length
        left, right = 1, original_max_model_len

        # If even the smallest model length doesn't fit, return 0
        if not fits_in_memory(left):
            return 0

    # If even the smallest model length doesn't fit, return 0
    if not fits_in_memory(left):
        return 0

    # Binary search for the maximum model length that fits
    result = 1
    while left <= right:
        mid = (left + right) // 2
        if fits_in_memory(mid):
            result = mid
            left = mid + 1
        else:
            right = mid - 1
    return result

        # Binary search for the maximum model length that fits
        result = 1
        while left <= right:
            mid = (left + right) // 2
            if fits_in_memory(mid):
                result = mid
                left = mid + 1
            else:
                right = mid - 1
        return result
    finally:
        # Always restore the original max_model_len to avoid side effects
        vllm_config.model_config.max_model_len = original_max_model_len


def check_enough_kv_cache_memory(
@@ -1301,6 +1309,140 @@ def _report_kv_cache_config(
    )


def _max_memory_usage_bytes_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
) -> int:
    """
    Calculate maximum memory usage in bytes from KV cache groups.

    This correctly accounts for padding in hybrid models. For example, if a
    model has 8 full attention layers and 9 sliding window layers, they will
    be padded to 9 full + 9 sliding window for uniform group sizes.
    """
    if not kv_cache_groups:
        return 0

    # UniformTypeKVCacheSpecs special case (single group, per-layer specs)
    if len(kv_cache_groups) == 1 and isinstance(
        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
    ):
        per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
        return sum(
            spec.max_memory_usage_bytes(vllm_config)
            for spec in per_layer_specs.values()
        )

    # General case: group_size pools, each shared by one layer per group
    # Memory = group_size * page_size * blocks_for_max_len
    group_size = max(len(group.layer_names) for group in kv_cache_groups)
    page_size = get_uniform_page_size(
        [group.kv_cache_spec for group in kv_cache_groups]
    )
    any_spec = kv_cache_groups[0].kv_cache_spec
    blocks_needed = cdiv(any_spec.max_memory_usage_bytes(vllm_config), page_size)
    return group_size * page_size * blocks_needed


def _estimate_max_model_len_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: int,
) -> int:
    """
    Binary search for the maximum model length that fits in available memory.
    Returns 0 if even 1 token doesn't fit.
    """
    original_max = vllm_config.model_config.max_model_len

    def fits(model_len: int) -> bool:
        vllm_config.model_config.max_model_len = model_len
        return (
            _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups)
            <= available_memory
        )

    try:
        left, right = 1, original_max
        if not fits(left):
            return 0
        result = 1
        while left <= right:
            mid = (left + right) // 2
            if fits(mid):
                result = mid
                left = mid + 1
            else:
                right = mid - 1
        return result
    finally:
        vllm_config.model_config.max_model_len = original_max


def _auto_fit_max_model_len(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: list[int],
) -> None:
    """
    When max_model_len is set to -1, this function estimates the largest
    context length that can be supported with the available GPU memory.
    It uses binary search to find the maximum length that fits across all
    workers.

    Args:
        vllm_config: The global VllmConfig (will be modified in-place)
        kv_cache_groups: The global KV cache groups (from get_kv_cache_groups).
            This correctly accounts for padding in hybrid models.
        available_memory: Memory available for KV cache in bytes for each
            worker.
    """
    original_max = vllm_config.model_config.max_model_len

    if not kv_cache_groups:
        # All workers have empty specs (attention-free model)
        logger.info_once(
            "Auto-fit max_model_len: attention-free model, "
            "using derived max_model_len=%d",
            original_max,
            scope="local",
        )
        return

    # Use minimum available memory across all workers
    min_available_memory = min(available_memory)
    auto_fit_max = _estimate_max_model_len_from_groups(
        vllm_config, kv_cache_groups, min_available_memory
    )

    if auto_fit_max <= 0:
        raise ValueError(
            "Cannot auto-fit max_model_len: not enough GPU memory available "
            "to serve even a single token. Try increasing `gpu_memory_utilization`."
        )

    if auto_fit_max >= original_max:
        # The model's full context length fits in memory
        logger.info_once(
            "Auto-fit max_model_len: full model context length %d fits in "
            "available GPU memory",
            original_max,
            scope="local",
        )
    else:
        # Need to reduce max_model_len to fit in memory
        vllm_config.model_config.max_model_len = auto_fit_max
        logger.info_once(
            "Auto-fit max_model_len: reduced from %d to %d to fit in "
            "available GPU memory (%.2f GiB available for KV cache)",
            original_max,
            auto_fit_max,
            min_available_memory / GiB_bytes,
            scope="local",
        )


def get_kv_cache_configs(
    vllm_config: VllmConfig,
    kv_cache_specs: list[dict[str, KVCacheSpec]],
@@ -1317,10 +1459,12 @@ def get_kv_cache_configs(
    1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
       the whole model.
    2. Generate the KV cache groups based on the layer ratio of the whole model.
    3. Generate the KV cache configs for each worker based on the KV cache
       This also handles spec unification for hybrid models.
    3. Handle auto-fit max_model_len and memory checks using the unified specs.
    4. Generate the KV cache configs for each worker based on the KV cache
       grouping strategy. (This is reasonable because the layer ratio of
       different PP stages are similar.)
    4. Change the num_blocks of each worker to the smallest among all workers
    5. Change the num_blocks of each worker to the smallest among all workers
       and shrink tensor sizes proportionally to avoid allocating unused memory.

    Args:
@@ -1333,14 +1477,6 @@ def get_kv_cache_configs(
        The generated KVCacheConfigs for each worker.
    """
    # Check if the available memory is enough for each worker.
    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
        kv_cache_specs, available_memory
    ):
        check_enough_kv_cache_memory(
            vllm_config, kv_cache_spec_one_worker, available_memory_one_worker
        )

    # Merge the KV cache specs of all workers. Different PP stages may have
    # different layer names, and different TP ranks of the same PP stage should
    # have the same KV cache spec.
@@ -1354,8 +1490,52 @@ def get_kv_cache_configs(
                    "The KV cache specs for the same layer are different "
                    "across workers. This is not supported yet."
                )

    # Get global KV cache groups. This also handles spec unification for
    # hybrid models when disable_hybrid_kv_cache_manager is enabled.
    # After this call, merged_kv_cache_specs may be modified in-place.
    global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)

    # If original_max_model_len was -1, automatically
    # determine the maximum model length that fits in available GPU memory.
    # We use the global groups here to correctly account for padding.
    if vllm_config.model_config.original_max_model_len == -1:
        _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory)

    # Check if the available memory is enough (using min across all workers).
    # We use the global groups to correctly account for padding.
    if global_kv_cache_groups:
        min_available_memory = min(available_memory)
        if min_available_memory <= 0:
            raise ValueError(
                "No available memory for the cache blocks. "
                "Try increasing `gpu_memory_utilization` when "
                "initializing the engine."
            )

        max_model_len = vllm_config.model_config.max_model_len
        needed_memory = _max_memory_usage_bytes_from_groups(
            vllm_config, global_kv_cache_groups
        )
        if needed_memory > min_available_memory:
            estimated_max_len = _estimate_max_model_len_from_groups(
                vllm_config, global_kv_cache_groups, min_available_memory
            )
            estimated_msg = ""
            if estimated_max_len > 0:
                estimated_msg = (
                    f"Based on the available memory, the estimated maximum "
                    f"model length is {estimated_max_len}. "
                )
            raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
f"{estimated_msg}"
f"Try increasing `gpu_memory_utilization` or decreasing "
f"`max_model_len` when initializing the engine."
)
kv_cache_configs: list[KVCacheConfig] = []
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
kv_cache_specs, available_memory
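
To make the padding note in _max_memory_usage_bytes_from_groups above concrete, here is a hedged back-of-the-envelope sketch using the 8-full + 9-sliding-window example from its docstring (the page size and block count are invented purely for illustration):

    # Hypothetical hybrid model: 8 full-attention and 9 sliding-window layers.
    # The larger group determines group_size, so the full-attention side is
    # effectively padded from 8 to 9, mirroring max(len(group.layer_names) ...).
    group_size = max(8, 9)                  # -> 9
    page_size = 2 * 1024 * 1024             # assumed uniform page size: 2 MiB per block
    blocks_needed = 4096                    # assumed blocks to cover max_model_len for one layer
    max_memory = group_size * page_size * blocks_needed
    print(f"{max_memory / 2**30:.1f} GiB")  # 72.0 GiB with these made-up numbers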


@@ -247,9 +247,20 @@ class EngineCore:
        assert len(kv_cache_specs) == len(available_gpu_memory)

        # Track max_model_len before KV cache config to detect auto-fit changes
        max_model_len_before = vllm_config.model_config.max_model_len

        kv_cache_configs = get_kv_cache_configs(
            vllm_config, kv_cache_specs, available_gpu_memory
        )

        # If auto-fit reduced max_model_len, sync the new value to workers.
        # This is needed because workers were spawned before memory profiling
        # and have the original (larger) max_model_len cached.
        max_model_len_after = vllm_config.model_config.max_model_len
        if max_model_len_after != max_model_len_before:
            self.collective_rpc("update_max_model_len", args=(max_model_len_after,))

        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
        num_gpu_blocks = scheduler_kv_cache_config.num_blocks
        num_cpu_blocks = 0


@@ -387,6 +387,19 @@ class Worker(WorkerBase):
    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
        return self.model_runner.get_kv_cache_spec()

    def update_max_model_len(self, max_model_len: int) -> None:
        """Update max_model_len after auto-fit to GPU memory.

        This is called when max_model_len=-1 is used and the engine
        automatically determines the maximum context length that fits
        in GPU memory. Workers need to update their cached max_model_len
        to match the engine's decision.
        """
        self.model_config.max_model_len = max_model_len
        if self.model_runner is not None:
            self.model_runner.max_model_len = max_model_len
        logger.debug("Updated max_model_len to %d", max_model_len)

    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config."""