Add --max-model-len auto to auto-fit context to available memory (#29431)

Signed-off-by: mgoin <mgoin64@gmail.com>
Michael Goin 2025-12-24 00:37:14 -05:00 committed by GitHub
parent d7e05ac743
commit 8ee90c83f8
7 changed files with 313 additions and 31 deletions


@@ -511,6 +511,16 @@ def test_human_readable_model_len():
    args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
    assert args.max_model_len == 10212345123456

    # Special value -1 for auto-fit to GPU memory
    args = parser.parse_args(["--max-model-len", "-1"])
    assert args.max_model_len == -1

    # 'auto' is an alias for -1
    args = parser.parse_args(["--max-model-len", "auto"])
    assert args.max_model_len == -1
    args = parser.parse_args(["--max-model-len", "AUTO"])
    assert args.max_model_len == -1

    # Invalid (do not allow decimals with binary multipliers)
    for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
        with pytest.raises(ArgumentError):
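
In practice the new value is passed on the command line. A minimal usage sketch (the model name is only a placeholder; both spellings go through the alias handling added in this commit):

    vllm serve <your-model> --max-model-len auto
    vllm serve <your-model> --max-model-len -1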


@@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
        )
    )
    assert block_hashes[1] == expected_hash2


def test_auto_fit_max_model_len():
    """Test that max_model_len=-1 auto-fits to available GPU memory."""
    # Create config with original_max_model_len=-1 to trigger auto-fit
    model_config = ModelConfig(max_model_len=1024)
    # Simulate the user passing -1 by setting original_max_model_len
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)

    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # With enough memory, max_model_len stays at the derived max
    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [large_available_memory]
    )
    assert vllm_config.model_config.max_model_len == 1024

    # Reset for next test
    model_config = ModelConfig(max_model_len=1024)
    model_config.original_max_model_len = -1
    vllm_config = VllmConfig(model_config=model_config)

    # With limited memory, max_model_len should be reduced
    # Need memory for at least max_model_len tokens
    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
    limited_memory = mem_per_block_per_layer * 2 * 32
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [limited_memory]
    )
    # Should be reduced to fit in memory
    assert vllm_config.model_config.max_model_len < 1024
    assert vllm_config.model_config.max_model_len > 0


def test_auto_fit_max_model_len_not_triggered():
    """Test that auto-fit is not triggered when original_max_model_len is not -1."""
    model_config = ModelConfig(max_model_len=16)
    # original_max_model_len should be None by default, not -1
    vllm_config = VllmConfig(model_config=model_config)

    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
    kv_cache_specs = {
        "layer_1": new_kv_cache_spec(),
        "layer_2": new_kv_cache_spec(),
    }

    # This should work normally without auto-fit
    _kv_cache_configs = get_kv_cache_configs(
        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
    )
    assert vllm_config.model_config.max_model_len == 16
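
For readers checking the numbers in the limited-memory case, a small worked sketch of the arithmetic the test relies on (all values are copied from the test; the 16-token block size comes from the test's own comment):

    # Worked numbers for the limited-memory case (illustrative only).
    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2      # 16_384 bytes = 16 KiB
    limited_memory = mem_per_block_per_layer * 2 * 32  # 2 layers * 32 blocks = 1 MiB
    tokens_that_fit = 32 * 16                          # 32 blocks * 16 tokens/block
    assert limited_memory == 1024 * 1024
    assert tokens_that_fit == 512  # so auto-fit must land at or below 512, well under 1024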


@@ -172,7 +172,10 @@ class ModelConfig:
    format. Examples:\n
    - 1k -> 1000\n
    - 1K -> 1024\n
    - 25.6k -> 25,600"""
    - 25.6k -> 25,600\n
    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
      GPU memory. This will use the model's maximum context length if it fits,
      otherwise it will find the largest length that can be accommodated."""
    spec_target_max_model_len: int | None = None
    """Specify the maximum length for spec decoding draft models."""
    quantization: QuantizationMethods | str | None = None
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
    if encoder_config and "max_seq_length" in encoder_config:
        derived_max_model_len = encoder_config["max_seq_length"]

    # If the user didn't specify `max_model_len`, then use that derived from
    # the model config as a default value.
    if max_model_len is None:
    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
    # then use that derived from the model config as a default value.
    # When -1 is specified, the engine will later auto-fit to available memory.
    if max_model_len is None or max_model_len == -1:
        # For LongRoPE, default to original_max_position_embeddings to avoid
        # performance degradation for shorter sequences
        if rope_parameters is not None and any(


@@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
def human_readable_int(value):
    """Parse human-readable integers like '1k', '2M', etc.
    Including decimal values with decimal multipliers.

    Also accepts -1 or 'auto' as a special value for auto-detection.

    Examples:
    - '1k' -> 1,000
    - '1K' -> 1,024
    - '25.6k' -> 25,600
    - '-1' or 'auto' -> -1 (special value for auto-detection)
    """
    value = value.strip()

    # Handle -1 or 'auto' as a special value for auto-detection
    if value == "-1" or value.lower() == "auto":
        return -1

    match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
    if match:
        decimal_multiplier = {
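
A quick sketch of how the updated helper behaves (hedged: the import path is an assumption about where the helper lives; the expected values follow its docstring):

    # Illustrative expectations for the updated helper (not an exhaustive test).
    from vllm.engine.arg_utils import human_readable_int  # assumed module path

    assert human_readable_int("25.6k") == 25_600  # decimal multiplier
    assert human_readable_int("1K") == 1_024      # binary multiplier
    assert human_readable_int("auto") == -1       # new case-insensitive alias
    assert human_readable_int("-1") == -1         # sentinel passed through unchanged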


@@ -624,6 +624,9 @@ def estimate_max_model_len(
    Estimates the maximum model length that can fit in the available memory
    using binary search.

    This function temporarily modifies max_model_len during estimation but
    restores the original value before returning, ensuring no side effects.

    Args:
        vllm_config: The global VllmConfig
        kv_cache_spec: The kv cache spec of each attention layer in the model
@@ -632,33 +635,38 @@ def estimate_max_model_len(
    Returns:
        The estimated maximum model length that can fit in the available memory.
    """
    # Save the original max_model_len to restore after estimation
    original_max_model_len = vllm_config.model_config.max_model_len

    # Define a function to check if a given model length fits in memory
    def fits_in_memory(model_len: int) -> bool:
        # Modify the max_model_len for this calculation
        # Temporarily modify the max_model_len for this calculation
        vllm_config.model_config.max_model_len = model_len
        # Calculate memory needed for the given model length
        memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
        return memory_needed <= available_memory

    # Binary search for the maximum model length
    current_max = vllm_config.model_config.max_model_len
    left, right = 1, current_max
    try:
        # Binary search for the maximum model length
        left, right = 1, original_max_model_len

        # If even the smallest model length doesn't fit, return 0
        if not fits_in_memory(left):
            return 0

    # If even the smallest model length doesn't fit, return 0
    if not fits_in_memory(left):
        return 0

    # Binary search for the maximum model length that fits
    result = 1
    while left <= right:
        mid = (left + right) // 2
        if fits_in_memory(mid):
            result = mid
            left = mid + 1
        else:
            right = mid - 1
    return result

        # Binary search for the maximum model length that fits
        result = 1
        while left <= right:
            mid = (left + right) // 2
            if fits_in_memory(mid):
                result = mid
                left = mid + 1
            else:
                right = mid - 1
        return result
    finally:
        # Always restore the original max_model_len to avoid side effects
        vllm_config.model_config.max_model_len = original_max_model_len


def check_enough_kv_cache_memory(
@@ -1301,6 +1309,140 @@ def _report_kv_cache_config(
    )


def _max_memory_usage_bytes_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
) -> int:
    """
    Calculate maximum memory usage in bytes from KV cache groups.

    This correctly accounts for padding in hybrid models. For example, if a
    model has 8 full attention layers and 9 sliding window layers, they will
    be padded to 9 full + 9 sliding window for uniform group sizes.
    """
    if not kv_cache_groups:
        return 0

    # UniformTypeKVCacheSpecs special case (single group, per-layer specs)
    if len(kv_cache_groups) == 1 and isinstance(
        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
    ):
        per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
        return sum(
            spec.max_memory_usage_bytes(vllm_config)
            for spec in per_layer_specs.values()
        )

    # General case: group_size pools, each shared by one layer per group
    # Memory = group_size * page_size * blocks_for_max_len
    group_size = max(len(group.layer_names) for group in kv_cache_groups)
    page_size = get_uniform_page_size(
        [group.kv_cache_spec for group in kv_cache_groups]
    )
    any_spec = kv_cache_groups[0].kv_cache_spec
    blocks_needed = cdiv(any_spec.max_memory_usage_bytes(vllm_config), page_size)
    return group_size * page_size * blocks_needed


def _estimate_max_model_len_from_groups(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: int,
) -> int:
    """
    Binary search for the maximum model length that fits in available memory.
    Returns 0 if even 1 token doesn't fit.
    """
    original_max = vllm_config.model_config.max_model_len

    def fits(model_len: int) -> bool:
        vllm_config.model_config.max_model_len = model_len
        return (
            _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups)
            <= available_memory
        )

    try:
        left, right = 1, original_max
        if not fits(left):
            return 0
        result = 1
        while left <= right:
            mid = (left + right) // 2
            if fits(mid):
                result = mid
                left = mid + 1
            else:
                right = mid - 1
        return result
    finally:
        vllm_config.model_config.max_model_len = original_max


def _auto_fit_max_model_len(
    vllm_config: VllmConfig,
    kv_cache_groups: list[KVCacheGroupSpec],
    available_memory: list[int],
) -> None:
    """
    When max_model_len is set to -1, this function estimates the largest
    context length that can be supported with the available GPU memory.
    It uses binary search to find the maximum length that fits across all
    workers.

    Args:
        vllm_config: The global VllmConfig (will be modified in-place)
        kv_cache_groups: The global KV cache groups (from get_kv_cache_groups).
            This correctly accounts for padding in hybrid models.
        available_memory: Memory available for KV cache in bytes for each
            worker.
    """
    original_max = vllm_config.model_config.max_model_len

    if not kv_cache_groups:
        # All workers have empty specs (attention-free model)
        logger.info_once(
            "Auto-fit max_model_len: attention-free model, "
            "using derived max_model_len=%d",
            original_max,
            scope="local",
        )
        return

    # Use minimum available memory across all workers
    min_available_memory = min(available_memory)
    auto_fit_max = _estimate_max_model_len_from_groups(
        vllm_config, kv_cache_groups, min_available_memory
    )

    if auto_fit_max <= 0:
        raise ValueError(
            "Cannot auto-fit max_model_len: not enough GPU memory available "
            "to serve even a single token. Try increasing `gpu_memory_utilization`."
        )

    if auto_fit_max >= original_max:
        # The model's full context length fits in memory
        logger.info_once(
            "Auto-fit max_model_len: full model context length %d fits in "
            "available GPU memory",
            original_max,
            scope="local",
        )
    else:
        # Need to reduce max_model_len to fit in memory
        vllm_config.model_config.max_model_len = auto_fit_max
        logger.info_once(
            "Auto-fit max_model_len: reduced from %d to %d to fit in "
            "available GPU memory (%.2f GiB available for KV cache)",
            original_max,
            auto_fit_max,
            min_available_memory / GiB_bytes,
            scope="local",
        )


def get_kv_cache_configs(
    vllm_config: VllmConfig,
    kv_cache_specs: list[dict[str, KVCacheSpec]],
@@ -1317,10 +1459,12 @@ def get_kv_cache_configs(
    1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
       the whole model.
    2. Generate the KV cache groups based on the layer ratio of the whole model.
    3. Generate the KV cache configs for each worker based on the KV cache
       This also handles spec unification for hybrid models.
    3. Handle auto-fit max_model_len and memory checks using the unified specs.
    4. Generate the KV cache configs for each worker based on the KV cache
       grouping strategy. (This is reasonable because the layer ratio of
       different PP stages are similar.)
    4. Change the num_blocks of each worker to the smallest among all workers
    5. Change the num_blocks of each worker to the smallest among all workers
       and shrink tensor sizes proportionally to avoid allocating unused memory.

    Args:
@@ -1333,14 +1477,6 @@ def get_kv_cache_configs(
        The generated KVCacheConfigs for each worker.
    """
    # Check if the available memory is enough for each worker.
    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
        kv_cache_specs, available_memory
    ):
        check_enough_kv_cache_memory(
            vllm_config, kv_cache_spec_one_worker, available_memory_one_worker
        )

    # Merge the KV cache specs of all workers. Different PP stages may have
    # different layer names, and different TP ranks of the same PP stage should
    # have the same KV cache spec.
@@ -1354,8 +1490,52 @@ def get_kv_cache_configs(
                    "The KV cache specs for the same layer are different "
                    "across workers. This is not supported yet."
                )

    # Get global KV cache groups. This also handles spec unification for
    # hybrid models when disable_hybrid_kv_cache_manager is enabled.
    # After this call, merged_kv_cache_specs may be modified in-place.
    global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)

    # If original_max_model_len was -1, automatically
    # determine the maximum model length that fits in available GPU memory.
    # We use the global groups here to correctly account for padding.
    if vllm_config.model_config.original_max_model_len == -1:
        _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory)

    # Check if the available memory is enough (using min across all workers).
    # We use the global groups to correctly account for padding.
    if global_kv_cache_groups:
        min_available_memory = min(available_memory)
        if min_available_memory <= 0:
            raise ValueError(
                "No available memory for the cache blocks. "
                "Try increasing `gpu_memory_utilization` when "
                "initializing the engine."
            )

        max_model_len = vllm_config.model_config.max_model_len
        needed_memory = _max_memory_usage_bytes_from_groups(
            vllm_config, global_kv_cache_groups
        )
        if needed_memory > min_available_memory:
            estimated_max_len = _estimate_max_model_len_from_groups(
                vllm_config, global_kv_cache_groups, min_available_memory
            )
            estimated_msg = ""
            if estimated_max_len > 0:
                estimated_msg = (
                    f"Based on the available memory, the estimated maximum "
                    f"model length is {estimated_max_len}. "
                )
            raise ValueError(
f"To serve at least one request with the models's max seq len "
f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
f"cache is needed, which is larger than the available KV cache "
f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
f"{estimated_msg}"
f"Try increasing `gpu_memory_utilization` or decreasing "
f"`max_model_len` when initializing the engine."
)
kv_cache_configs: list[KVCacheConfig] = []
for kv_cache_spec_one_worker, available_memory_one_worker in zip(
kv_cache_specs, available_memory
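
To make the padding note in _max_memory_usage_bytes_from_groups above concrete, here is a hedged back-of-the-envelope sketch using the 8-full + 9-sliding-window example from its docstring (the page size and block count are invented purely for illustration):

    # Hypothetical hybrid model: 8 full-attention and 9 sliding-window layers.
    # The larger group determines group_size, so the full-attention side is
    # effectively padded from 8 to 9, mirroring max(len(group.layer_names) ...).
    group_size = max(8, 9)                  # -> 9
    page_size = 2 * 1024 * 1024             # assumed uniform page size: 2 MiB per block
    blocks_needed = 4096                    # assumed blocks to cover max_model_len for one layer
    max_memory = group_size * page_size * blocks_needed
    print(f"{max_memory / 2**30:.1f} GiB")  # 72.0 GiB with these made-up numbers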


@@ -247,9 +247,20 @@ class EngineCore:
        assert len(kv_cache_specs) == len(available_gpu_memory)

        # Track max_model_len before KV cache config to detect auto-fit changes
        max_model_len_before = vllm_config.model_config.max_model_len

        kv_cache_configs = get_kv_cache_configs(
            vllm_config, kv_cache_specs, available_gpu_memory
        )

        # If auto-fit reduced max_model_len, sync the new value to workers.
        # This is needed because workers were spawned before memory profiling
        # and have the original (larger) max_model_len cached.
        max_model_len_after = vllm_config.model_config.max_model_len
        if max_model_len_after != max_model_len_before:
            self.collective_rpc("update_max_model_len", args=(max_model_len_after,))

        scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
        num_gpu_blocks = scheduler_kv_cache_config.num_blocks
        num_cpu_blocks = 0


@@ -387,6 +387,19 @@ class Worker(WorkerBase):
    def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
        return self.model_runner.get_kv_cache_spec()

    def update_max_model_len(self, max_model_len: int) -> None:
        """Update max_model_len after auto-fit to GPU memory.

        This is called when max_model_len=-1 is used and the engine
        automatically determines the maximum context length that fits
        in GPU memory. Workers need to update their cached max_model_len
        to match the engine's decision.
        """
        self.model_config.max_model_len = max_model_len
        if self.model_runner is not None:
            self.model_runner.max_model_len = max_model_len
        logger.debug("Updated max_model_len to %d", max_model_len)

    def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
        """Allocate GPU KV cache with the specified kv_cache_config."""