Add --max-model-len auto to auto-fit context to available memory (#29431)
Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in: parent d7e05ac743, commit 8ee90c83f8
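In practice, the new special value means the context length no longer has to be tuned by hand: passing --max-model-len auto (or --max-model-len -1) asks the engine to keep the model's full context if it fits in the remaining KV-cache memory, and otherwise to pick the largest length that does. A minimal offline sketch of the same idea, assuming the Python entrypoint forwards -1 through EngineArgs the same way the CLI flag does (the model name below is only a placeholder):

from vllm import LLM

# Sketch only: -1 is the auto-fit sentinel added by this commit. Whether the
# string "auto" is also accepted through the Python API is an assumption, so
# the integer form is used here.
llm = LLM(model="facebook/opt-125m", max_model_len=-1)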
@@ -511,6 +511,16 @@ def test_human_readable_model_len():
     args = parser.parse_args(["--max-model-len", "10.2123451234567t"])
     assert args.max_model_len == 10212345123456
 
+    # Special value -1 for auto-fit to GPU memory
+    args = parser.parse_args(["--max-model-len", "-1"])
+    assert args.max_model_len == -1
+
+    # 'auto' is an alias for -1
+    args = parser.parse_args(["--max-model-len", "auto"])
+    assert args.max_model_len == -1
+    args = parser.parse_args(["--max-model-len", "AUTO"])
+    assert args.max_model_len == -1
+
     # Invalid (do not allow decimals with binary multipliers)
     for invalid in ["1a", "pwd", "10.24", "1.23M", "1.22T"]:
         with pytest.raises(ArgumentError):
@@ -1798,3 +1798,60 @@ def test_request_with_prompt_embeds_and_mm_inputs(hash_fn: Callable[[Any], bytes
         )
     )
     assert block_hashes[1] == expected_hash2
+
+
+def test_auto_fit_max_model_len():
+    """Test that max_model_len=-1 auto-fits to available GPU memory."""
+    # Create config with original_max_model_len=-1 to trigger auto-fit
+    model_config = ModelConfig(max_model_len=1024)
+    # Simulate the user passing -1 by setting original_max_model_len
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2  # 16KB per block per layer
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # With enough memory, max_model_len stays at the derived max
+    large_available_memory = mem_per_block_per_layer * 2 * 1024  # plenty of memory
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [large_available_memory]
+    )
+    assert vllm_config.model_config.max_model_len == 1024
+
+    # Reset for next test
+    model_config = ModelConfig(max_model_len=1024)
+    model_config.original_max_model_len = -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    # With limited memory, max_model_len should be reduced
+    # Need memory for at least max_model_len tokens
+    # 32 blocks worth of memory for 2 layers = can fit 32*16=512 tokens
+    limited_memory = mem_per_block_per_layer * 2 * 32
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [limited_memory]
+    )
+    # Should be reduced to fit in memory
+    assert vllm_config.model_config.max_model_len < 1024
+    assert vllm_config.model_config.max_model_len > 0
+
+
+def test_auto_fit_max_model_len_not_triggered():
+    """Test that auto-fit is not triggered when original_max_model_len is not -1."""
+    model_config = ModelConfig(max_model_len=16)
+    # original_max_model_len should be None by default, not -1
+    vllm_config = VllmConfig(model_config=model_config)
+
+    mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
+    kv_cache_specs = {
+        "layer_1": new_kv_cache_spec(),
+        "layer_2": new_kv_cache_spec(),
+    }
+
+    # This should work normally without auto-fit
+    _kv_cache_configs = get_kv_cache_configs(
+        vllm_config, [kv_cache_specs], [mem_per_block_per_layer * 2 * 32]
+    )
+    assert vllm_config.model_config.max_model_len == 16
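For reference, the memory arithmetic behind these tests works out as follows; the per-spec defaults (block_size=16, num_kv_heads=2, head_size=64, float32) are an assumption inferred from the factors in the constant, not stated in the hunk:

# Assumed decomposition of the test constant 16 * 2 * 64 * 4 * 2:
block_size, num_kv_heads, head_size, dtype_bytes = 16, 2, 64, 4
per_block_per_layer = block_size * num_kv_heads * head_size * dtype_bytes * 2  # x2 for K and V
assert per_block_per_layer == 16 * 1024  # 16 KiB, matching the comment in the test

# Limited-memory case: 32 blocks per layer across 2 layers -> 1 MiB of KV cache,
# which holds only 32 * 16 = 512 tokens, so auto-fit must shrink max_model_len
# below the requested 1024.
limited_memory = per_block_per_layer * 2 * 32
assert limited_memory == 1 * 1024 * 1024
assert 32 * block_size == 512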
@@ -172,7 +172,10 @@ class ModelConfig:
     format. Examples:\n
     - 1k -> 1000\n
     - 1K -> 1024\n
-    - 25.6k -> 25,600"""
+    - 25.6k -> 25,600\n
+    - -1 or 'auto' -> Automatically choose the maximum model length that fits in
+    GPU memory. This will use the model's maximum context length if it fits,
+    otherwise it will find the largest length that can be accommodated."""
     spec_target_max_model_len: int | None = None
     """Specify the maximum length for spec decoding draft models."""
     quantization: QuantizationMethods | str | None = None
@@ -2151,9 +2154,10 @@ def _get_and_verify_max_len(
     if encoder_config and "max_seq_length" in encoder_config:
         derived_max_model_len = encoder_config["max_seq_length"]
 
-    # If the user didn't specify `max_model_len`, then use that derived from
-    # the model config as a default value.
-    if max_model_len is None:
+    # If the user didn't specify `max_model_len` or specified -1 (auto-fit),
+    # then use that derived from the model config as a default value.
+    # When -1 is specified, the engine will later auto-fit to available memory.
+    if max_model_len is None or max_model_len == -1:
         # For LongRoPE, default to original_max_position_embeddings to avoid
         # performance degradation for shorter sequences
         if rope_parameters is not None and any(
@@ -2045,13 +2045,20 @@ def _raise_unsupported_error(feature_name: str):
 def human_readable_int(value):
     """Parse human-readable integers like '1k', '2M', etc.
     Including decimal values with decimal multipliers.
+    Also accepts -1 or 'auto' as a special value for auto-detection.
 
     Examples:
     - '1k' -> 1,000
     - '1K' -> 1,024
     - '25.6k' -> 25,600
+    - '-1' or 'auto' -> -1 (special value for auto-detection)
     """
     value = value.strip()
 
+    # Handle -1 or 'auto' as a special value for auto-detection
+    if value == "-1" or value.lower() == "auto":
+        return -1
+
     match = re.fullmatch(r"(\d+(?:\.\d+)?)([kKmMgGtT])", value)
     if match:
         decimal_multiplier = {
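To make the accepted forms concrete, here is a small illustration of the parser type after this change, assuming human_readable_int has been imported from vLLM's argument utilities (the exact module path is not shown in this diff):

# Values mirroring the docstring and the test at the top of this commit.
assert human_readable_int("1k") == 1_000     # decimal multiplier
assert human_readable_int("1K") == 1_024     # binary multiplier
assert human_readable_int("25.6k") == 25_600
assert human_readable_int("-1") == -1        # auto-fit sentinel
assert human_readable_int("auto") == -1      # alias for -1
assert human_readable_int("AUTO") == -1      # matching is case-insensitive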
@@ -624,6 +624,9 @@ def estimate_max_model_len(
     Estimates the maximum model length that can fit in the available memory
     using binary search.
 
+    This function temporarily modifies max_model_len during estimation but
+    restores the original value before returning, ensuring no side effects.
+
     Args:
         vllm_config: The global VllmConfig
         kv_cache_spec: The kv cache spec of each attention layer in the model
@@ -632,33 +635,38 @@ def estimate_max_model_len(
     Returns:
         The estimated maximum model length that can fit in the available memory.
     """
+    # Save the original max_model_len to restore after estimation
+    original_max_model_len = vllm_config.model_config.max_model_len
+
     # Define a function to check if a given model length fits in memory
     def fits_in_memory(model_len: int) -> bool:
-        # Modify the max_model_len for this calculation
+        # Temporarily modify the max_model_len for this calculation
         vllm_config.model_config.max_model_len = model_len
         # Calculate memory needed for the given model length
         memory_needed = max_memory_usage_bytes(vllm_config, kv_cache_spec.values())
         return memory_needed <= available_memory
 
-    # Binary search for the maximum model length
-    current_max = vllm_config.model_config.max_model_len
-    left, right = 1, current_max
+    try:
+        # Binary search for the maximum model length
+        left, right = 1, original_max_model_len
 
-    # If even the smallest model length doesn't fit, return 0
-    if not fits_in_memory(left):
-        return 0
+        # If even the smallest model length doesn't fit, return 0
+        if not fits_in_memory(left):
+            return 0
 
-    # Binary search for the maximum model length that fits
-    result = 1
-    while left <= right:
-        mid = (left + right) // 2
-        if fits_in_memory(mid):
-            result = mid
-            left = mid + 1
-        else:
-            right = mid - 1
-    return result
+        # Binary search for the maximum model length that fits
+        result = 1
+        while left <= right:
+            mid = (left + right) // 2
+            if fits_in_memory(mid):
+                result = mid
+                left = mid + 1
+            else:
+                right = mid - 1
+        return result
+    finally:
+        # Always restore the original max_model_len to avoid side effects
+        vllm_config.model_config.max_model_len = original_max_model_len
 
 
 def check_enough_kv_cache_memory(
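The restructured body is the usual "largest value satisfying a monotone predicate" binary search, now wrapped in try/finally so max_model_len is restored even if the memory estimate raises. A standalone sketch of the same search invariant, detached from vLLM's config types (the real code is in the hunk above):

def largest_fitting(limit: int, fits) -> int:
    """Return the largest n in [1, limit] with fits(n), or 0 if nothing fits.

    Assumes fits is monotone: once it returns False it stays False for larger n.
    """
    if limit < 1 or not fits(1):
        return 0
    lo, hi, best = 1, limit, 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if fits(mid):
            best, lo = mid, mid + 1
        else:
            hi = mid - 1
    return best

# e.g. 16 KiB per token of KV cache against a 1 MiB budget -> 64 tokens fit
assert largest_fitting(1024, lambda n: n * 16384 <= 2**20) == 64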
@@ -1301,6 +1309,140 @@ def _report_kv_cache_config(
     )
 
 
+def _max_memory_usage_bytes_from_groups(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+) -> int:
+    """
+    Calculate maximum memory usage in bytes from KV cache groups.
+
+    This correctly accounts for padding in hybrid models. For example, if a
+    model has 8 full attention layers and 9 sliding window layers, they will
+    be padded to 9 full + 9 sliding window for uniform group sizes.
+    """
+    if not kv_cache_groups:
+        return 0
+
+    # UniformTypeKVCacheSpecs special case (single group, per-layer specs)
+    if len(kv_cache_groups) == 1 and isinstance(
+        kv_cache_groups[0].kv_cache_spec, UniformTypeKVCacheSpecs
+    ):
+        per_layer_specs = kv_cache_groups[0].kv_cache_spec.kv_cache_specs
+        return sum(
+            spec.max_memory_usage_bytes(vllm_config)
+            for spec in per_layer_specs.values()
+        )
+
+    # General case: group_size pools, each shared by one layer per group
+    # Memory = group_size * page_size * blocks_for_max_len
+    group_size = max(len(group.layer_names) for group in kv_cache_groups)
+    page_size = get_uniform_page_size(
+        [group.kv_cache_spec for group in kv_cache_groups]
+    )
+    any_spec = kv_cache_groups[0].kv_cache_spec
+    blocks_needed = cdiv(any_spec.max_memory_usage_bytes(vllm_config), page_size)
+
+    return group_size * page_size * blocks_needed
+
+
+def _estimate_max_model_len_from_groups(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+    available_memory: int,
+) -> int:
+    """
+    Binary search for the maximum model length that fits in available memory.
+    Returns 0 if even 1 token doesn't fit.
+    """
+    original_max = vllm_config.model_config.max_model_len
+
+    def fits(model_len: int) -> bool:
+        vllm_config.model_config.max_model_len = model_len
+        return (
+            _max_memory_usage_bytes_from_groups(vllm_config, kv_cache_groups)
+            <= available_memory
+        )
+
+    try:
+        left, right = 1, original_max
+        if not fits(left):
+            return 0
+        result = 1
+        while left <= right:
+            mid = (left + right) // 2
+            if fits(mid):
+                result = mid
+                left = mid + 1
+            else:
+                right = mid - 1
+        return result
+    finally:
+        vllm_config.model_config.max_model_len = original_max
+
+
+def _auto_fit_max_model_len(
+    vllm_config: VllmConfig,
+    kv_cache_groups: list[KVCacheGroupSpec],
+    available_memory: list[int],
+) -> None:
+    """
+    When max_model_len is set to -1, this function estimates the largest
+    context length that can be supported with the available GPU memory.
+    It uses binary search to find the maximum length that fits across all
+    workers.
+
+    Args:
+        vllm_config: The global VllmConfig (will be modified in-place)
+        kv_cache_groups: The global KV cache groups (from get_kv_cache_groups).
+            This correctly accounts for padding in hybrid models.
+        available_memory: Memory available for KV cache in bytes for each
+            worker.
+    """
+    original_max = vllm_config.model_config.max_model_len
+
+    if not kv_cache_groups:
+        # All workers have empty specs (attention-free model)
+        logger.info_once(
+            "Auto-fit max_model_len: attention-free model, "
+            "using derived max_model_len=%d",
+            original_max,
+            scope="local",
+        )
+        return
+
+    # Use minimum available memory across all workers
+    min_available_memory = min(available_memory)
+    auto_fit_max = _estimate_max_model_len_from_groups(
+        vllm_config, kv_cache_groups, min_available_memory
+    )
+
+    if auto_fit_max <= 0:
+        raise ValueError(
+            "Cannot auto-fit max_model_len: not enough GPU memory available "
+            "to serve even a single token. Try increasing `gpu_memory_utilization`."
+        )
+
+    if auto_fit_max >= original_max:
+        # The model's full context length fits in memory
+        logger.info_once(
+            "Auto-fit max_model_len: full model context length %d fits in "
+            "available GPU memory",
+            original_max,
+            scope="local",
+        )
+    else:
+        # Need to reduce max_model_len to fit in memory
+        vllm_config.model_config.max_model_len = auto_fit_max
+        logger.info_once(
+            "Auto-fit max_model_len: reduced from %d to %d to fit in "
+            "available GPU memory (%.2f GiB available for KV cache)",
+            original_max,
+            auto_fit_max,
+            min_available_memory / GiB_bytes,
+            scope="local",
+        )
+
+
 def get_kv_cache_configs(
     vllm_config: VllmConfig,
     kv_cache_specs: list[dict[str, KVCacheSpec]],
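A rough numeric illustration of the padding described in the _max_memory_usage_bytes_from_groups docstring; the page size and block count below are invented purely for the example:

# Hybrid model from the docstring: 8 full-attention + 9 sliding-window layers.
# With uniform group sizes, the 8 full-attention layers are budgeted as if
# there were 9, so group_size = max(8, 9) = 9 in the formula above.
group_size = max(8, 9)
page_size = 2 * 1024 * 1024    # assumed 2 MiB per block (example only)
blocks_needed = 128            # assumed blocks to cover max_model_len (example only)
max_bytes = group_size * page_size * blocks_needed
assert max_bytes == 2304 * 1024 * 1024   # 2.25 GiB in this made-up setting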
@@ -1317,10 +1459,12 @@ def get_kv_cache_configs(
     1. Merge the KV cache specs of all workers to get the KVCacheSpecs for
        the whole model.
     2. Generate the KV cache groups based on the layer ratio of the whole model.
-    3. Generate the KV cache configs for each worker based on the KV cache
+       This also handles spec unification for hybrid models.
+    3. Handle auto-fit max_model_len and memory checks using the unified specs.
+    4. Generate the KV cache configs for each worker based on the KV cache
        grouping strategy. (This is reasonable because the layer ratio of
        different PP stages are similar.)
-    4. Change the num_blocks of each worker to the smallest among all workers
+    5. Change the num_blocks of each worker to the smallest among all workers
        and shrink tensor sizes proportionally to avoid allocating unused memory.
 
     Args:
@@ -1333,14 +1477,6 @@ def get_kv_cache_configs(
         The generated KVCacheConfigs for each worker.
     """
 
-    # Check if the available memory is enough for each worker.
-    for kv_cache_spec_one_worker, available_memory_one_worker in zip(
-        kv_cache_specs, available_memory
-    ):
-        check_enough_kv_cache_memory(
-            vllm_config, kv_cache_spec_one_worker, available_memory_one_worker
-        )
-
     # Merge the KV cache specs of all workers. Different PP stages may have
     # different layer names, and different TP ranks of the same PP stage should
     # have the same KV cache spec.
@@ -1354,8 +1490,52 @@ def get_kv_cache_configs(
                 "The KV cache specs for the same layer are different "
                 "across workers. This is not supported yet."
             )
 
+    # Get global KV cache groups. This also handles spec unification for
+    # hybrid models when disable_hybrid_kv_cache_manager is enabled.
+    # After this call, merged_kv_cache_specs may be modified in-place.
+    global_kv_cache_groups = get_kv_cache_groups(vllm_config, merged_kv_cache_specs)
+
+    # If original_max_model_len was -1, automatically
+    # determine the maximum model length that fits in available GPU memory.
+    # We use the global groups here to correctly account for padding.
+    if vllm_config.model_config.original_max_model_len == -1:
+        _auto_fit_max_model_len(vllm_config, global_kv_cache_groups, available_memory)
+
+    # Check if the available memory is enough (using min across all workers).
+    # We use the global groups to correctly account for padding.
+    if global_kv_cache_groups:
+        min_available_memory = min(available_memory)
+        if min_available_memory <= 0:
+            raise ValueError(
+                "No available memory for the cache blocks. "
+                "Try increasing `gpu_memory_utilization` when "
+                "initializing the engine."
+            )
+        max_model_len = vllm_config.model_config.max_model_len
+        needed_memory = _max_memory_usage_bytes_from_groups(
+            vllm_config, global_kv_cache_groups
+        )
+        if needed_memory > min_available_memory:
+            estimated_max_len = _estimate_max_model_len_from_groups(
+                vllm_config, global_kv_cache_groups, min_available_memory
+            )
+            estimated_msg = ""
+            if estimated_max_len > 0:
+                estimated_msg = (
+                    f"Based on the available memory, the estimated maximum "
+                    f"model length is {estimated_max_len}. "
+                )
+            raise ValueError(
+                f"To serve at least one request with the models's max seq len "
+                f"({max_model_len}), ({needed_memory / GiB_bytes:.2f} GiB KV "
+                f"cache is needed, which is larger than the available KV cache "
+                f"memory ({min_available_memory / GiB_bytes:.2f} GiB). "
+                f"{estimated_msg}"
+                f"Try increasing `gpu_memory_utilization` or decreasing "
+                f"`max_model_len` when initializing the engine."
+            )
+
     kv_cache_configs: list[KVCacheConfig] = []
     for kv_cache_spec_one_worker, available_memory_one_worker in zip(
         kv_cache_specs, available_memory
@@ -247,9 +247,20 @@ class EngineCore:
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
 
+        # Track max_model_len before KV cache config to detect auto-fit changes
+        max_model_len_before = vllm_config.model_config.max_model_len
+
         kv_cache_configs = get_kv_cache_configs(
            vllm_config, kv_cache_specs, available_gpu_memory
         )
+
+        # If auto-fit reduced max_model_len, sync the new value to workers.
+        # This is needed because workers were spawned before memory profiling
+        # and have the original (larger) max_model_len cached.
+        max_model_len_after = vllm_config.model_config.max_model_len
+        if max_model_len_after != max_model_len_before:
+            self.collective_rpc("update_max_model_len", args=(max_model_len_after,))
+
         scheduler_kv_cache_config = generate_scheduler_kv_cache_config(kv_cache_configs)
         num_gpu_blocks = scheduler_kv_cache_config.num_blocks
         num_cpu_blocks = 0
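The sync above relies on collective_rpc dispatching a method by name on every worker; update_max_model_len is the worker-side handler added in the next hunk. A condensed stand-in for that pattern (the fake worker and dispatcher below are purely illustrative; only the method name and the call shown in the hunk above come from the commit):

class _FakeWorker:
    def __init__(self) -> None:
        self.max_model_len = 8192

    def update_max_model_len(self, max_model_len: int) -> None:
        self.max_model_len = max_model_len


def fake_collective_rpc(workers, method: str, args=()):
    # Dispatch by attribute name on each worker, mirroring the fan-out above.
    return [getattr(w, method)(*args) for w in workers]


workers = [_FakeWorker(), _FakeWorker()]
fake_collective_rpc(workers, "update_max_model_len", args=(4096,))
assert all(w.max_model_len == 4096 for w in workers)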
@@ -387,6 +387,19 @@ class Worker(WorkerBase):
     def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
         return self.model_runner.get_kv_cache_spec()
 
+    def update_max_model_len(self, max_model_len: int) -> None:
+        """Update max_model_len after auto-fit to GPU memory.
+
+        This is called when max_model_len=-1 is used and the engine
+        automatically determines the maximum context length that fits
+        in GPU memory. Workers need to update their cached max_model_len
+        to match the engine's decision.
+        """
+        self.model_config.max_model_len = max_model_len
+        if self.model_runner is not None:
+            self.model_runner.max_model_len = max_model_len
+        logger.debug("Updated max_model_len to %d", max_model_len)
+
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""