[v1][core] Support for attention free models (#20811)
Signed-off-by: Christian Pinto <christian.pinto@ibm.com>
This commit is contained in:
parent 56fe4bedd6
commit 4ffd963fa0
@@ -78,7 +78,12 @@ class KVCacheManager:
     ) -> None:
         self.max_model_len = max_model_len
 
+        if len(kv_cache_config.kv_cache_groups) == 0:
+            # Attention free models don't have kv cache,
+            # thus don't need prefix caching.
+            enable_caching = False
         self.enable_caching = enable_caching
 
         self.caching_hash_fn = (
             sha256_cbor_64bit if caching_hash_algo == "sha256_cbor_64bit" else
             sha256 if caching_hash_algo == "sha256" else hash)
@@ -101,7 +106,7 @@ class KVCacheManager:
             kv_cache_config=kv_cache_config,
             max_model_len=self.max_model_len,
             use_eagle=self.use_eagle,
-            enable_caching=enable_caching,
+            enable_caching=self.enable_caching,
             caching_hash_fn=self.caching_hash_fn,
             enable_kv_cache_events=enable_kv_cache_events,
         )
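Taken together, the two KVCacheManager hunks mean that prefix caching is forced off whenever the resolved KVCacheConfig contains no KV cache groups, and the coordinator is then built from the stored self.enable_caching rather than the raw constructor argument. A minimal, hedged sketch of that resolution logic; StubKVCacheConfig and resolve_enable_caching are illustrative stand-ins, not vLLM APIs:

from dataclasses import dataclass, field


@dataclass
class StubKVCacheConfig:
    # Stand-in for vLLM's KVCacheConfig; only the field this sketch needs.
    kv_cache_groups: list = field(default_factory=list)


def resolve_enable_caching(kv_cache_config: StubKVCacheConfig,
                           enable_caching: bool) -> bool:
    # Mirrors the constructor logic above: no KV cache groups means an
    # attention free model, so prefix caching is forced off.
    if len(kv_cache_config.kv_cache_groups) == 0:
        return False
    return enable_caching


# Attention free model: caching is disabled even when requested.
assert resolve_enable_caching(StubKVCacheConfig(), enable_caching=True) is False
# Regular model with one group: the requested value is kept.
assert resolve_enable_caching(StubKVCacheConfig(kv_cache_groups=["group0"]),
                              enable_caching=True) is True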
@@ -563,6 +563,10 @@ def check_enough_kv_cache_memory(vllm_config: VllmConfig,
         ValueError: If there is not enough memory available for the KV cache.
     """
 
+    # No need to check for available memory if the kv_cache_spec is empty
+    if not kv_cache_spec:
+        return
+
     if available_memory <= 0:
         raise ValueError("No available memory for the cache blocks. "
                          "Try increasing `gpu_memory_utilization` when "
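The guard matters because attention free models report an empty spec and a zero memory budget; without the early return, the available_memory <= 0 check would raise for a model that needs no cache memory at all. A hypothetical stand-in showing the guarded control flow (check_enough_memory_sketch is not a vLLM function):

def check_enough_memory_sketch(kv_cache_spec: dict, available_memory: int) -> None:
    # Empty spec == attention free model: nothing to size, return before
    # the <= 0 check below can raise.
    if not kv_cache_spec:
        return
    if available_memory <= 0:
        raise ValueError("No available memory for the cache blocks.")


check_enough_memory_sketch({}, available_memory=0)          # attention free: no error
check_enough_memory_sketch({"layer.0": object()}, 1 << 30)  # regular model with memory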
@@ -749,6 +753,13 @@ def is_kv_cache_page_size_uniform(
     return len(page_sizes) == 1
 
 
+def is_kv_cache_type_attention_free(
+        kv_cache_spec: dict[str, KVCacheSpec]) -> bool:
+
+    # kv_cache_spec is an empty dict for attention free models
+    return not kv_cache_spec
+
+
 def _get_kv_cache_config_uniform_page_size(
         vllm_config: VllmConfig, kv_cache_spec: dict[str, KVCacheSpec],
         available_memory: int) -> KVCacheConfig:
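The new predicate leans on dict truthiness: attention free models produce an empty kv_cache_spec, so `not kv_cache_spec` is True. A quick illustration, where the spec value is a placeholder string rather than a real KVCacheSpec:

def is_kv_cache_type_attention_free(kv_cache_spec: dict) -> bool:
    # kv_cache_spec is an empty dict for attention free models
    return not kv_cache_spec


assert is_kv_cache_type_attention_free({})
assert not is_kv_cache_type_attention_free({"model.layers.0.attn": "full-attention"})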
@@ -891,6 +902,10 @@ def _get_kv_cache_config_uniform_page_size(
     return kv_cache_config
 
 
+def _get_kv_cache_config_attention_free() -> KVCacheConfig:
+    return KVCacheConfig(num_blocks=1, kv_cache_tensors=[], kv_cache_groups=[])
+
+
 def unify_hybrid_kv_cache_specs(kv_cache_spec: dict[str, KVCacheSpec]):
     """
     This function tries to convert the KV cache specs to one type if the model
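The helper returns a deliberately near-empty config: a single dummy block (the comment added in get_kv_cache_config below notes this is what lets the KV cache manager handle attention free models) and no tensors or groups, so nothing is ever allocated on the GPU. Spelled out as a plain dict for illustration; the real return type is KVCacheConfig:

attention_free_kv_cache_config = {
    "num_blocks": 1,         # one placeholder block, never backed by GPU memory
    "kv_cache_tensors": [],  # nothing to allocate
    "kv_cache_groups": [],   # no attention layers to group
}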
@@ -957,7 +972,11 @@ def get_kv_cache_config(
     if vllm_config.scheduler_config.disable_hybrid_kv_cache_manager:
         unify_hybrid_kv_cache_specs(kv_cache_spec)
 
-    if is_kv_cache_type_uniform(kv_cache_spec):
+    if is_kv_cache_type_attention_free(kv_cache_spec):
+        # This returns a kv_cache config with 0 kv_cache groups and 1 block
+        # to allow for the KVCache manager to handle attention free models.
+        return _get_kv_cache_config_attention_free()
+    elif is_kv_cache_type_uniform(kv_cache_spec):
         # KV cache of all layers are the same, which is true for
         # most models. Allocate the same amount of memory for
         # each layer.
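Note the branch ordering: the attention free check runs before the uniform-type check, so an empty spec short-circuits straight to the placeholder config instead of being sized against profiled memory. A compressed, hedged sketch of that dispatch; the "uniform" branch here is heavily simplified and not vLLM's actual sizing logic:

def choose_kv_cache_config_sketch(kv_cache_spec: dict, available_memory: int) -> dict:
    if not kv_cache_spec:  # is_kv_cache_type_attention_free
        # 0 groups and 1 block, mirroring _get_kv_cache_config_attention_free().
        return {"num_blocks": 1, "kv_cache_tensors": [], "kv_cache_groups": []}
    if len({type(spec) for spec in kv_cache_spec.values()}) == 1:  # "uniform" stand-in
        page_size_bytes = 4096  # placeholder page size
        return {"num_blocks": available_memory // page_size_bytes}
    raise NotImplementedError("hybrid specs are outside this sketch")


assert choose_kv_cache_config_sketch({}, available_memory=0)["num_blocks"] == 1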
@@ -139,7 +139,13 @@ class EngineCore:
 
         # Profiles the peak memory usage of the model to determine how much
         # memory can be allocated for kv cache.
-        available_gpu_memory = self.model_executor.determine_available_memory()
+        has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
+        if has_kv_cache:
+            available_gpu_memory = \
+                self.model_executor.determine_available_memory()
+        else:
+            # Attention free models don't need memory for kv cache
+            available_gpu_memory = [0] * len(kv_cache_specs)
 
         assert len(kv_cache_specs) == len(available_gpu_memory)
         # Get the kv cache tensor size
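On the engine side, memory profiling now only runs when at least one worker reports a non-empty spec; if every per-worker spec dict is empty, each worker simply gets a zero KV cache budget. A self-contained sketch of that decision, with a hypothetical profile_fn standing in for model_executor.determine_available_memory():

from typing import Callable, Optional


def gpu_memory_for_kv_cache(
        kv_cache_specs: list[dict],
        profile_fn: Optional[Callable[[], list[int]]]) -> list[int]:
    # True if at least one worker has attention layers to cache.
    has_kv_cache = any(kv_cache_spec for kv_cache_spec in kv_cache_specs)
    if has_kv_cache:
        return profile_fn()
    # Attention free models don't need memory for kv cache
    return [0] * len(kv_cache_specs)


# Two attention free workers: profiling is skipped, zero budget each.
assert gpu_memory_for_kv_cache([{}, {}], profile_fn=None) == [0, 0]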