# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from unittest.mock import MagicMock

import pytest
import torch

from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory
from vllm.v1.kv_cache_interface import FullAttentionSpec
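
# These tests cover the out-of-memory error paths of check_enough_kv_cache_memory:
# it should raise ValueError both when no memory is available for KV cache blocks
# and when the available memory is smaller than the estimated requirement.
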

def test_kv_cache_oom_no_memory():
    # With a zero memory budget there is nothing to allocate KV cache blocks
    # from, so the check must fail immediately.
    config = MagicMock()
    config.model_config.max_model_len = 2048

    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype=torch.float16,
        )
    }

    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 0)


def test_kv_cache_oom_insufficient_memory(monkeypatch):
    # A non-zero but too-small memory budget must also be rejected.
    config = MagicMock()
    config.model_config.max_model_len = 2048
    config.cache_config.block_size = 16
    config.parallel_config.tensor_parallel_size = 1
    config.parallel_config.pipeline_parallel_size = 1
    config.parallel_config.decode_context_parallel_size = 1

    # Pin the estimated KV cache requirement at 100 GiB so that the 1 GiB
    # budget passed below is guaranteed to be insufficient.
    monkeypatch.setattr(
        "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
        lambda c, s: 100 * 1024**3,  # 100 GiB
    )

    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype=torch.float16,
        )
    }

    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 1024**3)  # 1 GiB
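

# Illustrative sketch (not part of the original tests): the complementary
# happy-path case, using the same monkeypatching approach as above. It assumes
# that check_enough_kv_cache_memory simply returns when the budget covers the
# estimated requirement. The test name, the mocked 1 GiB requirement, and the
# 10 GiB budget are hypothetical choices made only for this sketch.
def test_kv_cache_enough_memory_sketch(monkeypatch):
    config = MagicMock()
    config.model_config.max_model_len = 2048
    config.cache_config.block_size = 16
    config.parallel_config.tensor_parallel_size = 1
    config.parallel_config.pipeline_parallel_size = 1
    config.parallel_config.decode_context_parallel_size = 1

    # Pretend the model only needs 1 GiB of KV cache.
    monkeypatch.setattr(
        "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
        lambda c, s: 1024**3,  # 1 GiB
    )

    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype=torch.float16,
        )
    }

    # A 10 GiB budget comfortably covers the mocked 1 GiB requirement,
    # so no ValueError is expected here.
    check_enough_kv_cache_memory(config, spec, 10 * 1024**3)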