vllm/tests/v1/engine/test_init_error_messaging.py
shivampr 8580919ac3
[Bugfix] fix confusing OOM errors during v1 init (#28051)
Signed-off-by: Shivam <shivamprasad91@gmail.com>
Signed-off-by: shivampr <shivampr.dev@gmail.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-12-10 23:17:41 +00:00

55 lines
1.4 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory
from vllm.v1.kv_cache_interface import FullAttentionSpec
def test_kv_cache_oom_no_memory():
from unittest.mock import MagicMock
config = MagicMock()
config.model_config.max_model_len = 2048
spec = {
"layer_0": FullAttentionSpec(
block_size=16,
num_kv_heads=8,
head_size=128,
dtype="float16",
)
}
with pytest.raises(ValueError):
check_enough_kv_cache_memory(config, spec, 0)
def test_kv_cache_oom_insufficient_memory(monkeypatch):
from unittest.mock import MagicMock
config = MagicMock()
config.model_config.max_model_len = 2048
config.cache_config.block_size = 16
config.parallel_config.tensor_parallel_size = 1
config.parallel_config.pipeline_parallel_size = 1
config.parallel_config.decode_context_parallel_size = 1
monkeypatch.setattr(
"vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
lambda c, s: 100 * 1024**3, # 100 GiB
)
spec = {
"layer_0": FullAttentionSpec(
block_size=16,
num_kv_heads=8,
head_size=128,
dtype="float16",
)
}
with pytest.raises(ValueError):
check_enough_kv_cache_memory(config, spec, 1024**3) # 1 GiB