vllm/tests/v1/engine/test_init_error_messaging.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm.v1.core.kv_cache_utils import check_enough_kv_cache_memory
from vllm.v1.kv_cache_interface import FullAttentionSpec


def test_kv_cache_oom_no_memory():
    from unittest.mock import MagicMock

    config = MagicMock()
    config.model_config.max_model_len = 2048

    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype="float16",
        )
    }

    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 0)


def test_kv_cache_oom_insufficient_memory(monkeypatch):
    from unittest.mock import MagicMock

    config = MagicMock()
    config.model_config.max_model_len = 2048
    config.cache_config.block_size = 16
    config.parallel_config.tensor_parallel_size = 1
    config.parallel_config.pipeline_parallel_size = 1
    config.parallel_config.decode_context_parallel_size = 1

    monkeypatch.setattr(
        "vllm.v1.core.kv_cache_utils.max_memory_usage_bytes",
        lambda c, s: 100 * 1024**3,  # 100 GiB
    )

    spec = {
        "layer_0": FullAttentionSpec(
            block_size=16,
            num_kv_heads=8,
            head_size=128,
            dtype="float16",
        )
    }

    with pytest.raises(ValueError):
        check_enough_kv_cache_memory(config, spec, 1024**3)  # 1 GiB