diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 2fc65c7fb6584..0914c899aa5b8 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -300,10 +300,12 @@ steps:
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/metrics
+  - pytest -v -s v1/test_kv_sharing.py
+  - pytest -v -s v1/test_metrics_reader.py
+  - pytest -v -s v1/test_oracle.py
+  - pytest -v -s v1/test_request.py
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s v1/test_utils.py
-  - pytest -v -s v1/test_oracle.py
-  - pytest -v -s v1/test_metrics_reader.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/test_kv_sharing.py
index 96848047145b6..31a74101faf92 100644
--- a/tests/v1/test_kv_sharing.py
+++ b/tests/v1/test_kv_sharing.py
@@ -1,17 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from unittest.mock import Mock
-
 import torch
 
-from vllm.v1.attention.backends.flash_attn import (
-    FlashAttentionBackend, FlashAttentionMetadataBuilder)
-from vllm.v1.attention.backends.flex_attention import (
-    FlexAttentionBackend, FlexAttentionMetadataBuilder)
 from vllm.v1.kv_cache_interface import FullAttentionSpec, KVCacheGroupSpec
-from vllm.v1.worker.utils import (AttentionGroup,
-                                  initialize_kv_cache_for_kv_sharing)
+from vllm.v1.worker.utils import add_kv_sharing_layers_to_kv_cache_groups
 
 
 def new_kv_cache_spec():
@@ -37,56 +30,17 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
                          new_kv_cache_spec()),
     ]
 
-    attn_groups = [
-        # KV cache group 0 has two attention groups
-        [
-            AttentionGroup(
-                backend=FlashAttentionBackend,
-                metadata_builder=Mock(spec=FlashAttentionMetadataBuilder),
-                layer_names=["model.layers.0"],
-            ),
-            AttentionGroup(
-                backend=FlexAttentionBackend,
-                metadata_builder=Mock(spec=FlexAttentionMetadataBuilder),
-                layer_names=["model.layers.1"],
-            ),
-        ],
-    ]
-
-    # Only layers 0 and 1 will have KV caches allocated
-    kv_caches = {
-        "model.layers.0": torch.zeros(1, 2, 3),
-        "model.layers.1": torch.ones(1, 2, 3),
-    }
-
-    initialize_kv_cache_for_kv_sharing(
+    add_kv_sharing_layers_to_kv_cache_groups(
         shared_kv_cache_layers=shared_kv_cache_layers,
         kv_cache_groups=kv_cache_groups,
-        kv_caches=kv_caches,
-        attn_groups=attn_groups,
     )
 
-    # Check that the KV caches were shared correctly
-    assert kv_caches["model.layers.2"].data_ptr(
-    ) == kv_caches["model.layers.0"].data_ptr()
-    assert kv_caches["model.layers.3"].data_ptr(
-    ) == kv_caches["model.layers.1"].data_ptr()
-
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 1
     assert kv_cache_groups[0].layer_names == [
         "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
     ]
 
-    # Check that the layers were added to the attention groups
-    assert len(attn_groups) == 1 and len(attn_groups[0]) == 2
-    assert attn_groups[0][0].layer_names == [
-        "model.layers.0", "model.layers.2"
-    ]
-    assert attn_groups[0][1].layer_names == [
-        "model.layers.1", "model.layers.3"
-    ]
-
 
 def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
     """
@@ -103,48 +57,17 @@ def test_initialize_kv_cache_for_kv_sharing_same_attn_groups():
                          new_kv_cache_spec()),
     ]
 
-    attn_groups = [
-        # KV cache group 0 has a single attention group
-        # as all layers have the same flash attention backend
-        [
-            AttentionGroup(
-                backend=FlashAttentionBackend,
-                metadata_builder=Mock(spec=FlashAttentionMetadataBuilder),
-                layer_names=["model.layers.0", "model.layers.1"],
-            ),
-        ],
-    ]
-
-    kv_caches = {
-        "model.layers.0": torch.zeros(1, 2, 3),
-        "model.layers.1": torch.ones(1, 2, 3),
-    }
-
-    initialize_kv_cache_for_kv_sharing(
+    add_kv_sharing_layers_to_kv_cache_groups(
         shared_kv_cache_layers=shared_kv_cache_layers,
         kv_cache_groups=kv_cache_groups,
-        kv_caches=kv_caches,
-        attn_groups=attn_groups,
     )
 
-    # Check that the KV caches were shared correctly
-    assert kv_caches["model.layers.2"].data_ptr(
-    ) == kv_caches["model.layers.0"].data_ptr()
-    assert kv_caches["model.layers.3"].data_ptr(
-    ) == kv_caches["model.layers.1"].data_ptr()
-
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 1
     assert kv_cache_groups[0].layer_names == [
         "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
     ]
 
-    # Check that the layers were added to the attention groups
-    assert len(attn_groups) == 1 and len(attn_groups[0]) == 1
-    assert attn_groups[0][0].layer_names == [
-        "model.layers.0", "model.layers.1", "model.layers.2", "model.layers.3"
-    ]
-
 
 def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
    """
@@ -162,23 +85,11 @@ def test_initialize_kv_cache_for_kv_sharing_no_attn_groups():
         KVCacheGroupSpec(["model.layers.1"], new_kv_cache_spec()),
     ]
 
-    kv_caches = {
-        "model.layers.0": torch.zeros(1, 2, 3),
-        "model.layers.1": torch.ones(1, 2, 3),
-    }
-
-    initialize_kv_cache_for_kv_sharing(
+    add_kv_sharing_layers_to_kv_cache_groups(
         shared_kv_cache_layers=shared_kv_cache_layers,
         kv_cache_groups=kv_cache_groups,
-        kv_caches=kv_caches,
     )
 
-    # Check that the KV caches were shared correctly
-    assert kv_caches["model.layers.2"].data_ptr(
-    ) == kv_caches["model.layers.0"].data_ptr()
-    assert kv_caches["model.layers.3"].data_ptr(
-    ) == kv_caches["model.layers.1"].data_ptr()
-
     # Check that the layers were added to the correct KV cache group
     assert len(kv_cache_groups) == 2
     assert kv_cache_groups[0].layer_names == [