diff --git a/.buildkite/scripts/hardware_ci/run-xpu-test.sh b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
index 1fc3dbd8c21f4..6b9c0121c4aa8 100644
--- a/.buildkite/scripts/hardware_ci/run-xpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-xpu-test.sh
@@ -44,7 +44,6 @@ docker run \
   pytest -v -s v1/structured_output
   pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
   pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+  pytest -v -s v1/test_metrics
   pytest -v -s v1/test_serial_utils.py
-  pytest -v -s v1/test_utils.py
-  pytest -v -s v1/test_metrics_reader.py
 '
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index c6c4e2a2309fc..e603c1582e1fb 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -159,10 +159,7 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
@@ -180,10 +177,10 @@ steps:
   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
@@ -300,12 +297,9 @@ steps:
   - pytest -v -s v1/spec_decode
   - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/metrics
-  - pytest -v -s v1/test_kv_sharing.py
-  - pytest -v -s v1/test_metrics_reader.py
   - pytest -v -s v1/test_oracle.py
   - pytest -v -s v1/test_request.py
   - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_utils.py
   # Integration test for streaming correctness (requires special branch).
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
@@ -895,14 +889,13 @@ steps:
   - tests/compile/test_wrapper.py
   - tests/distributed/
   - tests/entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - tests/v1/shutdown
   - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
diff --git a/tests/v1/test_kv_sharing.py b/tests/v1/core/test_kv_sharing.py
similarity index 100%
rename from tests/v1/test_kv_sharing.py
rename to tests/v1/core/test_kv_sharing.py
diff --git a/tests/v1/distributed/__init__.py b/tests/v1/distributed/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/v1/test_async_llm_dp.py b/tests/v1/distributed/test_async_llm_dp.py
similarity index 100%
rename from tests/v1/test_async_llm_dp.py
rename to tests/v1/distributed/test_async_llm_dp.py
diff --git a/tests/v1/test_external_lb_dp.py b/tests/v1/distributed/test_external_lb_dp.py
similarity index 100%
rename from tests/v1/test_external_lb_dp.py
rename to tests/v1/distributed/test_external_lb_dp.py
diff --git a/tests/v1/test_hybrid_lb_dp.py b/tests/v1/distributed/test_hybrid_lb_dp.py
similarity index 99%
rename from tests/v1/test_hybrid_lb_dp.py
rename to tests/v1/distributed/test_hybrid_lb_dp.py
index 552436f818d77..21d8009a6dbb7 100644
--- a/tests/v1/test_hybrid_lb_dp.py
+++ b/tests/v1/distributed/test_hybrid_lb_dp.py
@@ -12,7 +12,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/test_internal_lb_dp.py b/tests/v1/distributed/test_internal_lb_dp.py
similarity index 99%
rename from tests/v1/test_internal_lb_dp.py
rename to tests/v1/distributed/test_internal_lb_dp.py
index e965645711ee6..3f9defd13dead 100644
--- a/tests/v1/test_internal_lb_dp.py
+++ b/tests/v1/distributed/test_internal_lb_dp.py
@@ -13,7 +13,7 @@ import pytest_asyncio
 import requests
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 from vllm.platforms import current_platform
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/entrypoints/openai/test_multi_api_servers.py b/tests/v1/entrypoints/openai/test_multi_api_servers.py
index f7c31b0c43778..35f75191d9c8d 100644
--- a/tests/v1/entrypoints/openai/test_multi_api_servers.py
+++ b/tests/v1/entrypoints/openai/test_multi_api_servers.py
@@ -8,7 +8,7 @@ import pytest
 import pytest_asyncio
 
 from tests.utils import RemoteOpenAIServer
-from tests.v1.test_utils import check_request_balancing
+from tests.v1.utils import check_request_balancing
 
 MODEL_NAME = "ibm-research/PowerMoE-3b"
diff --git a/tests/v1/test_metrics_reader.py b/tests/v1/metrics/test_metrics_reader.py
similarity index 100%
rename from tests/v1/test_metrics_reader.py
rename to tests/v1/metrics/test_metrics_reader.py
diff --git a/tests/v1/test_utils.py b/tests/v1/utils.py
similarity index 67%
rename from tests/v1/test_utils.py
rename to tests/v1/utils.py
index 00d98a873a310..b3f560c11e8f5 100644
--- a/tests/v1/test_utils.py
+++ b/tests/v1/utils.py
@@ -1,71 +1,10 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 import pytest
 import regex as re
 import requests
-import torch
 
 from tests.utils import RemoteOpenAIServer
-from vllm.v1.worker.utils import bind_kv_cache
-
-
-def test_bind_kv_cache():
-    from vllm.attention import Attention
-
-    ctx = {
-        'layers.0.self_attn': Attention(32, 128, 0.1),
-        'layers.1.self_attn': Attention(32, 128, 0.1),
-        'layers.2.self_attn': Attention(32, 128, 0.1),
-        'layers.3.self_attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'layers.0.self_attn': torch.zeros((1, )),
-        'layers.1.self_attn': torch.zeros((1, )),
-        'layers.2.self_attn': torch.zeros((1, )),
-        'layers.3.self_attn': torch.zeros((1, )),
-    }
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
-        'layers.0.self_attn']
-    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
-        'layers.1.self_attn']
-    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
-        'layers.2.self_attn']
-    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
-        'layers.3.self_attn']
-
-    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
-    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
-    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
-    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
-
-
-def test_bind_kv_cache_non_attention():
-    from vllm.attention import Attention
-
-    # example from Jamba PP=2
-    ctx = {
-        'model.layers.20.attn': Attention(32, 128, 0.1),
-        'model.layers.28.attn': Attention(32, 128, 0.1),
-    }
-    kv_cache = {
-        'model.layers.20.attn': torch.zeros((1, )),
-        'model.layers.28.attn': torch.zeros((1, )),
-    }
-
-    runner_kv_caches: list[torch.Tensor] = []
-    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
-
-    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
-        'model.layers.20.attn']
-    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
-        'model.layers.28.attn']
-
-    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
-    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
-
 # Prometheus metrics utilities for testing
diff --git a/tests/v1/worker/test_utils.py b/tests/v1/worker/test_utils.py
new file mode 100644
index 0000000000000..fd0e630ce178a
--- /dev/null
+++ b/tests/v1/worker/test_utils.py
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm.v1.worker.utils import bind_kv_cache
+
+
+def test_bind_kv_cache():
+    from vllm.attention import Attention
+
+    ctx = {
+        'layers.0.self_attn': Attention(32, 128, 0.1),
+        'layers.1.self_attn': Attention(32, 128, 0.1),
+        'layers.2.self_attn': Attention(32, 128, 0.1),
+        'layers.3.self_attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'layers.0.self_attn': torch.zeros((1, )),
+        'layers.1.self_attn': torch.zeros((1, )),
+        'layers.2.self_attn': torch.zeros((1, )),
+        'layers.3.self_attn': torch.zeros((1, )),
+    }
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+    assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
+        'layers.0.self_attn']
+    assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
+        'layers.1.self_attn']
+    assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
+        'layers.2.self_attn']
+    assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
+        'layers.3.self_attn']
+
+    assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
+    assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
+    assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
+    assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
+
+
+def test_bind_kv_cache_non_attention():
+    from vllm.attention import Attention
+
+    # example from Jamba PP=2
+    ctx = {
+        'model.layers.20.attn': Attention(32, 128, 0.1),
+        'model.layers.28.attn': Attention(32, 128, 0.1),
+    }
+    kv_cache = {
+        'model.layers.20.attn': torch.zeros((1, )),
+        'model.layers.28.attn': torch.zeros((1, )),
+    }
+
+    runner_kv_caches: list[torch.Tensor] = []
+    bind_kv_cache(kv_cache, ctx, runner_kv_caches)
+
+    assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
+        'model.layers.20.attn']
+    assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
+        'model.layers.28.attn']
+
+    assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
+    assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']