mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-24 02:35:01 +08:00
[CI/Build] Reorganize root-level V1 tests (#25767)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: yewentao256 <zhyanwentao@126.com>
This commit is contained in:
parent
a8913725a1
commit
0f97a2e1db
@ -44,7 +44,6 @@ docker run \
|
|||||||
pytest -v -s v1/structured_output
|
pytest -v -s v1/structured_output
|
||||||
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
|
||||||
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
|
||||||
|
pytest -v -s v1/test_metrics
|
||||||
pytest -v -s v1/test_serial_utils.py
|
pytest -v -s v1/test_serial_utils.py
|
||||||
pytest -v -s v1/test_utils.py
|
|
||||||
pytest -v -s v1/test_metrics_reader.py
|
|
||||||
'
|
'
|
||||||
|
|||||||
@ -159,10 +159,7 @@ steps:
|
|||||||
- examples/offline_inference/rlhf.py
|
- examples/offline_inference/rlhf.py
|
||||||
- examples/offline_inference/rlhf_colocate.py
|
- examples/offline_inference/rlhf_colocate.py
|
||||||
- tests/examples/offline_inference/data_parallel.py
|
- tests/examples/offline_inference/data_parallel.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/distributed
|
||||||
- tests/v1/test_external_lb_dp.py
|
|
||||||
- tests/v1/test_internal_lb_dp.py
|
|
||||||
- tests/v1/test_hybrid_lb_dp.py
|
|
||||||
- tests/v1/engine/test_engine_core_client.py
|
- tests/v1/engine/test_engine_core_client.py
|
||||||
- tests/distributed/test_symm_mem_allreduce.py
|
- tests/distributed/test_symm_mem_allreduce.py
|
||||||
commands:
|
commands:
|
||||||
@ -180,10 +177,10 @@ steps:
|
|||||||
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
- TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
|
||||||
# test with internal dp
|
# test with internal dp
|
||||||
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
- python3 ../examples/offline_inference/data_parallel.py --enforce-eager
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
|
||||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||||
- pytest -v -s distributed/test_utils.py
|
- pytest -v -s distributed/test_utils.py
|
||||||
- pytest -v -s compile/test_basic_correctness.py
|
- pytest -v -s compile/test_basic_correctness.py
|
||||||
@ -300,12 +297,9 @@ steps:
|
|||||||
- pytest -v -s v1/spec_decode
|
- pytest -v -s v1/spec_decode
|
||||||
- pytest -v -s v1/kv_connector/unit
|
- pytest -v -s v1/kv_connector/unit
|
||||||
- pytest -v -s v1/metrics
|
- pytest -v -s v1/metrics
|
||||||
- pytest -v -s v1/test_kv_sharing.py
|
|
||||||
- pytest -v -s v1/test_metrics_reader.py
|
|
||||||
- pytest -v -s v1/test_oracle.py
|
- pytest -v -s v1/test_oracle.py
|
||||||
- pytest -v -s v1/test_request.py
|
- pytest -v -s v1/test_request.py
|
||||||
- pytest -v -s v1/test_serial_utils.py
|
- pytest -v -s v1/test_serial_utils.py
|
||||||
- pytest -v -s v1/test_utils.py
|
|
||||||
# Integration test for streaming correctness (requires special branch).
|
# Integration test for streaming correctness (requires special branch).
|
||||||
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
- pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
|
||||||
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||||
@ -895,14 +889,13 @@ steps:
|
|||||||
- tests/compile/test_wrapper.py
|
- tests/compile/test_wrapper.py
|
||||||
- tests/distributed/
|
- tests/distributed/
|
||||||
- tests/entrypoints/llm/test_collective_rpc.py
|
- tests/entrypoints/llm/test_collective_rpc.py
|
||||||
- tests/v1/test_async_llm_dp.py
|
- tests/v1/distributed
|
||||||
- tests/v1/test_external_lb_dp.py
|
|
||||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- tests/v1/shutdown
|
- tests/v1/shutdown
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||||
commands:
|
commands:
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
|
|||||||
0
tests/v1/distributed/__init__.py
Normal file
0
tests/v1/distributed/__init__.py
Normal file
@ -12,7 +12,7 @@ import pytest_asyncio
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from tests.v1.test_utils import check_request_balancing
|
from tests.v1.utils import check_request_balancing
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||||
@ -13,7 +13,7 @@ import pytest_asyncio
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from tests.v1.test_utils import check_request_balancing
|
from tests.v1.utils import check_request_balancing
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||||
@ -8,7 +8,7 @@ import pytest
|
|||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from tests.v1.test_utils import check_request_balancing
|
from tests.v1.utils import check_request_balancing
|
||||||
|
|
||||||
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
MODEL_NAME = "ibm-research/PowerMoE-3b"
|
||||||
|
|
||||||
|
|||||||
@ -1,71 +1,10 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import regex as re
|
import regex as re
|
||||||
import requests
|
import requests
|
||||||
import torch
|
|
||||||
|
|
||||||
from tests.utils import RemoteOpenAIServer
|
from tests.utils import RemoteOpenAIServer
|
||||||
from vllm.v1.worker.utils import bind_kv_cache
|
|
||||||
|
|
||||||
|
|
||||||
def test_bind_kv_cache():
|
|
||||||
from vllm.attention import Attention
|
|
||||||
|
|
||||||
ctx = {
|
|
||||||
'layers.0.self_attn': Attention(32, 128, 0.1),
|
|
||||||
'layers.1.self_attn': Attention(32, 128, 0.1),
|
|
||||||
'layers.2.self_attn': Attention(32, 128, 0.1),
|
|
||||||
'layers.3.self_attn': Attention(32, 128, 0.1),
|
|
||||||
}
|
|
||||||
kv_cache = {
|
|
||||||
'layers.0.self_attn': torch.zeros((1, )),
|
|
||||||
'layers.1.self_attn': torch.zeros((1, )),
|
|
||||||
'layers.2.self_attn': torch.zeros((1, )),
|
|
||||||
'layers.3.self_attn': torch.zeros((1, )),
|
|
||||||
}
|
|
||||||
runner_kv_caches: list[torch.Tensor] = []
|
|
||||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
|
||||||
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
|
|
||||||
'layers.0.self_attn']
|
|
||||||
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
|
|
||||||
'layers.1.self_attn']
|
|
||||||
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
|
|
||||||
'layers.2.self_attn']
|
|
||||||
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
|
|
||||||
'layers.3.self_attn']
|
|
||||||
|
|
||||||
assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
|
|
||||||
assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
|
|
||||||
assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
|
|
||||||
assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
|
|
||||||
|
|
||||||
|
|
||||||
def test_bind_kv_cache_non_attention():
|
|
||||||
from vllm.attention import Attention
|
|
||||||
|
|
||||||
# example from Jamba PP=2
|
|
||||||
ctx = {
|
|
||||||
'model.layers.20.attn': Attention(32, 128, 0.1),
|
|
||||||
'model.layers.28.attn': Attention(32, 128, 0.1),
|
|
||||||
}
|
|
||||||
kv_cache = {
|
|
||||||
'model.layers.20.attn': torch.zeros((1, )),
|
|
||||||
'model.layers.28.attn': torch.zeros((1, )),
|
|
||||||
}
|
|
||||||
|
|
||||||
runner_kv_caches: list[torch.Tensor] = []
|
|
||||||
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
|
||||||
|
|
||||||
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
|
|
||||||
'model.layers.20.attn']
|
|
||||||
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
|
|
||||||
'model.layers.28.attn']
|
|
||||||
|
|
||||||
assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
|
|
||||||
assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
|
|
||||||
|
|
||||||
|
|
||||||
# Prometheus metrics utilities for testing
|
# Prometheus metrics utilities for testing
|
||||||
|
|
||||||
63
tests/v1/worker/test_utils.py
Normal file
63
tests/v1/worker/test_utils.py
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from vllm.v1.worker.utils import bind_kv_cache
|
||||||
|
|
||||||
|
|
||||||
|
def test_bind_kv_cache():
|
||||||
|
from vllm.attention import Attention
|
||||||
|
|
||||||
|
ctx = {
|
||||||
|
'layers.0.self_attn': Attention(32, 128, 0.1),
|
||||||
|
'layers.1.self_attn': Attention(32, 128, 0.1),
|
||||||
|
'layers.2.self_attn': Attention(32, 128, 0.1),
|
||||||
|
'layers.3.self_attn': Attention(32, 128, 0.1),
|
||||||
|
}
|
||||||
|
kv_cache = {
|
||||||
|
'layers.0.self_attn': torch.zeros((1, )),
|
||||||
|
'layers.1.self_attn': torch.zeros((1, )),
|
||||||
|
'layers.2.self_attn': torch.zeros((1, )),
|
||||||
|
'layers.3.self_attn': torch.zeros((1, )),
|
||||||
|
}
|
||||||
|
runner_kv_caches: list[torch.Tensor] = []
|
||||||
|
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||||
|
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[
|
||||||
|
'layers.0.self_attn']
|
||||||
|
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[
|
||||||
|
'layers.1.self_attn']
|
||||||
|
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[
|
||||||
|
'layers.2.self_attn']
|
||||||
|
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[
|
||||||
|
'layers.3.self_attn']
|
||||||
|
|
||||||
|
assert runner_kv_caches[0] is kv_cache['layers.0.self_attn']
|
||||||
|
assert runner_kv_caches[1] is kv_cache['layers.1.self_attn']
|
||||||
|
assert runner_kv_caches[2] is kv_cache['layers.2.self_attn']
|
||||||
|
assert runner_kv_caches[3] is kv_cache['layers.3.self_attn']
|
||||||
|
|
||||||
|
|
||||||
|
def test_bind_kv_cache_non_attention():
|
||||||
|
from vllm.attention import Attention
|
||||||
|
|
||||||
|
# example from Jamba PP=2
|
||||||
|
ctx = {
|
||||||
|
'model.layers.20.attn': Attention(32, 128, 0.1),
|
||||||
|
'model.layers.28.attn': Attention(32, 128, 0.1),
|
||||||
|
}
|
||||||
|
kv_cache = {
|
||||||
|
'model.layers.20.attn': torch.zeros((1, )),
|
||||||
|
'model.layers.28.attn': torch.zeros((1, )),
|
||||||
|
}
|
||||||
|
|
||||||
|
runner_kv_caches: list[torch.Tensor] = []
|
||||||
|
bind_kv_cache(kv_cache, ctx, runner_kv_caches)
|
||||||
|
|
||||||
|
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[
|
||||||
|
'model.layers.20.attn']
|
||||||
|
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[
|
||||||
|
'model.layers.28.attn']
|
||||||
|
|
||||||
|
assert runner_kv_caches[0] is kv_cache['model.layers.20.attn']
|
||||||
|
assert runner_kv_caches[1] is kv_cache['model.layers.28.attn']
|
||||||
Loading…
x
Reference in New Issue
Block a user