[ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
This commit is contained in:
parent 8c025fa703
commit 04e5583425

The change removes the long list of per-model, per-backend test commands from the CI pipeline and moves those combinations into pytest parametrizations inside the distributed test files, so each file is covered by a single command per hardware suite.
.buildkite/test-pipeline.yaml
@@ -82,20 +82,9 @@ steps:
num_gpus: 2
commands:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
- TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s distributed/test_chunked_prefill_distributed.py
- pytest -v -s distributed/test_multimodal_broadcast.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
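The commands can collapse like this because the model/backend combinations now live inside the test files, which gate hardware-specific cases on TARGET_TEST_SUITE. A condensed sketch of that mechanism (the parameter list is abbreviated here; the full version appears in the test diffs below):

```python
import os

import pytest

# Default to the L4 suite when the CI pipeline does not set the variable.
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


# Abbreviated parameter list; the real test files enumerate every
# model/backend/attention-backend combination.
@pytest.mark.parametrize("model, distributed_executor_backend, test_suite", [
    ("facebook/opt-125m", "ray", "L4"),
    ("facebook/opt-125m", "mp", "A100"),
])
def test_models(model, distributed_executor_backend, test_suite):
    # Cases that belong to another hardware suite are skipped, so a single
    # pytest invocation per file is enough on every runner.
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")
```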
@@ -107,11 +96,6 @@ steps:
fast_check: true
commands:
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

- label: Pipeline Parallelism Test
@@ -279,9 +263,6 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
tests/distributed/test_basic_distributed_correctness.py
@@ -1,15 +1,10 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.

Run:
```sh
cd $VLLM_PATH/tests

TEST_DIST_MODEL=facebook/opt-125m pytest \
    distributed/test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest \
    distributed/test_basic_distributed_correctness.py
pytest distributed/test_basic_distributed_correctness.py
```
"""
import os
@@ -19,27 +14,48 @@ import pytest
from vllm.utils import cuda_device_count_stateless

from ..models.utils import check_outputs_equal
from ..utils import fork_new_process_for_each_test

MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
        # test ray adag
        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"

    if attention_backend:
        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

    dtype = "half"
    max_tokens = 5

    # NOTE: take care of the order. run vLLM first, and then run HF.
    # vLLM needs a fresh new process without cuda initialization.
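The new tests wrap every parametrized case in `fork_new_process_for_each_test` (imported from `..utils`) because, per the note above, vLLM must start in a process where CUDA has not yet been initialized. A minimal sketch of such a decorator, assuming a POSIX `fork` start method and ignoring pytest-specific details like skip propagation; this is not the actual `tests/utils.py` implementation:

```python
import functools
import multiprocessing


def fork_new_process_for_each_test_sketch(f):
    """Run the wrapped test body in a freshly forked child process."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        # 'fork' copies the parent's Python state without re-importing,
        # and the child has not touched CUDA yet.
        ctx = multiprocessing.get_context("fork")
        proc = ctx.Process(target=f, args=args, kwargs=kwargs)
        proc.start()
        proc.join()
        # A non-zero exit code means the test body raised or was killed.
        assert proc.exitcode == 0, (
            f"forked test process exited with code {proc.exitcode}")

    return wrapper
```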
tests/distributed/test_chunked_prefill_distributed.py
@@ -1,46 +1,39 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.

Run:
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest \
    test_chunked_prefill_distributed.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest \
    test_chunked_prefill_distributed.py
pytest test_chunked_prefill_distributed.py
```
"""
import os

import pytest

from vllm.utils import cuda_device_count_stateless

from ..models.utils import check_outputs_equal

MODELS = [
    os.environ["TEST_DIST_MODEL"],
]
DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
from ..utils import fork_new_process_for_each_test


@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("facebook/opt-125m", "ray"),
    ("meta-llama/Llama-2-7b-hf", "ray"),
    ("facebook/opt-125m", "mp"),
    ("meta-llama/Llama-2-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
    chunked_prefill_token_size: int,
    distributed_executor_backend: str,
) -> None:
    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

    dtype = "half"
    max_tokens = 5
    chunked_prefill_token_size = 16

    # Add a chunked prefill config.
    max_num_seqs = min(chunked_prefill_token_size, 256)
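The hunk ends where the chunked prefill configuration is assembled. A hedged sketch of how those values typically feed the engine arguments, assuming the `vllm_runner` fixture forwards keyword arguments to the vLLM engine (the parameter names follow vLLM's standard engine args; this is illustrative, not copied from the file):

```python
# Illustrative only: derive the chunked prefill engine settings from the
# test's token size, mirroring the pattern used by the surrounding tests.
chunked_prefill_token_size = 16
max_num_seqs = min(chunked_prefill_token_size, 256)
max_num_batched_tokens = chunked_prefill_token_size

vllm_engine_kwargs = dict(
    dtype="half",
    tensor_parallel_size=2,
    max_num_seqs=max_num_seqs,
    max_num_batched_tokens=max_num_batched_tokens,
    enable_chunked_prefill=True,
    distributed_executor_backend="mp",  # or "ray"
)
# Typically used as:
#   with vllm_runner(model, **vllm_engine_kwargs) as vllm_model:
#       vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
```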
tests/distributed/test_multimodal_broadcast.py
@@ -1,44 +1,41 @@
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
The second test will hang if more than one test is run per command, so we need
to run the tests one by one. The solution is to pass arguments (model name) by
environment variables.

Run:
```sh
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf pytest -s -v \
    test_multimodal_broadcast.py
TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct pytest -s -v \
    test_multimodal_broadcast.py
pytest -s -v test_multimodal_broadcast.py
```
"""
import os

import pytest

from vllm.utils import cuda_device_count_stateless

model = os.environ["TEST_DIST_MODEL"]

if model.startswith("llava-hf/llava-1.5"):
    from ..models.test_llava import models, run_test
elif model.startswith("llava-hf/llava-v1.6"):
    from ..models.test_llava_next import models, run_test
else:
    raise NotImplementedError(f"Unsupported model: {model}")
from ..utils import fork_new_process_for_each_test


@pytest.mark.parametrize("tensor_parallel_size", [2])
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets,
                tensor_parallel_size: int, dtype: str, max_tokens: int,
                num_logprobs: int) -> None:
    if cuda_device_count_stateless() < tensor_parallel_size:
        pytest.skip(
            f"Need at least {tensor_parallel_size} GPUs to run the test.")
@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                    reason="Need at least 2 GPUs to run the test.")
@pytest.mark.parametrize("model, distributed_executor_backend", [
    ("llava-hf/llava-1.5-7b-hf", "ray"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
    ("llava-hf/llava-1.5-7b-hf", "mp"),
    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
])
@fork_new_process_for_each_test
def test_models(hf_runner, vllm_runner, image_assets, model: str,
                distributed_executor_backend: str) -> None:

    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
    dtype = "half"
    max_tokens = 5
    num_logprobs = 5
    tensor_parallel_size = 2

    if model.startswith("llava-hf/llava-1.5"):
        from ..models.test_llava import models, run_test
    elif model.startswith("llava-hf/llava-v1.6"):
        from ..models.test_llava_next import models, run_test
    else:
        raise NotImplementedError(f"Unsupported model: {model}")

    run_test(
        hf_runner,