[ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
parent: 8c025fa703
commit: 04e5583425
.buildkite/test-pipeline.yaml

@@ -82,20 +82,9 @@ steps:
   num_gpus: 2
   commands:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

@@ -107,11 +96,6 @@ steps:
   fast_check: true
   commands:
   - pytest -v -s distributed/test_pynccl.py
-  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
-  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

 - label: Pipeline Parallelism Test

@@ -279,9 +263,6 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s -x lora/test_mixtral.py
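The merged commands above hinge on a single TARGET_TEST_SUITE environment variable: the pipeline exports it per step (L4 on the 2-GPU step, A100 on the A100 step), and each parametrized test case carries a matching test_suite tag. A minimal, stand-alone sketch of that gating pattern, mirroring the test diff below (an illustration, not the project's code):

```python
import os

import pytest

# The CI command sets TARGET_TEST_SUITE (e.g. "L4" or "A100"); the test
# diff below defaults to "L4" for local runs, and so does this sketch.
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


@pytest.mark.parametrize("test_suite", ["L4", "A100"])
def test_suite_gate(test_suite: str) -> None:
    # Cases tagged for another suite are skipped instead of failing, so one
    # pytest invocation can carry every hardware variant.
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")
```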
tests/distributed/test_basic_distributed_correctness.py

@@ -1,15 +1,10 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-vLLM will allocate all the available memory, so we need to run the tests one
-by one. The solution is to pass arguments (model name) by environment
-variables.
 Run:
 ```sh
 cd $VLLM_PATH/tests

-TEST_DIST_MODEL=facebook/opt-125m pytest \
-    distributed/test_basic_distributed_correctness.py
-TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    distributed/test_basic_distributed_correctness.py
+pytest distributed/test_basic_distributed_correctness.py
 ```
 """
 import os

@@ -19,27 +14,48 @@ import pytest
 from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal
+from ..utils import fork_new_process_for_each_test

-MODELS = [
-    os.environ["TEST_DIST_MODEL"],
-]
-DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+@fork_new_process_for_each_test
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
-    dtype: str,
-    max_tokens: int,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
 ) -> None:
-    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
+
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    if attention_backend:
+        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+
+    dtype = "half"
+    max_tokens = 5
+
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
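The new @fork_new_process_for_each_test decorator (imported from the tests' shared utils module) is what lets formerly separate commands share one pytest invocation: each case runs in a fresh process, so CUDA is never initialized in the parent before vLLM starts. A rough sketch of what such a decorator can look like; this is an assumption for illustration, not necessarily vLLM's implementation:

```python
import functools
import os


def fork_new_process_for_each_test(f):
    """Run the wrapped test in a forked child process (POSIX only), so each
    parametrized case starts from a CUDA-uninitialized interpreter."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and exit immediately, so the pytest
            # machinery in the parent is not duplicated.
            try:
                f(*args, **kwargs)
            except BaseException:
                os._exit(1)
            os._exit(0)
        # Parent: propagate the child's result as an assertion.
        _, status = os.waitpid(pid, 0)
        assert os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0, (
            "forked test process did not exit cleanly")

    return wrapper
```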
tests/distributed/test_chunked_prefill_distributed.py

@@ -1,46 +1,39 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-vLLM will allocate all the available memory, so we need to run the tests one
-by one. The solution is to pass arguments (model name) by environment
-variables.

 Run:
 ```sh
-TEST_DIST_MODEL=facebook/opt-125m pytest \
-    test_chunked_prefill_distributed.py
-TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    test_chunked_prefill_distributed.py
+pytest test_chunked_prefill_distributed.py
 ```
 """
-import os

 import pytest

 from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal
+from ..utils import fork_new_process_for_each_test

-MODELS = [
-    os.environ["TEST_DIST_MODEL"],
-]
-DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("facebook/opt-125m", "ray"),
+    ("meta-llama/Llama-2-7b-hf", "ray"),
+    ("facebook/opt-125m", "mp"),
+    ("meta-llama/Llama-2-7b-hf", "mp"),
+])
+@fork_new_process_for_each_test
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
+    distributed_executor_backend: str,
 ) -> None:
-    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+
     # Add a chunked prefill config.
     max_num_seqs = min(chunked_prefill_token_size, 256)
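For context on the fixed values above: chunked_prefill_token_size bounds both the number of scheduled sequences and the per-step token budget. The hunk only shows the max_num_seqs derivation; the sketch below also names the engine keywords such a value typically feeds (enable_chunked_prefill, max_num_batched_tokens, taken from vLLM's engine arguments). That wiring sits outside the visible diff, so treat it as an assumption rather than the test's actual code:

```python
# Derivation of a chunked-prefill test configuration (sketch).
chunked_prefill_token_size = 16

# Shown in the diff: never schedule more sequences than one chunk can hold.
max_num_seqs = min(chunked_prefill_token_size, 256)

# Assumed wiring (outside the visible hunk): the chunk size becomes the
# per-iteration token budget once chunked prefill is enabled.
engine_kwargs = dict(
    enable_chunked_prefill=True,
    max_num_batched_tokens=chunked_prefill_token_size,
    max_num_seqs=max_num_seqs,
)
```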
tests/distributed/test_multimodal_broadcast.py

@@ -1,44 +1,41 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-The second test will hang if more than one test is run per command, so we need
-to run the tests one by one. The solution is to pass arguments (model name) by
-environment variables.

 Run:
 ```sh
-TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
-    test_multimodal_broadcast.py
-TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
-    test_multimodal_broadcast.py
+pytest -s -v test_multimodal_broadcast.py
 ```
 """
-import os

 import pytest

 from vllm.utils import cuda_device_count_stateless

-model = os.environ["TEST_DIST_MODEL"]
-
-if model.startswith("llava-hf/llava-1.5"):
-    from ..models.test_llava import models, run_test
-elif model.startswith("llava-hf/llava-v1.6"):
-    from ..models.test_llava_next import models, run_test
-else:
-    raise NotImplementedError(f"Unsupported model: {model}")
+from ..utils import fork_new_process_for_each_test


-@pytest.mark.parametrize("tensor_parallel_size", [2])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets,
-                tensor_parallel_size: int, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
-    if cuda_device_count_stateless() < tensor_parallel_size:
-        pytest.skip(
-            f"Need at least {tensor_parallel_size} GPUs to run the test.")
-
-    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("llava-hf/llava-1.5-7b-hf", "ray"),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
+    ("llava-hf/llava-1.5-7b-hf", "mp"),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
+])
+@fork_new_process_for_each_test
+def test_models(hf_runner, vllm_runner, image_assets, model: str,
+                distributed_executor_backend: str) -> None:
+
+    dtype = "half"
+    max_tokens = 5
+    num_logprobs = 5
+    tensor_parallel_size = 2
+
+    if model.startswith("llava-hf/llava-1.5"):
+        from ..models.test_llava import models, run_test
+    elif model.startswith("llava-hf/llava-v1.6"):
+        from ..models.test_llava_next import models, run_test
+    else:
+        raise NotImplementedError(f"Unsupported model: {model}")

     run_test(
         hf_runner,
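Since model selection now lives in @pytest.mark.parametrize rather than the TEST_DIST_MODEL environment variable, an individual (model, backend) case can be picked with pytest's -k filter. A small usage sketch, run from $VLLM_PATH/tests; the -k expression is illustrative:

```python
import pytest

# Run only the llava-1.5 + multiprocessing-backend case of the merged test.
raise SystemExit(pytest.main([
    "-v", "-s",
    "distributed/test_multimodal_broadcast.py",
    "-k", "llava-1.5 and mp",
]))
```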