[ci][distributed] merge distributed test commands (#7097)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
parent: 8c025fa703
commit: 04e5583425
.buildkite/test-pipeline.yaml

@@ -82,20 +82,9 @@ steps:
   num_gpus: 2
   commands:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  - TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
+  - pytest -v -s distributed/test_chunked_prefill_distributed.py
+  - pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

@@ -107,11 +96,6 @@ steps:
   fast_check: true
   commands:
   - pytest -v -s distributed/test_pynccl.py
-  # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
-  # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py

 - label: Pipeline Parallelism Test

@@ -279,9 +263,6 @@ steps:
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-  - VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+  - TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
   - pytest -v -s -x lora/test_mixtral.py
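The merged commands above hinge on a single TARGET_TEST_SUITE environment variable: the pipeline exports it per step (L4 on the 2-GPU step, A100 on the A100 step), and each parametrized test case carries a matching test_suite tag. A minimal, stand-alone sketch of that gating pattern, mirroring the test diff below (an illustration, not the project's code):

```python
import os

import pytest

# The CI command sets TARGET_TEST_SUITE (e.g. "L4" or "A100"); the test
# diff below defaults to "L4" for local runs, and so does this sketch.
TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


@pytest.mark.parametrize("test_suite", ["L4", "A100"])
def test_suite_gate(test_suite: str) -> None:
    # Cases tagged for another suite are skipped instead of failing, so one
    # pytest invocation can carry every hardware variant.
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")
```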
tests/distributed/test_basic_distributed_correctness.py

@@ -1,15 +1,10 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-vLLM will allocate all the available memory, so we need to run the tests one
-by one. The solution is to pass arguments (model name) by environment
-variables.
 Run:
 ```sh
 cd $VLLM_PATH/tests

-TEST_DIST_MODEL=facebook/opt-125m pytest \
-    distributed/test_basic_distributed_correctness.py
-TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    distributed/test_basic_distributed_correctness.py
+pytest distributed/test_basic_distributed_correctness.py
 ```
 """
 import os

@@ -19,27 +14,48 @@ import pytest
 from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal
+from ..utils import fork_new_process_for_each_test

-MODELS = [
-    os.environ["TEST_DIST_MODEL"],
-]
-DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")


 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [5])
+@pytest.mark.parametrize(
+    "model, distributed_executor_backend, attention_backend, test_suite", [
+        ("facebook/opt-125m", "ray", "", "L4"),
+        ("facebook/opt-125m", "mp", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
+        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
+        ("facebook/opt-125m", "ray", "", "A100"),
+        ("facebook/opt-125m", "mp", "", "A100"),
+        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
+        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
+    ])
+@fork_new_process_for_each_test
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
-    dtype: str,
-    max_tokens: int,
+    distributed_executor_backend: str,
+    attention_backend: str,
+    test_suite: str,
 ) -> None:
-    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
+
+    if test_suite != TARGET_TEST_SUITE:
+        pytest.skip(f"Skip test for {test_suite}")
+
+    if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+        # test ray adag
+        os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
+        os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
+
+    if attention_backend:
+        os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
+
+    dtype = "half"
+    max_tokens = 5
+
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
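The new @fork_new_process_for_each_test decorator (imported from the tests' shared utils module) is what lets formerly separate commands share one pytest invocation: each case runs in a fresh process, so CUDA is never initialized in the parent before vLLM starts. A rough sketch of what such a decorator can look like; this is an assumption for illustration, not necessarily vLLM's implementation:

```python
import functools
import os


def fork_new_process_for_each_test(f):
    """Run the wrapped test in a forked child process (POSIX only), so each
    parametrized case starts from a CUDA-uninitialized interpreter."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        pid = os.fork()
        if pid == 0:
            # Child: run the test body and exit immediately, so the pytest
            # machinery in the parent is not duplicated.
            try:
                f(*args, **kwargs)
            except BaseException:
                os._exit(1)
            os._exit(0)
        # Parent: propagate the child's result as an assertion.
        _, status = os.waitpid(pid, 0)
        assert os.WIFEXITED(status) and os.WEXITSTATUS(status) == 0, (
            "forked test process did not exit cleanly")

    return wrapper
```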
tests/distributed/test_chunked_prefill_distributed.py

@@ -1,46 +1,39 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-vLLM will allocate all the available memory, so we need to run the tests one
-by one. The solution is to pass arguments (model name) by environment
-variables.

 Run:
 ```sh
-TEST_DIST_MODEL=facebook/opt-125m pytest \
-    test_chunked_prefill_distributed.py
-TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf \
-    test_chunked_prefill_distributed.py
+pytest test_chunked_prefill_distributed.py
 ```
 """
-import os

 import pytest

 from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal
+from ..utils import fork_new_process_for_each_test

-MODELS = [
-    os.environ["TEST_DIST_MODEL"],
-]
-DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


 @pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [5])
-@pytest.mark.parametrize("chunked_prefill_token_size", [16])
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("facebook/opt-125m", "ray"),
+    ("meta-llama/Llama-2-7b-hf", "ray"),
+    ("facebook/opt-125m", "mp"),
+    ("meta-llama/Llama-2-7b-hf", "mp"),
+])
+@fork_new_process_for_each_test
 def test_models(
     hf_runner,
     vllm_runner,
     example_prompts,
     model: str,
-    dtype: str,
-    max_tokens: int,
-    chunked_prefill_token_size: int,
+    distributed_executor_backend: str,
 ) -> None:
-    distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)
+    dtype = "half"
+    max_tokens = 5
+    chunked_prefill_token_size = 16
+
     # Add a chunked prefill config.
     max_num_seqs = min(chunked_prefill_token_size, 256)
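For context on the fixed values above: chunked_prefill_token_size bounds both the number of scheduled sequences and the per-step token budget. The hunk only shows the max_num_seqs derivation; the sketch below also names the engine keywords such a value typically feeds (enable_chunked_prefill, max_num_batched_tokens, taken from vLLM's engine arguments). That wiring sits outside the visible diff, so treat it as an assumption rather than the test's actual code:

```python
# Derivation of a chunked-prefill test configuration (sketch).
chunked_prefill_token_size = 16

# Shown in the diff: never schedule more sequences than one chunk can hold.
max_num_seqs = min(chunked_prefill_token_size, 256)

# Assumed wiring (outside the visible hunk): the chunk size becomes the
# per-iteration token budget once chunked prefill is enabled.
engine_kwargs = dict(
    enable_chunked_prefill=True,
    max_num_batched_tokens=chunked_prefill_token_size,
    max_num_seqs=max_num_seqs,
)
```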
tests/distributed/test_multimodal_broadcast.py

@@ -1,44 +1,41 @@
 """Compare the outputs of HF and distributed vLLM when using greedy sampling.
-The second test will hang if more than one test is run per command, so we need
-to run the tests one by one. The solution is to pass arguments (model name) by
-environment variables.

 Run:
 ```sh
-TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \
-    test_multimodal_broadcast.py
-TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \
-    test_multimodal_broadcast.py
+pytest -s -v test_multimodal_broadcast.py
 ```
 """
-import os

 import pytest

 from vllm.utils import cuda_device_count_stateless

-model = os.environ["TEST_DIST_MODEL"]
-
-if model.startswith("llava-hf/llava-1.5"):
-    from ..models.test_llava import models, run_test
-elif model.startswith("llava-hf/llava-v1.6"):
-    from ..models.test_llava_next import models, run_test
-else:
-    raise NotImplementedError(f"Unsupported model: {model}")
+from ..utils import fork_new_process_for_each_test


-@pytest.mark.parametrize("tensor_parallel_size", [2])
-@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(hf_runner, vllm_runner, image_assets,
-                tensor_parallel_size: int, dtype: str, max_tokens: int,
-                num_logprobs: int) -> None:
-    if cuda_device_count_stateless() < tensor_parallel_size:
-        pytest.skip(
-            f"Need at least {tensor_parallel_size} GPUs to run the test.")
-
-    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
+                    reason="Need at least 2 GPUs to run the test.")
+@pytest.mark.parametrize("model, distributed_executor_backend", [
+    ("llava-hf/llava-1.5-7b-hf", "ray"),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", "ray"),
+    ("llava-hf/llava-1.5-7b-hf", "mp"),
+    ("llava-hf/llava-v1.6-mistral-7b-hf", "mp"),
+])
+@fork_new_process_for_each_test
+def test_models(hf_runner, vllm_runner, image_assets, model: str,
+                distributed_executor_backend: str) -> None:
+
+    dtype = "half"
+    max_tokens = 5
+    num_logprobs = 5
+    tensor_parallel_size = 2
+
+    if model.startswith("llava-hf/llava-1.5"):
+        from ..models.test_llava import models, run_test
+    elif model.startswith("llava-hf/llava-v1.6"):
+        from ..models.test_llava_next import models, run_test
+    else:
+        raise NotImplementedError(f"Unsupported model: {model}")

     run_test(
         hf_runner,
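Since model selection now lives in @pytest.mark.parametrize rather than the TEST_DIST_MODEL environment variable, an individual (model, backend) case can be picked with pytest's -k filter. A small usage sketch, run from $VLLM_PATH/tests; the -k expression is illustrative:

```python
import pytest

# Run only the llava-1.5 + multiprocessing-backend case of the merged test.
raise SystemExit(pytest.main([
    "-v", "-s",
    "distributed/test_multimodal_broadcast.py",
    "-k", "llava-1.5 and mp",
]))
```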