# Mirror of https://git.datalinker.icu/vllm-project/vllm.git
# Synced 2026-04-14 05:47:03 +08:00 (199 lines, 9.3 KiB, YAML)
---
# Distributed-test group: runs only after the CI image has been built.
group: Distributed
depends_on:
- image-build
steps:
# Low-level communication primitives (collectives + shared-memory transport), 2 GPUs.
- label: Distributed Comm Ops
  timeout_in_minutes: 20
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py
# General 2-GPU distributed coverage: DP/TP engine tests, collective RPC,
# compile correctness, same-node detection, and shutdown behavior.
- label: Distributed (2 GPUs)
  timeout_in_minutes: 90
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/fullgraph/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/fullgraph/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
# 4-GPU coverage: torchrun launch-mode matrix (TP/PP/DP/EP combinations),
# DP load-balancer variants, and multi-GPU RLHF example scripts.
- label: Distributed Tests (4 GPUs)
  timeout_in_minutes: 50
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/fullgraph/test_basic_correctness.py
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_eagle_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/fullgraph/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - cd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
# 8-GPU H100 smoke test for the torchrun DP example (tp=2, dp=4, EP enabled).
- label: Distributed Tests (8 GPUs)(H100)
  timeout_in_minutes: 10
  gpu: h100
  num_gpus: 8
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - examples/offline_inference/torchrun_dp_example.py
  - vllm/config/parallel.py
  - vllm/distributed/
  - vllm/v1/engine/llm_engine.py
  - vllm/v1/executor/uniproc_executor.py
  - vllm/v1/worker/gpu_worker.py
  commands:
  # https://github.com/NVIDIA/nccl/issues/1838
  - export NCCL_CUMEM_HOST_ENABLE=0
  # test with torchrun tp=2 and dp=4 with ep
  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
# Optional A100 run: custom all-reduce, CA buffer sharing, basic correctness,
# and Mixtral LoRA.
- label: Distributed Tests (4 GPUs)(A100)
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
# Optional H200 run: async TP, sequence/context parallelism, fusion passes,
# DeepEP/DeepGEMM data-parallel example, and DBO.
- label: Distributed Tests (2 GPUs)(H200)
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_async_tp.py
  - pytest -v -s tests/compile/distributed/test_sequence_parallelism.py
  - pytest -v -s tests/compile/distributed/test_fusion_all_reduce.py
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/compile/distributed/test_fusions_e2e.py -k 'not Llama-4'
  - VLLM_TEST_CLEAN_GPU_MEMORY=1 pytest -v -s tests/distributed/test_sequence_parallel.py
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
  - pytest -v -s tests/v1/distributed/test_dbo.py
# Optional B200 run: context parallelism, NCCL symmetric-memory all-reduce, DBO.
- label: Distributed Tests (2 GPUs)(B200)
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
  - pytest -v -s tests/v1/distributed/test_dbo.py
# Two-node (2x2 GPU) run driven by run-multi-node-test.sh; the two quoted
# arguments are the per-node command strings (node-rank 0 and node-rank 1).
- label: 2 Node Test (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
  - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code"
# Prefill/decode disaggregation accuracy sweep via the NixlConnector, 4 GPUs.
- label: Distributed NixlConnector PD accuracy (4 GPUs)
  timeout_in_minutes: 30
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
  - tests/v1/kv_connector/nixl_integration/
  commands:
  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
  - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
# Pipeline- and context-parallelism tests, 4 GPUs.
# (Fixed: label had an unbalanced trailing ")" and the last command carried a
# stray trailing "|" from the scraped source.)
- label: Pipeline + Context Parallelism (4 GPUs)
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py