# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.

# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.

# Documentation
# label(str): the name of the test. emojis allowed.
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
# fast_check_only(bool): run this test on the fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for the test. incompatible with command.
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): simulate a multi-node setup by launching multiple containers on one host;
#   in this case, commands must be specified: the first command runs on the first host, the second
#   command runs on the second host.
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using the $$BUILDKITE_PARALLEL_JOB
#   and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
# working_dir(str): specify the place where the command should execute, defaults to /vllm-workspace/tests.
# source_file_dependencies(list): the list of prefixes to opt the test in for; if empty, the test will always run.

# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
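
# For reference, a new step might combine the options above roughly as in the
# hypothetical (commented-out) sketch below. The label, paths, and shard count
# are made up for illustration only and are not part of this pipeline:
#
# - label: Example Feature Test # 8min
#   timeout_in_minutes: 15
#   mirror_hardwares: [amdexperimental]
#   num_gpus: 2
#   working_dir: "/vllm-workspace/tests"
#   source_file_dependencies:   # opt-in: run only when files under these prefixes change
#   - vllm/example_feature/
#   - tests/example_feature/
#   commands:
#   - pytest -v -s example_feature --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
#   parallelism: 2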

steps:
##### fast check tests #####

- label: Pytorch Nightly Dependency Override Check # 2min
  # if this test fails, it means the nightly torch version is not compatible with some
  # of the dependencies. Please check the error message and add the package to the whitelist
  # in /vllm/tools/generate_nightly_torch_test.py
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh

- label: Async Engine, Inputs, Utils, Worker Test # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_

- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s transformers_utils

- label: Python-only Installation Test # 10min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh

- label: Basic Correctness Test # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py

- label: Entrypoints Unit Tests # 5min
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling

- label: Entrypoints Integration Test (LLM) # 30min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

- label: Entrypoints Integration Test (API Server) # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py # PYTHONPATH is needed to import custom Worker extension
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py

- label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling

- label: Distributed Tests (4 GPUs) # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  - tests/examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd

- label: EPLB Algorithm Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py

- label: EPLB Execution Test # 5min
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py

- label: Metrics, Tracing Test # 12min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  - "pip install \
    'opentelemetry-sdk>=1.26.0' \
    'opentelemetry-api>=1.26.0' \
    'opentelemetry-exporter-otlp>=1.26.0' \
    'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing

##### fast check tests #####
##### 1 GPU test #####

- label: Regression Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: Engine Test # 25min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization

- label: V1 Test e2e + engine # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # TODO: accuracy does not match on H100, whether or not VLLM_USE_FLASHINFER_SAMPLER is set.
  - pytest -v -s v1/e2e
  - pytest -v -s v1/engine

- label: V1 Test entrypoints # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s v1/entrypoints

- label: V1 Test others # 42min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # split the test to avoid interference
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

- label: V1 Test others (CPU) # 5 mins
  source_file_dependencies:
  - vllm/
  - tests/v1
  no_gpu: true
  commands:
  # split the test to avoid interference
  - pytest -v -s v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics

- label: Examples Test # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/chat.py
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_pooling.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048

- label: Platform Tests (CUDA) # 4min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py

- label: Samplers Test # 56min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers

- label: LoRA Test %N # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  - pytest -v -s lora \
    --shard-id=$$BUILDKITE_PARALLEL_JOB \
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
    --ignore=lora/test_chatglm3_tp.py \
    --ignore=lora/test_llama_tp.py \
    --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4

- label: PyTorch Compilation Unit Tests # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_fusion_attn.py
  - pytest -v -s compile/test_functionalization.py
  - pytest -v -s compile/test_silu_mul_quant_fusion.py
  - pytest -v -s compile/test_sequence_parallelism.py
  - pytest -v -s compile/test_async_tp.py
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py

- label: PyTorch Fullgraph Smoke Test # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/

- label: PyTorch Fullgraph Test # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py

- label: Kernels Core Operation Test # 48min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
  - pytest -v -s kernels/core

- label: Kernels Attention Test %N # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Quantization Test %N # 64min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels MoE Test %N # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  commands:
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2

- label: Kernels Mamba Test # 31min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba

- label: Model Executor Test # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py

- label: Benchmarks # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh

- label: Benchmarks CLI Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/

- label: Quantization Test # 70min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly; will move to requirements/test.in
  # after the torchao 0.12 release, and pin a working version of the torchao nightly here,
  # since the torchao nightly is currently only compatible with the torch nightly.
  # per https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now;
  # we can only upgrade after this is resolved
  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/

- label: LM Eval Small Models # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1

- label: OpenAI API correctness # 22min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands:
  # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/

- label: OpenAI-Compatible Tool Use # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  fast_check: false
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s -m 'not cpu_test' tool_use

- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  no_gpu: true
  commands:
  - pytest -v -s -m 'cpu_test' tool_use

##### models test #####

- label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
  # Run a subset of model initialization tests
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset

- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
  # subset of supported models (the complement of the small subset in the above
  # test). Also run if the model initialization test file is modified.
  - pytest -v -s models/test_initialization.py \
    -k 'not test_can_initialize_small_subset' \
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_transformers.py models/test_registry.py

- label: Basic Models Test (Other CPU) # 5min
  timeout_in_minutes: 10
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  no_gpu: true
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py

- label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  # Test standard language models, excluding a subset of slow tests
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'

- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
  # Shard slow subset of standard language models tests. Only run when model
  # source is modified, or when specified test files are modified
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and slow_test' \
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  # Shard hybrid language model tests
  - pytest -v -s models/language/generation \
    -m hybrid_model \
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2

- label: Language Models Test (Extended Generation) # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'

- label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test

- label: Language Models Test (Extended Pooling) # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'

- label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test

- label: Multi-Modal Processor Test # 44min
  timeout_in_minutes: 60
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing

- label: Multi-Modal Models Test (Standard) # 60min
  timeout_in_minutes: 80
  mirror_hardwares: [amdexperimental]
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work

- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing

- label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'

- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental]
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'

- label: Quantized Models Test # 45 min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
  - pytest -v -s models/quantization

# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
  mirror_hardwares: [amdexperimental]
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

- label: Transformers Nightly Models Test
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper

- label: Blackwell Test # 38 min
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
  # Quantization
  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
  # Fusion
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py

- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58

- label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - pytest -s -v tests/quantization/test_blackwell_moe.py

- label: Blackwell LM Eval Small Models
  timeout_in_minutes: 75
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1

##### 1 GPU test #####
##### multi gpus test #####

- label: Distributed Comm Ops Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py

- label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  - tests/examples/offline_inference/data_parallel.py
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code

- label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py

- label: Distributed Model Tests (2 GPUs) # 37min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'

- label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins

- label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py

- label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXIT: find out which code initializes CUDA before running the test;
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py

- label: Weight Loading Multiple GPU Test # 33min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt

- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt

##### multi gpus test #####
##### A100 test #####

- label: Distributed Tests (A100) # optional
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test the llama model here, as the HF implementation seems to be buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4

##### H200 test #####
- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048

##### B200 test #####
- label: Distributed Tests (B200) # optional
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py

##### RL Integration Tests #####
- label: Prime-RL Integration Test # 15min
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh