From a5d29e9ee1155c6686ff1def4657c5a3e2d179eb Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sat, 15 Mar 2025 17:31:21 +0000
Subject: [PATCH] undo massive formatting change

Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
---
 .buildkite/test-pipeline.yaml | 1144 +++++++++++++++++----------------
 1 file changed, 573 insertions(+), 571 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index ac7b0feab7606..6753800f19902 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -28,625 +28,627 @@
 #   Note that all steps execute in parallel.
 
 steps:
-    ##### fast check tests  #####
+##### fast check tests  #####
 
-    - label: Documentation Build # 2min
-      working_dir: "/vllm-workspace/test_docs/docs"
-      fast_check: true
-      no_gpu: True
-      commands:
-          - pip install -r ../../requirements/docs.txt
-          - SPHINXOPTS=\"-W\" make html
-          # Check API reference (if it fails, you may have missing mock imports)
-          - grep \"sig sig-object py\" build/html/api/inference_params.html
+- label: Documentation Build # 2min
+  working_dir: "/vllm-workspace/test_docs/docs"
+  fast_check: true
+  no_gpu: true
+  commands:
+  - pip install -r ../../requirements/docs.txt
+  - SPHINXOPTS=\"-W\" make html
+  # Check API reference (if it fails, you may have missing mock imports)
+  - grep \"sig sig-object py\" build/html/api/inference_params.html
 
-    - label: Async Engine, Inputs, Utils, Worker Test # 24min
-      source_file_dependencies:
-          - vllm/
-          - tests/mq_llm_engine
-          - tests/async_engine
-          - tests/test_inputs
-          - tests/multimodal
-          - tests/test_utils
-          - tests/worker
-          - tests/standalone_tests/lazy_imports.py
-      commands:
-          - python3 standalone_tests/lazy_imports.py
-          - pytest -v -s mq_llm_engine # MQLLMEngine
-          - pytest -v -s async_engine # AsyncLLMEngine
-          - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
-          - pytest -v -s test_inputs.py
-          - pytest -v -s multimodal
-          - pytest -v -s test_utils.py # Utils
-          - pytest -v -s worker # Worker
+- label: Async Engine, Inputs, Utils, Worker Test # 24min
+  source_file_dependencies:
+  - vllm/
+  - tests/mq_llm_engine
+  - tests/async_engine
+  - tests/test_inputs
+  - tests/multimodal
+  - tests/test_utils
+  - tests/worker
+  - tests/standalone_tests/lazy_imports.py
+  commands:
+  - python3 standalone_tests/lazy_imports.py
+  - pytest -v -s mq_llm_engine # MQLLMEngine
+  - pytest -v -s async_engine # AsyncLLMEngine
+  - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
+  - pytest -v -s test_inputs.py
+  - pytest -v -s multimodal
+  - pytest -v -s test_utils.py # Utils
+  - pytest -v -s worker # Worker
 
-    - label: Python-only Installation Test
-      source_file_dependencies:
-          - tests/standalone_tests/python_only_compile.sh
-          - setup.py
-      commands:
-          - bash standalone_tests/python_only_compile.sh
+- label: Python-only Installation Test
+  source_file_dependencies:
+  - tests/standalone_tests/python_only_compile.sh
+  - setup.py
+  commands:
+  - bash standalone_tests/python_only_compile.sh
 
-    - label: Basic Correctness Test # 30min
-      #mirror_hardwares: [amd]
-      fast_check: true
-      source_file_dependencies:
-          - vllm/
-          - tests/basic_correctness/test_basic_correctness
-          - tests/basic_correctness/test_cpu_offload
-          - tests/basic_correctness/test_preemption
-          - tests/basic_correctness/test_cumem.py
-      commands:
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -v -s basic_correctness/test_cumem.py
-          - pytest -v -s basic_correctness/test_basic_correctness.py
-          - pytest -v -s basic_correctness/test_cpu_offload.py
-          - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
+- label: Basic Correctness Test # 30min
+  #mirror_hardwares: [amd]
+  fast_check: true
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_basic_correctness
+  - tests/basic_correctness/test_cpu_offload
+  - tests/basic_correctness/test_preemption
+  - tests/basic_correctness/test_cumem.py
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s basic_correctness/test_cumem.py
+  - pytest -v -s basic_correctness/test_basic_correctness.py
+  - pytest -v -s basic_correctness/test_cpu_offload.py
+  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
-    - label: Chunked Prefill Test
-      source_file_dependencies:
-          - vllm/
-          - tests/basic_correctness/test_chunked_prefill
-      commands:
-          - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
-          - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+- label: Chunked Prefill Test
+  source_file_dependencies:
+  - vllm/
+  - tests/basic_correctness/test_chunked_prefill
+  commands:
+  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
-    - label: Core Test # 10min
-      mirror_hardwares: [amd]
-      fast_check: true
-      source_file_dependencies:
-          - vllm/core
-          - vllm/distributed
-          - tests/core
-      commands:
-          - pytest -v -s core
+- label: Core Test # 10min
+  mirror_hardwares: [amd]
+  fast_check: true
+  source_file_dependencies:
+  - vllm/core
+  - vllm/distributed
+  - tests/core
+  commands:
+  - pytest -v -s core
 
-    - label: Entrypoints Test # 40min
-      working_dir: "/vllm-workspace/tests"
-      fast_check: true
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/entrypoints/llm
-          - tests/entrypoints/openai
-          - tests/entrypoints/test_chat_utils
-          - tests/entrypoints/offline_mode
-      commands:
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-          - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
-          - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-          - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
-          - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-          - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
-          - pytest -v -s entrypoints/test_chat_utils.py
-          - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+- label: Entrypoints Test # 40min
+  working_dir: "/vllm-workspace/tests"
+  fast_check: true
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/entrypoints/llm
+  - tests/entrypoints/openai
+  - tests/entrypoints/test_chat_utils
+  - tests/entrypoints/offline_mode
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
+  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
+  - pytest -v -s entrypoints/test_chat_utils.py
+  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
-    - label: Distributed Tests (4 GPUs) # 10min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 4
-      source_file_dependencies:
-          - vllm/distributed/
-          - vllm/core/
-          - tests/distributed/test_utils
-          - tests/distributed/test_pynccl
-          - tests/spec_decode/e2e/test_integration_dist_tp4
-          - tests/compile/test_basic_correctness
-          - examples/offline_inference/rlhf.py
-          - examples/offline_inference/rlhf_colocate.py
-          - tests/examples/offline_inference/data_parallel.py
-      commands:
-          - python3 ../examples/offline_inference/data_parallel.py
-          - pytest -v -s distributed/test_utils.py
-          - pytest -v -s compile/test_basic_correctness.py
-          - pytest -v -s distributed/test_pynccl.py
-          - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-          # TODO: create a dedicated test section for multi-GPU example tests
-          # when we have multiple distributed example tests
-          - pushd ../examples/offline_inference
-          - python3 rlhf.py
-          - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
-          - popd
+- label: Distributed Tests (4 GPUs) # 10min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/core/
+  - tests/distributed/test_utils
+  - tests/distributed/test_pynccl
+  - tests/spec_decode/e2e/test_integration_dist_tp4
+  - tests/compile/test_basic_correctness
+  - examples/offline_inference/rlhf.py
+  - examples/offline_inference/rlhf_colocate.py
+  - tests/examples/offline_inference/data_parallel.py
+  commands:
+  - python3 ../examples/offline_inference/data_parallel.py
+  - pytest -v -s distributed/test_utils.py
+  - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - pushd ../examples/offline_inference
+  - python3 rlhf.py
+  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - popd
 
-    - label: Metrics, Tracing Test # 10min
-      num_gpus: 2
-      source_file_dependencies:
-          - vllm/
-          - tests/metrics
-          - tests/tracing
-      commands:
-          - pytest -v -s metrics
-          - "pip install \
-            'opentelemetry-sdk>=1.26.0,<1.27.0' \
-            'opentelemetry-api>=1.26.0,<1.27.0' \
-            'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
-            'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
-          - pytest -v -s tracing
+- label: Metrics, Tracing Test # 10min
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/metrics
+  - tests/tracing
+  commands:
+  - pytest -v -s metrics
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0,<1.27.0' \
+      'opentelemetry-api>=1.26.0,<1.27.0' \
+      'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'"
+  - pytest -v -s tracing
 
-    ##### fast check tests  #####
-    #####  1 GPU test  #####
+##### fast check tests  #####
+#####  1 GPU test  #####
 
-    - label: Regression Test # 5min
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/test_regression
-      commands:
-          - pip install modelscope
-          - pytest -v -s test_regression.py
-      working_dir: "/vllm-workspace/tests" # optional
+- label: Regression Test # 5min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/test_regression
+  commands:
+  - pip install modelscope
+  - pytest -v -s test_regression.py
+  working_dir: "/vllm-workspace/tests" # optional
 
-    - label: Engine Test # 10min
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/engine
-          - tests/tokenization
-          - tests/test_sequence
-          - tests/test_config
-          - tests/test_logger
-      commands:
-          - pytest -v -s engine test_sequence.py test_config.py test_logger.py
-          # OOM in the CI unless we run this separately
-          - pytest -v -s tokenization
+- label: Engine Test # 10min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/engine
+  - tests/tokenization
+  - tests/test_sequence
+  - tests/test_config
+  - tests/test_logger
+  commands:
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  # OOM in the CI unless we run this separately
+  - pytest -v -s tokenization
 
-    - label: V1 Test
-      #mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/v1
-      commands:
-          # split the test to avoid interference
-          - pytest -v -s v1/core
-          - pytest -v -s v1/engine
-          - pytest -v -s v1/sample
-          - pytest -v -s v1/worker
-          - pytest -v -s v1/structured_output
-          - pytest -v -s v1/test_stats.py
-          - pytest -v -s v1/test_utils.py
-          - pytest -v -s v1/test_oracle.py
-          # TODO: accuracy does not match, whether setting
-          # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
-          - pytest -v -s v1/e2e
-          # Integration test for streaming correctness (requires special branch).
-          - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
-          - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+- label: V1 Test
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/v1
+  commands:
+  # split the test to avoid interference
+  - pytest -v -s v1/core
+  - pytest -v -s v1/engine
+  - pytest -v -s v1/sample
+  - pytest -v -s v1/worker
+  - pytest -v -s v1/structured_output
+  - pytest -v -s v1/test_stats.py
+  - pytest -v -s v1/test_utils.py
+  - pytest -v -s v1/test_oracle.py
+  # TODO: accuracy does not match, whether setting
+  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+  - pytest -v -s v1/e2e
+  # Integration test for streaming correctness (requires special branch).
+  - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
+  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
-    - label: Examples Test # 25min
-      working_dir: "/vllm-workspace/examples"
-      #mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/entrypoints
-          - examples/
-      commands:
-          - pip install tensorizer # for tensorizer test
-          - python3 offline_inference/basic/generate.py --model facebook/opt-125m
-          - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
-          - python3 offline_inference/basic/chat.py
-          - python3 offline_inference/prefix_caching.py
-          - python3 offline_inference/llm_engine_example.py
-          - python3 offline_inference/vision_language.py
-          - python3 offline_inference/vision_language_multi_image.py
-          - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
-          - python3 offline_inference/encoder_decoder.py
-          - python3 offline_inference/basic/classify.py
-          - python3 offline_inference/basic/embed.py
-          - python3 offline_inference/basic/score.py
-          - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+- label: Examples Test # 25min
+  working_dir: "/vllm-workspace/examples"
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/entrypoints
+  - examples/
+  commands:
+  - pip install tensorizer # for tensorizer test
+  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
+  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
+  - python3 offline_inference/basic/chat.py
+  - python3 offline_inference/prefix_caching.py
+  - python3 offline_inference/llm_engine_example.py
+  - python3 offline_inference/vision_language.py
+  - python3 offline_inference/vision_language_multi_image.py
+  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - python3 offline_inference/encoder_decoder.py
+  - python3 offline_inference/basic/classify.py
+  - python3 offline_inference/basic/embed.py
+  - python3 offline_inference/basic/score.py
+  - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
-    - label: Prefix Caching Test # 9min
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/prefix_caching
-      commands:
-          - pytest -v -s prefix_caching
+- label: Prefix Caching Test # 9min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/prefix_caching
+  commands:
+  - pytest -v -s prefix_caching
 
-    - label: Samplers Test # 36min
-      source_file_dependencies:
-          - vllm/model_executor/layers
-          - vllm/sampling_metadata.py
-          - tests/samplers
-          - tests/conftest.py
-      commands:
-          - pytest -v -s samplers
-          - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
+- label: Samplers Test # 36min
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/sampling_metadata.py
+  - tests/samplers
+  - tests/conftest.py
+  commands:
+  - pytest -v -s samplers
+  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-    - label: LogitsProcessor Test # 5min
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/model_executor/layers
-          - vllm/model_executor/guided_decoding
-          - tests/test_logits_processor
-          - tests/model_executor/test_guided_processors
-      commands:
-          - pytest -v -s test_logits_processor.py
-          - pytest -v -s model_executor/test_guided_processors.py
+- label: LogitsProcessor Test # 5min
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/model_executor/layers
+  - vllm/model_executor/guided_decoding
+  - tests/test_logits_processor
+  - tests/model_executor/test_guided_processors
+  commands:
+  - pytest -v -s test_logits_processor.py
+  - pytest -v -s model_executor/test_guided_processors.py
 
-    - label: Speculative decoding tests # 40min
-      source_file_dependencies:
-          - vllm/spec_decode
-          - tests/spec_decode
-          - vllm/model_executor/models/eagle.py
-      commands:
-          - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
-          - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
-          - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
+- label: Speculative decoding tests # 40min
+  source_file_dependencies:
+  - vllm/spec_decode
+  - tests/spec_decode
+  - vllm/model_executor/models/eagle.py
+  commands:
+  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
+  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+  - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
-    - label: LoRA Test %N # 15min each
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/lora
-          - tests/lora
-      command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
-      parallelism: 4
+- label: LoRA Test %N # 15min each
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py  --ignore=lora/test_transfomers_model.py
+  parallelism: 4
 
-    - label: PyTorch Fullgraph Smoke Test # 9min
-      source_file_dependencies:
-          - vllm/
-          - tests/compile
-      commands:
-          - pytest -v -s compile/test_basic_correctness.py
-          # these tests need to be separated, cannot combine
-          - pytest -v -s compile/piecewise/test_simple.py
-          - pytest -v -s compile/piecewise/test_toy_llama.py
+- label: PyTorch Fullgraph Smoke Test # 9min
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_basic_correctness.py
+  # these tests need to be separated, cannot combine
+  - pytest -v -s compile/piecewise/test_simple.py
+  - pytest -v -s compile/piecewise/test_toy_llama.py
 
-    - label: PyTorch Fullgraph Test # 18min
-      source_file_dependencies:
-          - vllm/
-          - tests/compile
-      commands:
-          - pytest -v -s compile/test_full_graph.py
+- label: PyTorch Fullgraph Test # 18min
+  source_file_dependencies:
+  - vllm/
+  - tests/compile
+  commands:
+  - pytest -v -s compile/test_full_graph.py
 
-    - label: Kernels Test %N # 1h each
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - csrc/
-          - vllm/attention
-          - tests/kernels
-      commands:
-          - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
-      parallelism: 4
+- label: Kernels Test %N # 1h each
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - csrc/
+  - vllm/attention
+  - tests/kernels
+  commands:
+  - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+  parallelism: 4
 
-    - label: Tensorizer Test # 11min
-      mirror_hardwares: [amd]
-      soft_fail: true
-      source_file_dependencies:
-          - vllm/model_executor/model_loader
-          - tests/tensorizer_loader
-      commands:
-          - apt-get update && apt-get install -y curl libsodium23
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - pytest -v -s tensorizer_loader
+- label: Tensorizer Test # 11min
+  mirror_hardwares: [amd]
+  soft_fail: true
+  source_file_dependencies:
+  - vllm/model_executor/model_loader
+  - tests/tensorizer_loader
+  commands:
+  - apt-get update && apt-get install -y curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s tensorizer_loader
 
-    - label: Benchmarks # 9min
-      working_dir: "/vllm-workspace/.buildkite"
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - benchmarks/
-      commands:
-          - bash run-benchmarks.sh
+- label: Benchmarks # 9min
+  working_dir: "/vllm-workspace/.buildkite"
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - benchmarks/
+  commands:
+  - bash run-benchmarks.sh
 
-    - label: Quantization Test # 33min
-      source_file_dependencies:
-          - csrc/
-          - vllm/model_executor/layers/quantization
-          - tests/quantization
-      command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+- label: Quantization Test # 33min
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  - tests/quantization
+  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
-    - label: LM Eval Small Models # 53min
-      working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-      source_file_dependencies:
-          - csrc/
-          - vllm/model_executor/layers/quantization
-      commands:
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - bash ./run-tests.sh -c configs/models-small.txt -t 1
+- label: LM Eval Small Models # 53min
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-small.txt -t 1
 
-    - label: OpenAI API correctness
-      source_file_dependencies:
-          - csrc/
-          - vllm/entrypoints/openai/
-          - vllm/model_executor/models/whisper.py
-      commands: # LMEval+Transcription WER check
-          - pytest -s entrypoints/openai/correctness/
+- label: OpenAI API correctness
+  source_file_dependencies:
+  - csrc/
+  - vllm/entrypoints/openai/
+  - vllm/model_executor/models/whisper.py
+  commands: # LMEval+Transcription WER check
+  - pytest -s entrypoints/openai/correctness/
 
-    - label: Encoder Decoder tests # 5min
-      source_file_dependencies:
-          - vllm/
-          - tests/encoder_decoder
-      commands:
-          - pytest -v -s encoder_decoder
+- label: Encoder Decoder tests # 5min
+  source_file_dependencies:
+  - vllm/
+  - tests/encoder_decoder
+  commands:
+  - pytest -v -s encoder_decoder
 
-    - label: OpenAI-Compatible Tool Use # 20 min
-      fast_check: false
-      mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/tool_use
-      commands:
-          - pytest -v -s tool_use
+- label: OpenAI-Compatible Tool Use # 20 min
+  fast_check: false
+  mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/tool_use
+  commands:
+  - pytest -v -s tool_use
 
-    #####  models test  #####
+#####  models test  #####
 
-    - label: Basic Models Test # 24min
-      source_file_dependencies:
-          - vllm/
-          - tests/models
-      commands:
-          - pytest -v -s models/test_transformers.py
-          - pytest -v -s models/test_registry.py
-          # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-          - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
+- label: Basic Models Test # 24min
+  source_file_dependencies:
+  - vllm/
+  - tests/models
+  commands:
+  - pytest -v -s models/test_transformers.py
+  - pytest -v -s models/test_registry.py
+  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
 
-    - label: Language Models Test (Standard) # 32min
-      #mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/language
-          - tests/models/embedding/language
-          - tests/models/encoder_decoder/language
-      commands:
-          - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-          - pytest -v -s models/embedding/language -m core_model
+- label: Language Models Test (Standard) # 32min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/language -m core_model
 
-    - label: SSM and Hybrid Models Test # X min
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/language/test_hybrid.py
-          - tests/models/decoder_only/language/test_mamba.py
-      commands:
-          - pytest -v -s models/decoder_only/language/test_hybrid.py
-          - pytest -v -s models/decoder_only/language/test_mamba.py
+- label: Language Models Test (Extended) # 1h10min
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/language
+  - tests/models/embedding/language
+  - tests/models/encoder_decoder/language
+  commands:
+  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/language -m 'not core_model'
 
-    - label: Language Models Test (Extended) # 1h10min
-      optional: true
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/language
-          - tests/models/embedding/language
-          - tests/models/encoder_decoder/language
-      commands:
-          - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-          - pytest -v -s models/embedding/language -m 'not core_model'
+- label: Multi-Modal Models Test (Standard) # 40min
+  #mirror_hardwares: [amd]
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/audio_language
+  - tests/models/encoder_decoder/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal
+  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
+  - pytest -v -s models/embedding/vision_language -m core_model
+  - pytest -v -s models/encoder_decoder/audio_language -m core_model
+  - pytest -v -s models/encoder_decoder/language -m core_model
+  - pytest -v -s models/encoder_decoder/vision_language -m core_model
 
-    - label: Multi-Modal Models Test (Standard) # 40min
-      #mirror_hardwares: [amd]
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/audio_language
-          - tests/models/decoder_only/vision_language
-          - tests/models/embedding/vision_language
-          - tests/models/encoder_decoder/audio_language
-          - tests/models/encoder_decoder/vision_language
-      commands:
-          - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-          - pytest -v -s models/multimodal
-          - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-          - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
-          - pytest -v -s models/embedding/vision_language -m core_model
-          - pytest -v -s models/encoder_decoder/audio_language -m core_model
-          - pytest -v -s models/encoder_decoder/language -m core_model
-          - pytest -v -s models/encoder_decoder/vision_language -m core_model
+- label: Multi-Modal Models Test (Extended) 1 # 48m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/audio_language
+  - tests/models/decoder_only/vision_language
+  - tests/models/embedding/vision_language
+  - tests/models/encoder_decoder/vision_language
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
+  # HACK - run phi3v tests separately to sidestep this transformers bug
+  # https://github.com/huggingface/transformers/issues/34307
+  - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
+  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
+  - pytest -v -s models/embedding/vision_language -m 'not core_model'
+  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
+  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
 
-    - label: Multi-Modal Models Test (Extended) 1 # 48m
-      optional: true
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/audio_language
-          - tests/models/decoder_only/vision_language
-          - tests/models/embedding/vision_language
-          - tests/models/encoder_decoder/vision_language
-      commands:
-          - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-          - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-          - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-          # HACK - run phi3v tests separately to sidestep this transformers bug
-          # https://github.com/huggingface/transformers/issues/34307
-          - pytest -v -s models/decoder_only/vision_language/test_phi3v.py
-          - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-          - pytest -v -s models/embedding/vision_language -m 'not core_model'
-          - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-          - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+- label: Multi-Modal Models Test (Extended) 2 # 38m
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/decoder_only/vision_language
+  commands:
+    - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+    - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
 
-    - label: Multi-Modal Models Test (Extended) 2 # 38m
-      optional: true
-      source_file_dependencies:
-          - vllm/
-          - tests/models/decoder_only/vision_language
-      commands:
-          - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-          - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+- label: SSM and Hybrid Models Test # X min
+  source_file_dependencies:
+      - vllm/
+      - tests/models/decoder_only/language/test_hybrid.py
+      - tests/models/decoder_only/language/test_mamba.py
+  commands:
+      - pytest -v -s models/decoder_only/language/test_hybrid.py
+      - pytest -v -s models/decoder_only/language/test_mamba.py
 
-    # This test is used only in PR development phase to test individual models and should never run on main
-    - label: Custom Models Test
-      optional: true
-      commands:
-          - echo 'Testing custom models...'
-          # PR authors can temporarily add commands below to test individual models
-          # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-          # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
+# This test is used only in PR development phase to test individual models and should never run on main
+- label: Custom Models Test
+  optional: true
+  commands:
+    - echo 'Testing custom models...'
+    # PR authors can temporarily add commands below to test individual models
+    # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
+    # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
 
-    #####  1 GPU test  #####
-    #####  multi gpus test  #####
+#####  1 GPU test  #####
+#####  multi gpus test  #####
 
-    - label: Distributed Comm Ops Test # 7min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      source_file_dependencies:
-          - vllm/distributed
-          - tests/distributed
-      commands:
-          - pytest -v -s distributed/test_comm_ops.py
-          - pytest -v -s distributed/test_shm_broadcast.py
+- label: Distributed Comm Ops Test # 7min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed
+  - tests/distributed
+  commands:
+  - pytest -v -s distributed/test_comm_ops.py
+  - pytest -v -s distributed/test_shm_broadcast.py
 
-    - label: 2 Node Tests (4 GPUs in total) # 16min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      num_nodes: 2
-      source_file_dependencies:
-          - vllm/distributed/
-          - vllm/engine/
-          - vllm/executor/
-          - vllm/model_executor/models/
-          - tests/distributed/
-      commands:
-          - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-            - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
-            - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
-            - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
-          - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-            - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+- label: 2 Node Tests (4 GPUs in total) # 16min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  num_nodes: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
+    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
+  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
+    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
 
-    - label: Distributed Tests (2 GPUs) # 40min
-      #mirror_hardwares: [amd]
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      source_file_dependencies:
-          - vllm/distributed/
-          - vllm/engine/
-          - vllm/executor/
-          - vllm/model_executor/models/
-          - tests/distributed/
-          - vllm/compilation
-          - vllm/worker/worker_base.py
-          - vllm/worker/worker.py
-          - vllm/worker/model_runner.py
-          - entrypoints/llm/test_collective_rpc.py
-      commands:
-          - pytest -v -s entrypoints/llm/test_collective_rpc.py
-          - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
-          - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
-          - pytest -v -s ./compile/test_basic_correctness.py
-          - pytest -v -s ./compile/test_wrapper.py
-          - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
-          - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-          # Avoid importing model tests that cause CUDA reinitialization error
-          - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-          - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-          - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-          - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
-          # this test fails consistently.
-          # TODO: investigate and fix
-          # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-          - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-          - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+- label: Distributed Tests (2 GPUs) # 40min
+  #mirror_hardwares: [amd]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  - vllm/compilation
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
+  commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
+  - pytest -v -s ./compile/test_basic_correctness.py
+  - pytest -v -s ./compile/test_wrapper.py
+  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  # Avoid importing model tests that cause CUDA reinitialization error
+  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  # this test fails consistently.
+  # TODO: investigate and fix
+  # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
 
-    - label: Plugin Tests (2 GPUs) # 40min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      source_file_dependencies:
-          - vllm/plugins/
-          - tests/plugins/
-      commands:
-          # begin platform plugin tests, all the code in-between runs on dummy platform
-          - pip install -e ./plugins/vllm_add_dummy_platform
-          - pytest -v -s plugins_tests/test_platform_plugins.py
-          - pip uninstall vllm_add_dummy_platform -y
-          # end platform plugin tests
-          # other tests continue here:
-          - pytest -v -s plugins_tests/test_scheduler_plugins.py
-          - pip install -e ./plugins/vllm_add_dummy_model
-          - pytest -v -s distributed/test_distributed_oot.py
-          - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
-          - pytest -v -s models/test_oot_registration.py # it needs a clean process
+- label: Plugin Tests (2 GPUs) # 40min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/plugins/
+  - tests/plugins/
+  commands:
+  # begin platform plugin tests, all the code in-between runs on dummy platform
+  - pip install -e ./plugins/vllm_add_dummy_platform
+  - pytest -v -s plugins_tests/test_platform_plugins.py
+  - pip uninstall vllm_add_dummy_platform -y
+  # end platform plugin tests
+  # other tests continue here:
+  - pytest -v -s plugins_tests/test_scheduler_plugins.py
+  - pip install -e ./plugins/vllm_add_dummy_model
+  - pytest -v -s distributed/test_distributed_oot.py
+  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
+  - pytest -v -s models/test_oot_registration.py # it needs a clean process
 
-    - label: Multi-step Tests (4 GPUs) # 36min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 4
-      source_file_dependencies:
-          - vllm/model_executor/layers/sampler.py
-          - vllm/sequence.py
-          - vllm/worker/worker_base.py
-          - vllm/worker/worker.py
-          - vllm/worker/multi_step_worker.py
-          - vllm/worker/model_runner_base.py
-          - vllm/worker/model_runner.py
-          - vllm/worker/multi_step_model_runner.py
-          - vllm/engine
-          - tests/multi_step
-      commands:
-          # this test is quite flaky
-          # TODO: investigate and fix.
-          # - pytest -v -s multi_step/test_correctness_async_llm.py
-          - pytest -v -s multi_step/test_correctness_llm.py
+- label: Multi-step Tests (4 GPUs) # 36min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/model_executor/layers/sampler.py
+  - vllm/sequence.py
+  - vllm/worker/worker_base.py
+  - vllm/worker/worker.py
+  - vllm/worker/multi_step_worker.py
+  - vllm/worker/model_runner_base.py
+  - vllm/worker/model_runner.py
+  - vllm/worker/multi_step_model_runner.py
+  - vllm/engine
+  - tests/multi_step
+  commands:
+  # this test is quite flaky
+  # TODO: investigate and fix.
+  # - pytest -v -s multi_step/test_correctness_async_llm.py
+  - pytest -v -s multi_step/test_correctness_llm.py
 
-    - label: Pipeline Parallelism Test # 45min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 4
-      source_file_dependencies:
-          - vllm/distributed/
-          - vllm/engine/
-          - vllm/executor/
-          - vllm/model_executor/models/
-          - tests/distributed/
-      commands:
-          - pytest -v -s distributed/test_pp_cudagraph.py
-          - pytest -v -s distributed/test_pipeline_parallel.py
+- label: Pipeline Parallelism Test # 45min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/model_executor/models/
+  - tests/distributed/
+  commands:
+  - pytest -v -s distributed/test_pp_cudagraph.py
+  - pytest -v -s distributed/test_pipeline_parallel.py
 
-    - label: LoRA TP Test (Distributed)
-      num_gpus: 4
-      source_file_dependencies:
-          - vllm/lora
-          - tests/lora
-      commands:
-          # FIXIT: find out which code initialize cuda before running the test
-          # before the fix, we need to use spawn to test it
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          # This test runs llama 13B, so it is required to run on 4 GPUs.
-          - pytest -v -s -x lora/test_long_context.py
-          # There is some Tensor Parallelism related processing logic in LoRA that
-          # requires multi-GPU testing for validation.
-          - pytest -v -s -x lora/test_chatglm3_tp.py
-          - pytest -v -s -x lora/test_llama_tp.py
-          - pytest -v -s -x lora/test_minicpmv_tp.py
-          - pytest -v -s -x lora/test_transfomers_model.py
+- label: LoRA TP Test (Distributed)
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/lora
+  - tests/lora
+  commands:
+    # FIXME: find out which code initializes CUDA before the test runs.
+    # Until that is fixed, we need to use spawn to test it.
+    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+    # This test runs llama 13B, so it is required to run on 4 GPUs.
+    - pytest -v -s -x lora/test_long_context.py
+    # There is some Tensor Parallelism related processing logic in LoRA that
+    # requires multi-GPU testing for validation.
+    - pytest -v -s -x lora/test_chatglm3_tp.py
+    - pytest -v -s -x lora/test_llama_tp.py
+    - pytest -v -s -x lora/test_minicpmv_tp.py
+    - pytest -v -s -x lora/test_transfomers_model.py
 
-    - label: Weight Loading Multiple GPU Test # 33min
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      source_file_dependencies:
-          - vllm/
-          - tests/weight_loading
-      commands:
-          - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 
-    - label: Weight Loading Multiple GPU Test - Large Models # optional
-      working_dir: "/vllm-workspace/tests"
-      num_gpus: 2
-      gpu: a100
-      optional: true
-      source_file_dependencies:
-          - vllm/
-          - tests/weight_loading
-      commands:
-          - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
+- label: Weight Loading Multiple GPU Test  # 33min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 
-    ##### multi gpus test #####
-    ##### A100 test #####
+- label: Weight Loading Multiple GPU Test - Large Models # optional
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  gpu: a100
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/weight_loading
+  commands:
+    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 
-    - label: Distributed Tests (A100) # optional
-      gpu: a100
-      optional: true
-      num_gpus: 4
-      source_file_dependencies:
-          - vllm/
-      commands:
-          # NOTE: don't test llama model here, it seems hf implementation is buggy
-          # see https://github.com/vllm-project/vllm/pull/5689 for details
-          - pytest -v -s distributed/test_custom_all_reduce.py
-          - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
-          - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
-          - pytest -v -s -x lora/test_mixtral.py
 
-    - label: LM Eval Large Models # optional
-      gpu: a100
-      optional: true
-      num_gpus: 4
-      working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
-      source_file_dependencies:
-          - csrc/
-          - vllm/model_executor/layers/quantization
-      commands:
-          - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-          - bash ./run-tests.sh -c configs/models-large.txt -t 4
+##### multi gpus test #####
+##### A100 test #####
+
+- label: Distributed Tests (A100) # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/
+  commands:
+  # NOTE: don't test the llama model here; the HF implementation seems to be buggy
+  # see https://github.com/vllm-project/vllm/pull/5689 for details
+  - pytest -v -s distributed/test_custom_all_reduce.py
+  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
+  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - pytest -v -s -x lora/test_mixtral.py
+
+- label: LM Eval Large Models # optional
+  gpu: a100
+  optional: true
+  num_gpus: 4
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - bash ./run-tests.sh -c configs/models-large.txt -t 4