From a5d29e9ee1155c6686ff1def4657c5a3e2d179eb Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Sat, 15 Mar 2025 17:31:21 +0000 Subject: [PATCH] undo massive formatting change Signed-off-by: Tyler Michael Smith --- .buildkite/test-pipeline.yaml | 1144 +++++++++++++++++---------------- 1 file changed, 573 insertions(+), 571 deletions(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index ac7b0feab7606..6753800f19902 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -28,625 +28,627 @@ # Note that all steps execute in parallel. steps: - ##### fast check tests ##### +##### fast check tests ##### - - label: Documentation Build # 2min - working_dir: "/vllm-workspace/test_docs/docs" - fast_check: true - no_gpu: True - commands: - - pip install -r ../../requirements/docs.txt - - SPHINXOPTS=\"-W\" make html - # Check API reference (if it fails, you may have missing mock imports) - - grep \"sig sig-object py\" build/html/api/inference_params.html +- label: Documentation Build # 2min + working_dir: "/vllm-workspace/test_docs/docs" + fast_check: true + no_gpu: True + commands: + - pip install -r ../../requirements/docs.txt + - SPHINXOPTS=\"-W\" make html + # Check API reference (if it fails, you may have missing mock imports) + - grep \"sig sig-object py\" build/html/api/inference_params.html - - label: Async Engine, Inputs, Utils, Worker Test # 24min - source_file_dependencies: - - vllm/ - - tests/mq_llm_engine - - tests/async_engine - - tests/test_inputs - - tests/multimodal - - tests/test_utils - - tests/worker - - tests/standalone_tests/lazy_imports.py - commands: - - python3 standalone_tests/lazy_imports.py - - pytest -v -s mq_llm_engine # MQLLMEngine - - pytest -v -s async_engine # AsyncLLMEngine - - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py - - pytest -v -s test_inputs.py - - pytest -v -s multimodal - - pytest -v -s test_utils.py # Utils - - pytest -v -s worker # Worker +- label: Async Engine, Inputs, Utils, Worker Test # 24min + source_file_dependencies: + - vllm/ + - tests/mq_llm_engine + - tests/async_engine + - tests/test_inputs + - tests/multimodal + - tests/test_utils + - tests/worker + - tests/standalone_tests/lazy_imports.py + commands: + - python3 standalone_tests/lazy_imports.py + - pytest -v -s mq_llm_engine # MQLLMEngine + - pytest -v -s async_engine # AsyncLLMEngine + - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py + - pytest -v -s test_inputs.py + - pytest -v -s multimodal + - pytest -v -s test_utils.py # Utils + - pytest -v -s worker # Worker - - label: Python-only Installation Test - source_file_dependencies: - - tests/standalone_tests/python_only_compile.sh - - setup.py - commands: - - bash standalone_tests/python_only_compile.sh +- label: Python-only Installation Test + source_file_dependencies: + - tests/standalone_tests/python_only_compile.sh + - setup.py + commands: + - bash standalone_tests/python_only_compile.sh - - label: Basic Correctness Test # 30min - #mirror_hardwares: [amd] - fast_check: true - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_basic_correctness - - tests/basic_correctness/test_cpu_offload - - tests/basic_correctness/test_preemption - - tests/basic_correctness/test_cumem.py - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s basic_correctness/test_cumem.py - - pytest -v -s basic_correctness/test_basic_correctness.py - - pytest -v -s basic_correctness/test_cpu_offload.py - - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py +- label: Basic Correctness Test # 30min + #mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_basic_correctness + - tests/basic_correctness/test_cpu_offload + - tests/basic_correctness/test_preemption + - tests/basic_correctness/test_cumem.py + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s basic_correctness/test_cumem.py + - pytest -v -s basic_correctness/test_basic_correctness.py + - pytest -v -s basic_correctness/test_cpu_offload.py + - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py - - label: Chunked Prefill Test - source_file_dependencies: - - vllm/ - - tests/basic_correctness/test_chunked_prefill - commands: - - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py +- label: Chunked Prefill Test + source_file_dependencies: + - vllm/ + - tests/basic_correctness/test_chunked_prefill + commands: + - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py - - label: Core Test # 10min - mirror_hardwares: [amd] - fast_check: true - source_file_dependencies: - - vllm/core - - vllm/distributed - - tests/core - commands: - - pytest -v -s core +- label: Core Test # 10min + mirror_hardwares: [amd] + fast_check: true + source_file_dependencies: + - vllm/core + - vllm/distributed + - tests/core + commands: + - pytest -v -s core - - label: Entrypoints Test # 40min - working_dir: "/vllm-workspace/tests" - fast_check: true - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/entrypoints/llm - - tests/entrypoints/openai - - tests/entrypoints/test_chat_utils - - tests/entrypoints/offline_mode - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py - - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process - - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process - - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process - - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ - - pytest -v -s entrypoints/test_chat_utils.py - - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests +- label: Entrypoints Test # 40min + working_dir: "/vllm-workspace/tests" + fast_check: true + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/entrypoints/llm + - tests/entrypoints/openai + - tests/entrypoints/test_chat_utils + - tests/entrypoints/offline_mode + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py + - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process + - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process + - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process + - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/ + - pytest -v -s entrypoints/test_chat_utils.py + - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests - - label: Distributed Tests (4 GPUs) # 10min - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/core/ - - tests/distributed/test_utils - - tests/distributed/test_pynccl - - tests/spec_decode/e2e/test_integration_dist_tp4 - - tests/compile/test_basic_correctness - - examples/offline_inference/rlhf.py - - examples/offline_inference/rlhf_colocate.py - - tests/examples/offline_inference/data_parallel.py - commands: - - python3 ../examples/offline_inference/data_parallel.py - - pytest -v -s distributed/test_utils.py - - pytest -v -s compile/test_basic_correctness.py - - pytest -v -s distributed/test_pynccl.py - - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py - # TODO: create a dedicated test section for multi-GPU example tests - # when we have multiple distributed example tests - - pushd ../examples/offline_inference - - python3 rlhf.py - - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py - - popd +- label: Distributed Tests (4 GPUs) # 10min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/core/ + - tests/distributed/test_utils + - tests/distributed/test_pynccl + - tests/spec_decode/e2e/test_integration_dist_tp4 + - tests/compile/test_basic_correctness + - examples/offline_inference/rlhf.py + - examples/offline_inference/rlhf_colocate.py + - tests/examples/offline_inference/data_parallel.py + commands: + - python3 ../examples/offline_inference/data_parallel.py + - pytest -v -s distributed/test_utils.py + - pytest -v -s compile/test_basic_correctness.py + - pytest -v -s distributed/test_pynccl.py + - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py + # TODO: create a dedicated test section for multi-GPU example tests + # when we have multiple distributed example tests + - pushd ../examples/offline_inference + - python3 rlhf.py + - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py + - popd - - label: Metrics, Tracing Test # 10min - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/metrics - - tests/tracing - commands: - - pytest -v -s metrics - - "pip install \ - 'opentelemetry-sdk>=1.26.0,<1.27.0' \ - 'opentelemetry-api>=1.26.0,<1.27.0' \ - 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ - 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" - - pytest -v -s tracing +- label: Metrics, Tracing Test # 10min + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/metrics + - tests/tracing + commands: + - pytest -v -s metrics + - "pip install \ + 'opentelemetry-sdk>=1.26.0,<1.27.0' \ + 'opentelemetry-api>=1.26.0,<1.27.0' \ + 'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \ + 'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'" + - pytest -v -s tracing - ##### fast check tests ##### - ##### 1 GPU test ##### +##### fast check tests ##### +##### 1 GPU test ##### - - label: Regression Test # 5min - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/test_regression - commands: - - pip install modelscope - - pytest -v -s test_regression.py - working_dir: "/vllm-workspace/tests" # optional +- label: Regression Test # 5min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/test_regression + commands: + - pip install modelscope + - pytest -v -s test_regression.py + working_dir: "/vllm-workspace/tests" # optional - - label: Engine Test # 10min - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/engine - - tests/tokenization - - tests/test_sequence - - tests/test_config - - tests/test_logger - commands: - - pytest -v -s engine test_sequence.py test_config.py test_logger.py - # OOM in the CI unless we run this separately - - pytest -v -s tokenization +- label: Engine Test # 10min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/engine + - tests/tokenization + - tests/test_sequence + - tests/test_config + - tests/test_logger + commands: + - pytest -v -s engine test_sequence.py test_config.py test_logger.py + # OOM in the CI unless we run this separately + - pytest -v -s tokenization - - label: V1 Test - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/v1 - commands: - # split the test to avoid interference - - pytest -v -s v1/core - - pytest -v -s v1/engine - - pytest -v -s v1/sample - - pytest -v -s v1/worker - - pytest -v -s v1/structured_output - - pytest -v -s v1/test_stats.py - - pytest -v -s v1/test_utils.py - - pytest -v -s v1/test_oracle.py - # TODO: accuracy does not match, whether setting - # VLLM_USE_FLASHINFER_SAMPLER or not on H100. - - pytest -v -s v1/e2e - # Integration test for streaming correctness (requires special branch). - - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api - - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine +- label: V1 Test + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/v1 + commands: + # split the test to avoid interference + - pytest -v -s v1/core + - pytest -v -s v1/engine + - pytest -v -s v1/sample + - pytest -v -s v1/worker + - pytest -v -s v1/structured_output + - pytest -v -s v1/test_stats.py + - pytest -v -s v1/test_utils.py + - pytest -v -s v1/test_oracle.py + # TODO: accuracy does not match, whether setting + # VLLM_USE_FLASHINFER_SAMPLER or not on H100. + - pytest -v -s v1/e2e + # Integration test for streaming correctness (requires special branch). + - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api + - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine - - label: Examples Test # 25min - working_dir: "/vllm-workspace/examples" - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/entrypoints - - examples/ - commands: - - pip install tensorizer # for tensorizer test - - python3 offline_inference/basic/generate.py --model facebook/opt-125m - - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 - - python3 offline_inference/basic/chat.py - - python3 offline_inference/prefix_caching.py - - python3 offline_inference/llm_engine_example.py - - python3 offline_inference/vision_language.py - - python3 offline_inference/vision_language_multi_image.py - - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - - python3 offline_inference/encoder_decoder.py - - python3 offline_inference/basic/classify.py - - python3 offline_inference/basic/embed.py - - python3 offline_inference/basic/score.py - - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 +- label: Examples Test # 25min + working_dir: "/vllm-workspace/examples" + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/entrypoints + - examples/ + commands: + - pip install tensorizer # for tensorizer test + - python3 offline_inference/basic/generate.py --model facebook/opt-125m + - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10 + - python3 offline_inference/basic/chat.py + - python3 offline_inference/prefix_caching.py + - python3 offline_inference/llm_engine_example.py + - python3 offline_inference/vision_language.py + - python3 offline_inference/vision_language_multi_image.py + - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors + - python3 offline_inference/encoder_decoder.py + - python3 offline_inference/basic/classify.py + - python3 offline_inference/basic/embed.py + - python3 offline_inference/basic/score.py + - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2 - - label: Prefix Caching Test # 9min - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/prefix_caching - commands: - - pytest -v -s prefix_caching +- label: Prefix Caching Test # 9min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/prefix_caching + commands: + - pytest -v -s prefix_caching - - label: Samplers Test # 36min - source_file_dependencies: - - vllm/model_executor/layers - - vllm/sampling_metadata.py - - tests/samplers - - tests/conftest.py - commands: - - pytest -v -s samplers - - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers +- label: Samplers Test # 36min + source_file_dependencies: + - vllm/model_executor/layers + - vllm/sampling_metadata.py + - tests/samplers + - tests/conftest.py + commands: + - pytest -v -s samplers + - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers - - label: LogitsProcessor Test # 5min - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/model_executor/layers - - vllm/model_executor/guided_decoding - - tests/test_logits_processor - - tests/model_executor/test_guided_processors - commands: - - pytest -v -s test_logits_processor.py - - pytest -v -s model_executor/test_guided_processors.py +- label: LogitsProcessor Test # 5min + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/model_executor/layers + - vllm/model_executor/guided_decoding + - tests/test_logits_processor + - tests/model_executor/test_guided_processors + commands: + - pytest -v -s test_logits_processor.py + - pytest -v -s model_executor/test_guided_processors.py - - label: Speculative decoding tests # 40min - source_file_dependencies: - - vllm/spec_decode - - tests/spec_decode - - vllm/model_executor/models/eagle.py - commands: - - pytest -v -s spec_decode/e2e/test_multistep_correctness.py - - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py - - pytest -v -s spec_decode/e2e/test_eagle_correctness.py +- label: Speculative decoding tests # 40min + source_file_dependencies: + - vllm/spec_decode + - tests/spec_decode + - vllm/model_executor/models/eagle.py + commands: + - pytest -v -s spec_decode/e2e/test_multistep_correctness.py + - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py + - pytest -v -s spec_decode/e2e/test_eagle_correctness.py - - label: LoRA Test %N # 15min each - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/lora - - tests/lora - command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py - parallelism: 4 +- label: LoRA Test %N # 15min each + mirror_hardwares: [amd] + source_file_dependencies: + - vllm/lora + - tests/lora + command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py + parallelism: 4 - - label: PyTorch Fullgraph Smoke Test # 9min - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/test_basic_correctness.py - # these tests need to be separated, cannot combine - - pytest -v -s compile/piecewise/test_simple.py - - pytest -v -s compile/piecewise/test_toy_llama.py +- label: PyTorch Fullgraph Smoke Test # 9min + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_basic_correctness.py + # these tests need to be separated, cannot combine + - pytest -v -s compile/piecewise/test_simple.py + - pytest -v -s compile/piecewise/test_toy_llama.py - - label: PyTorch Fullgraph Test # 18min - source_file_dependencies: - - vllm/ - - tests/compile - commands: - - pytest -v -s compile/test_full_graph.py +- label: PyTorch Fullgraph Test # 18min + source_file_dependencies: + - vllm/ + - tests/compile + commands: + - pytest -v -s compile/test_full_graph.py - - label: Kernels Test %N # 1h each - mirror_hardwares: [amd] - source_file_dependencies: - - csrc/ - - vllm/attention - - tests/kernels - commands: - - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT - parallelism: 4 +- label: Kernels Test %N # 1h each + mirror_hardwares: [amd] + source_file_dependencies: + - csrc/ + - vllm/attention + - tests/kernels + commands: + - pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT + parallelism: 4 - - label: Tensorizer Test # 11min - mirror_hardwares: [amd] - soft_fail: true - source_file_dependencies: - - vllm/model_executor/model_loader - - tests/tensorizer_loader - commands: - - apt-get update && apt-get install -y curl libsodium23 - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - pytest -v -s tensorizer_loader +- label: Tensorizer Test # 11min + mirror_hardwares: [amd] + soft_fail: true + source_file_dependencies: + - vllm/model_executor/model_loader + - tests/tensorizer_loader + commands: + - apt-get update && apt-get install -y curl libsodium23 + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - pytest -v -s tensorizer_loader - - label: Benchmarks # 9min - working_dir: "/vllm-workspace/.buildkite" - mirror_hardwares: [amd] - source_file_dependencies: - - benchmarks/ - commands: - - bash run-benchmarks.sh +- label: Benchmarks # 9min + working_dir: "/vllm-workspace/.buildkite" + mirror_hardwares: [amd] + source_file_dependencies: + - benchmarks/ + commands: + - bash run-benchmarks.sh - - label: Quantization Test # 33min - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - - tests/quantization - command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization +- label: Quantization Test # 33min + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + - tests/quantization + command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization - - label: LM Eval Small Models # 53min - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: LM Eval Small Models # 53min + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-small.txt -t 1 - - label: OpenAI API correctness - source_file_dependencies: - - csrc/ - - vllm/entrypoints/openai/ - - vllm/model_executor/models/whisper.py - commands: # LMEval+Transcription WER check - - pytest -s entrypoints/openai/correctness/ +- label: OpenAI API correctness + source_file_dependencies: + - csrc/ + - vllm/entrypoints/openai/ + - vllm/model_executor/models/whisper.py + commands: # LMEval+Transcription WER check + - pytest -s entrypoints/openai/correctness/ - - label: Encoder Decoder tests # 5min - source_file_dependencies: - - vllm/ - - tests/encoder_decoder - commands: - - pytest -v -s encoder_decoder +- label: Encoder Decoder tests # 5min + source_file_dependencies: + - vllm/ + - tests/encoder_decoder + commands: + - pytest -v -s encoder_decoder - - label: OpenAI-Compatible Tool Use # 20 min - fast_check: false - mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/tool_use - commands: - - pytest -v -s tool_use +- label: OpenAI-Compatible Tool Use # 20 min + fast_check: false + mirror_hardwares: [ amd ] + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s tool_use - ##### models test ##### +##### models test ##### - - label: Basic Models Test # 24min - source_file_dependencies: - - vllm/ - - tests/models - commands: - - pytest -v -s models/test_transformers.py - - pytest -v -s models/test_registry.py - # V1 Test: https://github.com/vllm-project/vllm/issues/14531 - - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py +- label: Basic Models Test # 24min + source_file_dependencies: + - vllm/ + - tests/models + commands: + - pytest -v -s models/test_transformers.py + - pytest -v -s models/test_registry.py + # V1 Test: https://github.com/vllm-project/vllm/issues/14531 + - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py - - label: Language Models Test (Standard) # 32min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language - commands: - - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' - - pytest -v -s models/embedding/language -m core_model +- label: Language Models Test (Standard) # 32min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language + commands: + - pytest -v -s models/decoder_only/language -m 'core_model or quant_model' + - pytest -v -s models/embedding/language -m core_model - - label: SSM and Hybrid Models Test # X min - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/language/test_hybrid.py - - tests/models/decoder_only/language/test_mamba.py - commands: - - pytest -v -s models/decoder_only/language/test_hybrid.py - - pytest -v -s models/decoder_only/language/test_mamba.py +- label: Language Models Test (Extended) # 1h10min + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language + - tests/models/embedding/language + - tests/models/encoder_decoder/language + commands: + - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/language -m 'not core_model' - - label: Language Models Test (Extended) # 1h10min - optional: true - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/language - - tests/models/embedding/language - - tests/models/encoder_decoder/language - commands: - - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/language -m 'not core_model' +- label: Multi-Modal Models Test (Standard) # 40min + #mirror_hardwares: [amd] + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/audio_language + - tests/models/encoder_decoder/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/multimodal + - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' + - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' + - pytest -v -s models/embedding/vision_language -m core_model + - pytest -v -s models/encoder_decoder/audio_language -m core_model + - pytest -v -s models/encoder_decoder/language -m core_model + - pytest -v -s models/encoder_decoder/vision_language -m core_model - - label: Multi-Modal Models Test (Standard) # 40min - #mirror_hardwares: [amd] - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/audio_language - - tests/models/encoder_decoder/vision_language - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/multimodal - - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model' - - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model' - - pytest -v -s models/embedding/vision_language -m core_model - - pytest -v -s models/encoder_decoder/audio_language -m core_model - - pytest -v -s models/encoder_decoder/language -m core_model - - pytest -v -s models/encoder_decoder/vision_language -m core_model +- label: Multi-Modal Models Test (Extended) 1 # 48m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/audio_language + - tests/models/decoder_only/vision_language + - tests/models/embedding/vision_language + - tests/models/encoder_decoder/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' + # HACK - run phi3v tests separately to sidestep this transformers bug + # https://github.com/huggingface/transformers/issues/34307 + - pytest -v -s models/decoder_only/vision_language/test_phi3v.py + - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' + - pytest -v -s models/embedding/vision_language -m 'not core_model' + - pytest -v -s models/encoder_decoder/language -m 'not core_model' + - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' - - label: Multi-Modal Models Test (Extended) 1 # 48m - optional: true - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/audio_language - - tests/models/decoder_only/vision_language - - tests/models/embedding/vision_language - - tests/models/encoder_decoder/vision_language - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model' - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model' - # HACK - run phi3v tests separately to sidestep this transformers bug - # https://github.com/huggingface/transformers/issues/34307 - - pytest -v -s models/decoder_only/vision_language/test_phi3v.py - - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'not core_model and not quant_model' - - pytest -v -s models/embedding/vision_language -m 'not core_model' - - pytest -v -s models/encoder_decoder/language -m 'not core_model' - - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model' +- label: Multi-Modal Models Test (Extended) 2 # 38m + optional: true + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/vision_language + commands: + - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git + - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' - - label: Multi-Modal Models Test (Extended) 2 # 38m - optional: true - source_file_dependencies: - - vllm/ - - tests/models/decoder_only/vision_language - commands: - - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git - - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model' +- label: SSM and Hybrid Models Test # X min + source_file_dependencies: + - vllm/ + - tests/models/decoder_only/language/test_hybrid.py + - tests/models/decoder_only/language/test_mamba.py + commands: + - pytest -v -s models/decoder_only/language/test_hybrid.py + - pytest -v -s models/decoder_only/language/test_mamba.py - # This test is used only in PR development phase to test individual models and should never run on main - - label: Custom Models Test - optional: true - commands: - - echo 'Testing custom models...' - # PR authors can temporarily add commands below to test individual models - # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py - # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* +# This test is used only in PR development phase to test individual models and should never run on main +- label: Custom Models Test + optional: true + commands: + - echo 'Testing custom models...' + # PR authors can temporarily add commands below to test individual models + # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py + # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR* - ##### 1 GPU test ##### - ##### multi gpus test ##### +##### 1 GPU test ##### +##### multi gpus test ##### - - label: Distributed Comm Ops Test # 7min - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed - - tests/distributed - commands: - - pytest -v -s distributed/test_comm_ops.py - - pytest -v -s distributed/test_shm_broadcast.py +- label: Distributed Comm Ops Test # 7min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed + - tests/distributed + commands: + - pytest -v -s distributed/test_comm_ops.py + - pytest -v -s distributed/test_shm_broadcast.py - - label: 2 Node Tests (4 GPUs in total) # 16min - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - num_nodes: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py - - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py - - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) - - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' +- label: 2 Node Tests (4 GPUs in total) # 16min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + num_nodes: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py + - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py + - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up) + - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' - - label: Distributed Tests (2 GPUs) # 40min - #mirror_hardwares: [amd] - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - - vllm/compilation - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/model_runner.py - - entrypoints/llm/test_collective_rpc.py - commands: - - pytest -v -s entrypoints/llm/test_collective_rpc.py - - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py - - pytest -v -s ./compile/test_basic_correctness.py - - pytest -v -s ./compile/test_wrapper.py - - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' - - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - # Avoid importing model tests that cause CUDA reinitialization error - - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' - - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' - # this test fails consistently. - # TODO: investigate and fix - # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py +- label: Distributed Tests (2 GPUs) # 40min + #mirror_hardwares: [amd] + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + - vllm/compilation + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/model_runner.py + - entrypoints/llm/test_collective_rpc.py + commands: + - pytest -v -s entrypoints/llm/test_collective_rpc.py + - VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py + - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py + - pytest -v -s ./compile/test_basic_correctness.py + - pytest -v -s ./compile/test_wrapper.py + - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed' + - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + # Avoid importing model tests that cause CUDA reinitialization error + - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)' + - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)' + # this test fails consistently. + # TODO: investigate and fix + # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py + - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py - - label: Plugin Tests (2 GPUs) # 40min - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/plugins/ - - tests/plugins/ - commands: - # begin platform plugin tests, all the code in-between runs on dummy platform - - pip install -e ./plugins/vllm_add_dummy_platform - - pytest -v -s plugins_tests/test_platform_plugins.py - - pip uninstall vllm_add_dummy_platform -y - # end platform plugin tests - # other tests continue here: - - pytest -v -s plugins_tests/test_scheduler_plugins.py - - pip install -e ./plugins/vllm_add_dummy_model - - pytest -v -s distributed/test_distributed_oot.py - - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process - - pytest -v -s models/test_oot_registration.py # it needs a clean process +- label: Plugin Tests (2 GPUs) # 40min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/plugins/ + - tests/plugins/ + commands: + # begin platform plugin tests, all the code in-between runs on dummy platform + - pip install -e ./plugins/vllm_add_dummy_platform + - pytest -v -s plugins_tests/test_platform_plugins.py + - pip uninstall vllm_add_dummy_platform -y + # end platform plugin tests + # other tests continue here: + - pytest -v -s plugins_tests/test_scheduler_plugins.py + - pip install -e ./plugins/vllm_add_dummy_model + - pytest -v -s distributed/test_distributed_oot.py + - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process + - pytest -v -s models/test_oot_registration.py # it needs a clean process - - label: Multi-step Tests (4 GPUs) # 36min - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/model_executor/layers/sampler.py - - vllm/sequence.py - - vllm/worker/worker_base.py - - vllm/worker/worker.py - - vllm/worker/multi_step_worker.py - - vllm/worker/model_runner_base.py - - vllm/worker/model_runner.py - - vllm/worker/multi_step_model_runner.py - - vllm/engine - - tests/multi_step - commands: - # this test is quite flaky - # TODO: investigate and fix. - # - pytest -v -s multi_step/test_correctness_async_llm.py - - pytest -v -s multi_step/test_correctness_llm.py +- label: Multi-step Tests (4 GPUs) # 36min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/model_executor/layers/sampler.py + - vllm/sequence.py + - vllm/worker/worker_base.py + - vllm/worker/worker.py + - vllm/worker/multi_step_worker.py + - vllm/worker/model_runner_base.py + - vllm/worker/model_runner.py + - vllm/worker/multi_step_model_runner.py + - vllm/engine + - tests/multi_step + commands: + # this test is quite flaky + # TODO: investigate and fix. + # - pytest -v -s multi_step/test_correctness_async_llm.py + - pytest -v -s multi_step/test_correctness_llm.py - - label: Pipeline Parallelism Test # 45min - working_dir: "/vllm-workspace/tests" - num_gpus: 4 - source_file_dependencies: - - vllm/distributed/ - - vllm/engine/ - - vllm/executor/ - - vllm/model_executor/models/ - - tests/distributed/ - commands: - - pytest -v -s distributed/test_pp_cudagraph.py - - pytest -v -s distributed/test_pipeline_parallel.py +- label: Pipeline Parallelism Test # 45min + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/ + - vllm/engine/ + - vllm/executor/ + - vllm/model_executor/models/ + - tests/distributed/ + commands: + - pytest -v -s distributed/test_pp_cudagraph.py + - pytest -v -s distributed/test_pipeline_parallel.py - - label: LoRA TP Test (Distributed) - num_gpus: 4 - source_file_dependencies: - - vllm/lora - - tests/lora - commands: - # FIXIT: find out which code initialize cuda before running the test - # before the fix, we need to use spawn to test it - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - # This test runs llama 13B, so it is required to run on 4 GPUs. - - pytest -v -s -x lora/test_long_context.py - # There is some Tensor Parallelism related processing logic in LoRA that - # requires multi-GPU testing for validation. - - pytest -v -s -x lora/test_chatglm3_tp.py - - pytest -v -s -x lora/test_llama_tp.py - - pytest -v -s -x lora/test_minicpmv_tp.py - - pytest -v -s -x lora/test_transfomers_model.py +- label: LoRA TP Test (Distributed) + num_gpus: 4 + source_file_dependencies: + - vllm/lora + - tests/lora + commands: + # FIXIT: find out which code initialize cuda before running the test + # before the fix, we need to use spawn to test it + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + # This test runs llama 13B, so it is required to run on 4 GPUs. + - pytest -v -s -x lora/test_long_context.py + # There is some Tensor Parallelism related processing logic in LoRA that + # requires multi-GPU testing for validation. + - pytest -v -s -x lora/test_chatglm3_tp.py + - pytest -v -s -x lora/test_llama_tp.py + - pytest -v -s -x lora/test_minicpmv_tp.py + - pytest -v -s -x lora/test_transfomers_model.py - - label: Weight Loading Multiple GPU Test # 33min - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - - label: Weight Loading Multiple GPU Test - Large Models # optional - working_dir: "/vllm-workspace/tests" - num_gpus: 2 - gpu: a100 - optional: true - source_file_dependencies: - - vllm/ - - tests/weight_loading - commands: - - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt +- label: Weight Loading Multiple GPU Test # 33min + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt - ##### multi gpus test ##### - ##### A100 test ##### +- label: Weight Loading Multiple GPU Test - Large Models # optional + working_dir: "/vllm-workspace/tests" + num_gpus: 2 + gpu: a100 + optional: true + source_file_dependencies: + - vllm/ + - tests/weight_loading + commands: + - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt - - label: Distributed Tests (A100) # optional - gpu: a100 - optional: true - num_gpus: 4 - source_file_dependencies: - - vllm/ - commands: - # NOTE: don't test llama model here, it seems hf implementation is buggy - # see https://github.com/vllm-project/vllm/pull/5689 for details - - pytest -v -s distributed/test_custom_all_reduce.py - - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py - - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' - - pytest -v -s -x lora/test_mixtral.py - - label: LM Eval Large Models # optional - gpu: a100 - optional: true - num_gpus: 4 - working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" - source_file_dependencies: - - csrc/ - - vllm/model_executor/layers/quantization - commands: - - export VLLM_WORKER_MULTIPROC_METHOD=spawn - - bash ./run-tests.sh -c configs/models-large.txt -t 4 +##### multi gpus test ##### +##### A100 test ##### + +- label: Distributed Tests (A100) # optional + gpu: a100 + optional: true + num_gpus: 4 + source_file_dependencies: + - vllm/ + commands: + # NOTE: don't test llama model here, it seems hf implementation is buggy + # see https://github.com/vllm-project/vllm/pull/5689 for details + - pytest -v -s distributed/test_custom_all_reduce.py + - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py + - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)' + - pytest -v -s -x lora/test_mixtral.py + +- label: LM Eval Large Models # optional + gpu: a100 + optional: true + num_gpus: 4 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - csrc/ + - vllm/model_executor/layers/quantization + commands: + - export VLLM_WORKER_MULTIPROC_METHOD=spawn + - bash ./run-tests.sh -c configs/models-large.txt -t 4