diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index a65f26d716ad..ac96f2f0d71d 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -454,8 +454,8 @@ steps: - pytest -v -s compile/test_fusion_attn.py - pytest -v -s compile/test_functionalization.py - pytest -v -s compile/test_silu_mul_quant_fusion.py - - pytest -v -s compile/test_sequence_parallelism.py - - pytest -v -s compile/test_async_tp.py + # - pytest -v -s compile/test_sequence_parallelism.py + # - pytest -v -s compile/test_async_tp.py - pytest -v -s compile/test_fusion_all_reduce.py - pytest -v -s compile/test_decorator.py - pytest -v -s compile/test_noop_elimination.py @@ -474,8 +474,8 @@ steps: - pytest -v -s compile/test_basic_correctness.py - pytest -v -s compile/piecewise/ -- label: PyTorch Fullgraph Test # 20min - timeout_in_minutes: 30 +- label: PyTorch Fullgraph Test # 22min + timeout_in_minutes: 35 mirror_hardwares: [amdexperimental, amdproduction] agent_pool: mi325_1 # grade: Blocking @@ -485,6 +485,7 @@ steps: - tests/compile commands: - pytest -v -s compile/test_full_graph.py + - pytest -v -s compile/test_fusions_e2e.py - label: Kernels Core Operation Test # 48min timeout_in_minutes: 75 @@ -494,6 +495,7 @@ steps: source_file_dependencies: - csrc/ - tests/kernels/core + - tests/kernels/test_top_k_per_row.py commands: - pytest -v -s kernels/core kernels/test_top_k_per_row.py @@ -606,7 +608,7 @@ steps: # we can only upgrade after this is resolved # TODO(jerryzh168): resolve the above comment - uv pip install --system torchao==0.13.0 - - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ + - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py - label: LM Eval Small Models # 53min timeout_in_minutes: 75 @@ -848,6 +850,18 @@ steps: - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work +- label: Multi-Modal Accuracy Eval (Small Models) # 50min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_1 + timeout_in_minutes: 70 + working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" + source_file_dependencies: + - vllm/multimodal/ + - vllm/inputs/ + - vllm/v1/core/ + commands: + - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 + - label: Multi-Modal Models Test (Extended) 1 mirror_hardwares: [amdexperimental] agent_pool: mi325_1 @@ -923,8 +937,8 @@ steps: # Whisper needs spawn method to avoid deadlock - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper -- label: Blackwell Test # 38 min - timeout_in_minutes: 60 +- label: Blackwell Test # 21 min + timeout_in_minutes: 30 working_dir: "/vllm-workspace/" gpu: b200 # optional: true @@ -937,8 +951,6 @@ steps: - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py - vllm/v1/attention/backends/flashinfer.py - - vllm/compilation/fusion.py - - vllm/compilation/fusion_attn.py commands: - nvidia-smi - python3 examples/offline_inference/basic/chat.py @@ -955,13 +967,32 @@ steps: - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py + - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py + - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py - # Fusion - - pytest -v -s tests/compile/test_fusion_all_reduce.py - - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern - pytest -v -s tests/kernels/moe/test_flashinfer.py + +- label: Blackwell Fusion Tests # 30 min + timeout_in_minutes: 40 + working_dir: "/vllm-workspace/" + gpu: b200 + source_file_dependencies: + - csrc/quantization/fp4/ + - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py + - vllm/v1/attention/backends/flashinfer.py + - vllm/compilation/ + # can affect pattern matching + - vllm/model_executor/layers/layernorm.py + - vllm/model_executor/layers/activation.py + - vllm/model_executor/layers/quantization/input_quant_fp8.py + commands: + - nvidia-smi + - pytest -v -s tests/compile/test_fusion_attn.py - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py + # this runner has 2 GPUs available even though num_gpus=2 is not set + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusions_e2e.py - label: Blackwell GPT-OSS Eval timeout_in_minutes: 60 @@ -1129,6 +1160,11 @@ steps: - pytest -v -s plugins_tests/test_io_processor_plugins.py - pip uninstall prithvi_io_processor_plugin -y # end io_processor plugins test + # begin stat_logger plugins test + - pip install -e ./plugins/vllm_add_dummy_stat_logger + - pytest -v -s plugins_tests/test_stats_logger_plugins.py + - pip uninstall dummy_stat_logger -y + # end stat_logger plugins test # other tests continue here: - pytest -v -s plugins_tests/test_scheduler_plugins.py - pip install -e ./plugins/vllm_add_dummy_model @@ -1173,7 +1209,6 @@ steps: - pytest -v -s -x lora/test_llama_tp.py - pytest -v -s -x lora/test_llm_with_multi_loras.py - - label: Weight Loading Multiple GPU Test # 33min timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] @@ -1202,6 +1237,18 @@ steps: commands: - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt +- label: NixlConnector PD accuracy tests (Distributed) # 30min + mirror_hardwares: [amdexperimental] + agent_pool: mi325_4 + timeout_in_minutes: 30 + working_dir: "/vllm-workspace/tests" + num_gpus: 4 + source_file_dependencies: + - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py + - tests/v1/kv_connector/nixl_integration/ + commands: + - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt + - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh ##### multi gpus test ##### ##### A100 test ##### @@ -1233,12 +1280,16 @@ steps: - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 ##### H200 test ##### -- label: Distrubted Tests (H200) # optional +- label: Distributed Tests (H200) # optional gpu: h200 optional: true working_dir: "/vllm-workspace/" num_gpus: 2 commands: + - pytest -v -s tests/compile/test_async_tp.py + - pytest -v -s tests/compile/test_sequence_parallelism.py + - pytest -v -s tests/compile/test_fusion_all_reduce.py + - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm - pytest -v -s tests/distributed/test_context_parallel.py - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048