From a736e5ff770bd0e69492c24d24d04b7fd47789c0 Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Fri, 7 Nov 2025 15:58:16 +0800
Subject: [PATCH] [CI] Reduce Blackwell Fusion test runtime by filtering tests and only running all tests in nightly (#28074)

---
 .buildkite/test-pipeline.yaml     | 27 ++++++++++++++++++++++++++-
 tests/compile/test_fusions_e2e.py | 12 +++++-------
 2 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index f3c04183f9ef..a4436bc2ac22 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -472,7 +472,9 @@ steps:
     - tests/compile
   commands:
     - pytest -v -s compile/test_full_graph.py
-    - pytest -v -s compile/test_fusions_e2e.py
+    # Limit to no custom ops to reduce running time
+    # Wrap with quotes to escape yaml and avoid starting -k string with a -
+    - "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and -quant_fp8'"
 
 - label: Cudagraph test
   timeout_in_minutes: 20
@@ -929,6 +931,29 @@ steps:
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
     # this runner has 2 GPUs available even though num_gpus=2 is not set
     - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    # Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
+    # Wrap with quotes to escape yaml
+    - "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and Llama-3.1 and -quant_fp8 and -rms_norm'"
 
+- label: Blackwell Fusion E2E Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true
+  num_gpus: 2
+  source_file_dependencies:
+    - csrc/quantization/fp4/
+    - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+    - vllm/v1/attention/backends/flashinfer.py
+    - vllm/compilation/
+    # can affect pattern matching
+    - vllm/model_executor/layers/layernorm.py
+    - vllm/model_executor/layers/activation.py
+    - vllm/model_executor/layers/quantization/input_quant_fp8.py
+    - tests/compile/test_fusions_e2e.py
+  commands:
+    - nvidia-smi
+    # Run all e2e fusion tests
+    - pytest -v -s tests/compile/test_fusions_e2e.py
 - label: Blackwell GPT-OSS Eval

diff --git a/tests/compile/test_fusions_e2e.py b/tests/compile/test_fusions_e2e.py
index d66c60ccb5b2..58026e7e7e78 100644
--- a/tests/compile/test_fusions_e2e.py
+++ b/tests/compile/test_fusions_e2e.py
@@ -54,11 +54,11 @@ if current_platform.is_cuda():
 
     MODELS_FP4 = [
         ModelBackendTestCase(
-            model_name="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            model_name="nvidia/Llama-3.1-8B-Instruct-FP4",
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             backend=_Backend.FLASHINFER,
-            attention_fusions=48,
-            allreduce_fusions=96,
+            attention_fusions=32,
+            allreduce_fusions=65,
         ),
     ]
 
@@ -95,8 +95,7 @@ elif current_platform.is_rocm():
         ),
     ]
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_FP8 = ["-quant_fp8"]  # , "+quant_fp8"]
+CUSTOM_OPS_FP8 = ["-quant_fp8", "+quant_fp8"]
 
 
 @pytest.mark.parametrize(
@@ -171,8 +170,7 @@ def test_attn_quant(
     assert int(matches[0]) == attention_fusions
 
 
-# TODO(luka) test both in nightly
-CUSTOM_OPS_RMS_NORM = ["-rms_norm"]  # , "+rms_norm"]
+CUSTOM_OPS_RMS_NORM = ["-rms_norm", "+rms_norm"]
 
 
 def custom_ops_product(*custom_ops_lists: list[str]) -> Iterable[str]:
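
Note on the -k filters used above: pytest matches each term of a -k keyword
expression as a case-insensitive substring of the generated test ID, which is
why '-quant_fp8' selects only the no-custom-op variants (the '+quant_fp8' IDs
do not contain that substring), and why the expression starts with 'TRITON'
rather than with a leading '-' that the CLI would parse as an option. A
minimal, self-contained sketch of that mechanism, using a hypothetical file
and parameters (demo_k_filter.py, backend, custom_ops) rather than the real
parametrization in tests/compile/test_fusions_e2e.py:

    # demo_k_filter.py -- hypothetical sketch, not part of the patch
    import pytest

    # Stand-ins for the backend / custom-ops axes in test_fusions_e2e.py.
    @pytest.mark.parametrize("custom_ops", ["-quant_fp8", "+quant_fp8"])
    @pytest.mark.parametrize("backend", ["TRITON", "FLASHINFER"])
    def test_demo(backend: str, custom_ops: str) -> None:
        # pytest generates IDs such as test_demo[TRITON--quant_fp8]; running
        #   pytest demo_k_filter.py -k 'TRITON and -quant_fp8'
        # keeps exactly that case and deselects the other three combinations,
        # since "+quant_fp8" IDs do not contain the substring "-quant_fp8".
        assert backend in ("TRITON", "FLASHINFER")
        assert custom_ops in ("-quant_fp8", "+quant_fp8")

Wrapping the whole command in quotes in the pipeline YAML, as the inline
comments note, keeps the -k expression from being reinterpreted by the YAML
parser; the 'True' term in the tp2 filter matches the Inductor-partition
boolean that appears in that test's parametrized ID.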