From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:32:19 -0500
Subject: [PATCH] Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Luka Govedič
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič
---
 .buildkite/test-pipeline.yaml                 |  2 ++
 tests/compile/distributed/test_fusions_e2e.py | 12 ++++--------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d4b6f4077ab3..98daebcc0693 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -930,6 +930,8 @@ steps:
     - csrc/quantization/fp4/
     - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
     - vllm/v1/attention/backends/flashinfer.py
+    - vllm/v1/worker/
+    - vllm/v1/cudagraph_dispatcher.py
     - vllm/compilation/
     # can affect pattern matching
     - vllm/model_executor/layers/layernorm.py
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 2e1b595a4389..661172e1965b 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
         ModelBackendTestCase(
             # Use smaller model for L40s in CI
             model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-            # so FI attention+fp8_quant is at least tested once
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=32,
                 allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
             # https://github.com/vllm-project/vllm/issues/28568
-            # TODO FlashInfer attn broken on Blackwell for llama4:
-            # https://github.com/vllm-project/vllm/issues/28604
-            backend=AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=48,
                 allreduce_fusion=96,