From 61728cd1dfb03cbbfa03924f2a2cda311cfc13ac Mon Sep 17 00:00:00 2001
From: Copilot <198982749+Copilot@users.noreply.github.com>
Date: Wed, 19 Nov 2025 13:32:19 -0500
Subject: [PATCH] Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Luka Govedič
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič
---
 .buildkite/test-pipeline.yaml                 |  2 ++
 tests/compile/distributed/test_fusions_e2e.py | 12 ++++--------
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d4b6f4077ab3..98daebcc0693 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -930,6 +930,8 @@ steps:
     - csrc/quantization/fp4/
     - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
     - vllm/v1/attention/backends/flashinfer.py
+    - vllm/v1/worker/
+    - vllm/v1/cudagraph_dispatcher.py
     - vllm/compilation/
     # can affect pattern matching
     - vllm/model_executor/layers/layernorm.py
diff --git a/tests/compile/distributed/test_fusions_e2e.py b/tests/compile/distributed/test_fusions_e2e.py
index 2e1b595a4389..661172e1965b 100644
--- a/tests/compile/distributed/test_fusions_e2e.py
+++ b/tests/compile/distributed/test_fusions_e2e.py
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
         ModelBackendTestCase(
             # Use smaller model for L40s in CI
             model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-            # so FI attention+fp8_quant is at least tested once
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=32,
                 allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
             # https://github.com/vllm-project/vllm/issues/28568
-            # TODO FlashInfer attn broken on Blackwell for llama4:
-            # https://github.com/vllm-project/vllm/issues/28604
-            backend=AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=48,
                 allreduce_fusion=96,