Re-enable FlashInfer for Llama4 on Blackwell in e2e fusion tests (#28966)
Signed-off-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: ProExpertProg <11367180+ProExpertProg@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>

Parent: 0c80efd94f
Commit: 61728cd1df
@@ -930,6 +930,8 @@ steps:
   - csrc/quantization/fp4/
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
+  - vllm/v1/worker/
+  - vllm/v1/cudagraph_dispatcher.py
   - vllm/compilation/
   # can affect pattern matching
   - vllm/model_executor/layers/layernorm.py
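
The two added paths widen this step's source_file_dependencies, so changes under the v1 worker directory or to the cudagraph dispatcher now trigger the e2e fusion tests. As a rough illustration only (not the actual Buildkite logic), a prefix match over the dependency list is enough to see when the step fires; step_should_run below is a hypothetical helper written for this sketch.

# Illustrative sketch, not vLLM or Buildkite code: approximates how a list of
# changed files could be matched against a step's source_file_dependencies.
def step_should_run(changed_files: list[str], deps: list[str]) -> bool:
    for path in changed_files:
        for dep in deps:
            # Directory dependencies end with "/", file dependencies must match exactly.
            if (dep.endswith("/") and path.startswith(dep)) or path == dep:
                return True
    return False

# With the two paths added in this hunk, a worker-only change now runs the step.
deps = [
    "vllm/v1/attention/backends/flashinfer.py",
    "vllm/v1/worker/",                  # added in this commit
    "vllm/v1/cudagraph_dispatcher.py",  # added in this commit
    "vllm/compilation/",
]
print(step_should_run(["vllm/v1/worker/gpu_model_runner.py"], deps))  # True
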
@@ -47,12 +47,8 @@ if current_platform.is_cuda():
         ModelBackendTestCase(
             # Use smaller model for L40s in CI
             model_name="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
-            # TODO while llama4 is broken, use FLASHINFER for llama3 on Blackwell
-            # so FI attention+fp8_quant is at least tested once
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
-            backend=AttentionBackendEnum.FLASHINFER
-            if is_blackwell()
-            else AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=32,
                 allreduce_fusion=65,
@@ -65,9 +61,9 @@ if current_platform.is_cuda():
             model_kwargs=dict(max_model_len=1024, kv_cache_dtype="fp8"),
             # TODO FlashInfer attn broken on Hopper with kvcache=fp8:
             # https://github.com/vllm-project/vllm/issues/28568
-            # TODO FlashInfer attn broken on Blackwell for llama4:
-            # https://github.com/vllm-project/vllm/issues/28604
-            backend=AttentionBackendEnum.TRITON_ATTN,
+            backend=AttentionBackendEnum.FLASHINFER
+            if is_blackwell()
+            else AttentionBackendEnum.TRITON_ATTN,
             matches=Matches(
                 attention_fusion=48,
                 allreduce_fusion=96,
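
Taken together, the two test-file hunks move the Blackwell-only FlashInfer branch from the Llama 3.1 8B FP8 case (which now always uses TRITON_ATTN) to the Llama 4 case, since the Blackwell Llama 4 breakage tracked in issue #28604 is resolved; the Hopper fp8-kv-cache limitation (#28568) is still noted. The sketch below restates that selection logic on its own; it is not the real test module, is_blackwell() is a hypothetical stand-in based on CUDA compute capability, and plain strings stand in for the AttentionBackendEnum members shown in the diff.

# Standalone sketch of the backend choice after this commit; names other than
# those visible in the diff are assumptions for illustration.
import torch

def is_blackwell() -> bool:
    # Assumption: treat CUDA compute capability major >= 10 (SM 10.x) as Blackwell.
    return torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 10

# Llama 3.1 8B FP8 case: always Triton attention again.
llama3_backend = "TRITON_ATTN"

# Llama 4 case: FlashInfer re-enabled on Blackwell, Triton attention elsewhere.
llama4_backend = "FLASHINFER" if is_blackwell() else "TRITON_ATTN"

print(llama3_backend, llama4_backend)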