From d23539549a6db54ab152ce4e566c31f6891ddab5 Mon Sep 17 00:00:00 2001
From: Adrian Abeyta
Date: Tue, 11 Nov 2025 18:34:58 -0600
Subject: [PATCH] Use FLASHINFER MLA backend when testing fp8_kv_scale_compile
 (#28491)

Signed-off-by: adabeyta
---
 tests/compile/test_full_graph.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/compile/test_full_graph.py b/tests/compile/test_full_graph.py
index 71f90f6d8d3ee..b4e5e56ac9fe6 100644
--- a/tests/compile/test_full_graph.py
+++ b/tests/compile/test_full_graph.py
@@ -10,6 +10,7 @@ import torch
 
 from tests.quantization.utils import is_quant_method_supported
 from vllm import LLM, SamplingParams
+from vllm.attention.backends.registry import AttentionBackendEnum
 from vllm.config import CompilationConfig, CompilationMode, CUDAGraphMode, PassConfig
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import is_torch_equal_or_newer
@@ -184,13 +185,24 @@ def test_custom_compile_config(
     [CompilationMode.NONE, CompilationMode.VLLM_COMPILE],
 )
 @pytest.mark.parametrize(
-    "model",
+    "model, backend",
     [
-        "Qwen/Qwen2-0.5B",  # Standard attention model
-        "deepseek-ai/DeepSeek-V2-Lite",  # MLA (Multi-head Latent Attention) model
+        ("Qwen/Qwen2-0.5B", None),  # Standard attention model
+        (
+            "deepseek-ai/DeepSeek-V2-Lite",
+            AttentionBackendEnum.FLASHINFER_MLA,
+        ),  # MLA (Multi-head Latent Attention) model
     ],
 )
-def test_fp8_kv_scale_compile(compilation_mode: int, model: str):
+def test_fp8_kv_scale_compile(
+    monkeypatch: pytest.MonkeyPatch,
+    compilation_mode: int,
+    model: str,
+    backend: AttentionBackendEnum | None,
+):
+    if backend:
+        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
+
     model_kwargs = {
         "quantization": "fp8",
         "kv_cache_dtype": "fp8_e4m3",
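
Note: the sketch below is a minimal, self-contained approximation of the pattern this patch introduces: parametrizing a test over (model, backend) pairs and pinning the attention backend through the VLLM_ATTENTION_BACKEND environment variable with pytest's monkeypatch fixture. It is illustrative only and not part of the patch; BackendEnum is a stand-in for vllm.attention.backends.registry.AttentionBackendEnum, and the test body only inspects the environment variable rather than constructing an LLM, so it runs without a GPU or vLLM installed.

# Standalone sketch (illustrative, not the vLLM test): BackendEnum stands in
# for vllm.attention.backends.registry.AttentionBackendEnum, and the body only
# checks the environment variable instead of building an fp8-quantized LLM.
import enum
import os

import pytest


class BackendEnum(enum.Enum):
    FLASHINFER_MLA = enum.auto()


@pytest.mark.parametrize(
    "model, backend",
    [
        ("Qwen/Qwen2-0.5B", None),  # standard attention: leave backend unset
        ("deepseek-ai/DeepSeek-V2-Lite", BackendEnum.FLASHINFER_MLA),  # MLA model
    ],
)
def test_backend_env_selection(
    monkeypatch: pytest.MonkeyPatch,
    model: str,
    backend: BackendEnum | None,
) -> None:
    # The model id is unused here; it is kept only to mirror the patch's
    # parametrization shape.
    assert model

    if backend:
        # Pin the attention backend for this test only; monkeypatch restores
        # the environment when the test finishes.
        monkeypatch.setenv("VLLM_ATTENTION_BACKEND", backend.name)
        assert os.environ["VLLM_ATTENTION_BACKEND"] == backend.name
    else:
        # No backend requested: make sure nothing from the outer environment
        # leaks in, leaving selection to the default resolution logic.
        monkeypatch.delenv("VLLM_ATTENTION_BACKEND", raising=False)
        assert "VLLM_ATTENTION_BACKEND" not in os.environ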