From caf8b1c0840b804bf36db30e57f14e5ecf3126dd Mon Sep 17 00:00:00 2001
From: Benjamin Chislett
Date: Tue, 7 Oct 2025 18:12:26 -0400
Subject: [PATCH] [Bugfix] Fix MTP+FlashInfer crash when trtllm kernels are
 available but disabled (#26361)

Signed-off-by: Benjamin Chislett
Signed-off-by: Benjamin Chislett
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/utils/flashinfer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/utils/flashinfer.py b/vllm/utils/flashinfer.py
index 3fd37595900de..159d19bfad31b 100644
--- a/vllm/utils/flashinfer.py
+++ b/vllm/utils/flashinfer.py
@@ -220,6 +220,8 @@ def force_use_trtllm_attention() -> bool | None:
 
 
 def can_use_trtllm_attention(num_qo_heads: int, num_kv_heads: int) -> bool:
     """Check if the current configuration supports TRTLLM attention."""
+    if force_use_trtllm_attention() is False:
+        return False
     has_trtllm = supports_trtllm_attention()
     return has_trtllm and (num_qo_heads % num_kv_heads == 0)