From b5e383cd8b62975dec605bed05e22d273c296c7a Mon Sep 17 00:00:00 2001
From: Chen Zhang
Date: Wed, 10 Sep 2025 14:33:13 -0700
Subject: [PATCH] [gpt-oss] raise error for flashinfer backend without trtllm
 (#24482)

Signed-off-by: Chen Zhang
---
 vllm/v1/attention/backends/flashinfer.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py
index 51defdd40de1..afa5a7c14d4d 100755
--- a/vllm/v1/attention/backends/flashinfer.py
+++ b/vllm/v1/attention/backends/flashinfer.py
@@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         self.window_left = self.global_hyperparameters.window_left
         self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
         self.has_sinks = self.global_hyperparameters.has_sinks
-
+        if self.has_sinks and not supports_trtllm_attention()[0]:
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         # Preparing persistent buffers (device-side)
         self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                            dtype=torch.int32,
@@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.q_data_type,
             is_prefill=False,
             has_sinks=self.has_sinks)
-
+        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
             q_data_type=self.q_data_type,
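
For reference, the fail-fast guard added above amounts to the following standalone pattern. This is a minimal sketch, not code from the patch: check_sinks_supported is a hypothetical helper, and it assumes, based on the patch's supports_trtllm_attention()[0] usage, that the real helper returns a tuple whose first element indicates whether TRTLLM attention kernels are usable.

# Minimal illustrative sketch of the check introduced by this patch.
# `check_sinks_supported` is a hypothetical helper, not part of vLLM.

def check_sinks_supported(has_sinks: bool, trtllm_usable: bool) -> None:
    """Raise early when attention sinks are requested but TRTLLM kernels
    are unavailable, instead of failing later inside a FlashInfer kernel."""
    if has_sinks and not trtllm_usable:
        raise NotImplementedError(
            "FlashInfer backend currently does not support attention "
            "sinks, please use trtllm on blackwell or flash attention on "
            "earlier GPUs.")

# Example: a sinks-enabled model (such as gpt-oss) on a pre-Blackwell GPU now
# fails at metadata-builder setup rather than at kernel launch time.
check_sinks_supported(has_sinks=True, trtllm_usable=False)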