[gpt-oss] raise error for flashinfer backend without trtllm (#24482)

Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Author: Chen Zhang, 2025-09-10 14:33:13 -07:00 (committed by GitHub)
parent 9a161307f5
commit b5e383cd8b


@@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
        self.window_left = self.global_hyperparameters.window_left
        self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
        self.has_sinks = self.global_hyperparameters.has_sinks
        if self.has_sinks and not supports_trtllm_attention()[0]:
            raise NotImplementedError(
                "FlashInfer backend currently does not support attention "
                "sinks, please use trtllm on blackwell or flash attention on "
                "earlier GPUs.")
        # Preparing persistent buffers (device-side)
        self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                           dtype=torch.int32,
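
For context, this first guard runs once when the metadata builder is constructed, so a misconfigured setup fails fast at startup rather than at the first forward pass. A minimal standalone sketch of the same gate follows; `supports_trtllm_attention()` and its tuple return come from the diff above, while the helper function and its call site are hypothetical:

def check_sinks_support(has_sinks: bool, trtllm_supported: bool) -> None:
    # Fail fast: attention sinks need the TRTLLM kernels, which are
    # only available on supported (e.g. Blackwell) GPUs.
    if has_sinks and not trtllm_supported:
        raise NotImplementedError(
            "FlashInfer backend currently does not support attention "
            "sinks, please use trtllm on blackwell or flash attention on "
            "earlier GPUs.")

# Hypothetical call at builder-init time, mirroring the diff:
# check_sinks_support(self.has_sinks, supports_trtllm_attention()[0])
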
@@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
            self.q_data_type,
            is_prefill=False,
            has_sinks=self.has_sinks)
        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
            raise NotImplementedError(
                "FlashInfer backend currently does not support attention "
                "sinks, please use trtllm on blackwell or flash attention on "
                "earlier GPUs.")
        attn_metadata = FlashInferMetadata(
            num_actual_tokens=num_actual_tokens,
            q_data_type=self.q_data_type,
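
The second guard is re-evaluated per batch because the trtllm path is chosen separately for the prefill and decode phases, and sinks are only usable when both phases take it. A sketch under that assumption (the flag names mirror the diff; the helper itself is illustrative):

def validate_sinks_for_batch(has_sinks: bool,
                             prefill_use_trtllm: bool,
                             decode_use_trtllm: bool) -> None:
    # A batch mixing a trtllm phase with a non-trtllm phase still
    # cannot honor attention sinks, so both flags must be set.
    if has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
        raise NotImplementedError(
            "FlashInfer backend currently does not support attention "
            "sinks, please use trtllm on blackwell or flash attention on "
            "earlier GPUs.")
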