Mirror of https://git.datalinker.icu/vllm-project/vllm.git (synced 2025-12-09 08:05:31 +08:00)
[gpt-oss] raise error for flashinfer backend without trtllm (#24482)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
parent 9a161307f5
commit b5e383cd8b
@@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
         self.window_left = self.global_hyperparameters.window_left
         self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
         self.has_sinks = self.global_hyperparameters.has_sinks

+        if self.has_sinks and not supports_trtllm_attention()[0]:
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         # Preparing persistent buffers (device-side)
         self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
                                            dtype=torch.int32,
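For reference, a minimal runnable sketch of the init-time guard this hunk adds, outside of vLLM. The supports_trtllm_attention() stub below is hypothetical; its return shape is only inferred from the diff (the [0] index suggests a tuple whose first element is the support flag), and only the guard itself mirrors the commit.

def supports_trtllm_attention() -> tuple[bool, str]:
    # Hypothetical stand-in: pretend trtllm attention is unavailable,
    # e.g. on a pre-Blackwell GPU.
    return (False, "requires Blackwell-class hardware")

class BuilderSketch:
    def __init__(self, has_sinks: bool) -> None:
        self.has_sinks = has_sinks
        # Fail fast at construction: attention sinks need the trtllm path.
        if self.has_sinks and not supports_trtllm_attention()[0]:
            raise NotImplementedError(
                "FlashInfer backend currently does not support attention "
                "sinks, please use trtllm on blackwell or flash attention on "
                "earlier GPUs.")

try:
    BuilderSketch(has_sinks=True)
except NotImplementedError as exc:
    print(f"raised as expected: {exc}")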
@@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
             self.q_data_type,
             is_prefill=False,
             has_sinks=self.has_sinks)

+        if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
+            raise NotImplementedError(
+                "FlashInfer backend currently does not support attention "
+                "sinks, please use trtllm on blackwell or flash attention on "
+                "earlier GPUs.")
         attn_metadata = FlashInferMetadata(
             num_actual_tokens=num_actual_tokens,
             q_data_type=self.q_data_type,
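And a sketch of the second, per-batch guard: even on hardware where trtllm is available, the trtllm path is chosen separately for the prefill and decode phases, so sinks must be rejected unless both phases take it. The use_trtllm_attention() signature below is a hypothetical reduction; the real vLLM helper takes more arguments than shown here.

def use_trtllm_attention(q_data_type: str, *, is_prefill: bool,
                         has_sinks: bool) -> bool:
    # Stand-in heuristic: only allow trtllm for 16-bit query dtypes.
    return q_data_type in ("float16", "bfloat16")

def build_metadata_guard(has_sinks: bool, q_data_type: str) -> None:
    prefill_use_trtllm = use_trtllm_attention(
        q_data_type, is_prefill=True, has_sinks=has_sinks)
    decode_use_trtllm = use_trtllm_attention(
        q_data_type, is_prefill=False, has_sinks=has_sinks)
    # Sinks are only supported when both phases run on trtllm.
    if has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
        raise NotImplementedError(
            "FlashInfer backend currently does not support attention "
            "sinks, please use trtllm on blackwell or flash attention on "
            "earlier GPUs.")

build_metadata_guard(has_sinks=True, q_data_type="float16")  # passes
try:
    build_metadata_guard(has_sinks=True, q_data_type="fp8_e4m3")
except NotImplementedError as exc:
    print(f"raised as expected: {exc}")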