mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-14 19:54:57 +08:00
[gpt-oss] raise error for flashinfer backend without trtllm (#24482)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
This commit is contained in:
parent
9a161307f5
commit
b5e383cd8b
@ -216,7 +216,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
self.window_left = self.global_hyperparameters.window_left
|
self.window_left = self.global_hyperparameters.window_left
|
||||||
self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
|
self.logits_soft_cap = self.global_hyperparameters.logits_soft_cap
|
||||||
self.has_sinks = self.global_hyperparameters.has_sinks
|
self.has_sinks = self.global_hyperparameters.has_sinks
|
||||||
|
if self.has_sinks and not supports_trtllm_attention()[0]:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"FlashInfer backend currently does not support attention "
|
||||||
|
"sinks, please use trtllm on blackwell or flash attention on "
|
||||||
|
"earlier GPUs.")
|
||||||
# Preparing persistent buffers (device-side)
|
# Preparing persistent buffers (device-side)
|
||||||
self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
|
self.paged_kv_indptr = torch.zeros(max_num_reqs + 1,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
@ -408,7 +412,11 @@ class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
|
|||||||
self.q_data_type,
|
self.q_data_type,
|
||||||
is_prefill=False,
|
is_prefill=False,
|
||||||
has_sinks=self.has_sinks)
|
has_sinks=self.has_sinks)
|
||||||
|
if self.has_sinks and not (prefill_use_trtllm and decode_use_trtllm):
|
||||||
|
raise NotImplementedError(
|
||||||
|
"FlashInfer backend currently does not support attention "
|
||||||
|
"sinks, please use trtllm on blackwell or flash attention on "
|
||||||
|
"earlier GPUs.")
|
||||||
attn_metadata = FlashInferMetadata(
|
attn_metadata = FlashInferMetadata(
|
||||||
num_actual_tokens=num_actual_tokens,
|
num_actual_tokens=num_actual_tokens,
|
||||||
q_data_type=self.q_data_type,
|
q_data_type=self.q_data_type,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user