mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-15 06:45:01 +08:00
[Attention] Remove max cudagraph size limit of 992 (#27840)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
This commit is contained in:
parent
4a36681f85
commit
608bb14462
@ -244,13 +244,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
|
|||||||
self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
|
self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
|
||||||
|
|
||||||
if self.use_full_cuda_graph and self.aot_schedule:
|
if self.use_full_cuda_graph and self.aot_schedule:
|
||||||
if self.max_cudagraph_size > 992:
|
|
||||||
# This condition derives from FA3's internal heuristic.
|
|
||||||
# TODO(woosuk): Support larger cudagraph sizes.
|
|
||||||
raise ValueError(
|
|
||||||
"Capture size larger than 992 is not supported for full cuda graph."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.scheduler_metadata = torch.zeros(
|
self.scheduler_metadata = torch.zeros(
|
||||||
vllm_config.scheduler_config.max_num_seqs + 1,
|
vllm_config.scheduler_config.max_num_seqs + 1,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
|
|||||||
@ -97,13 +97,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
|
|||||||
self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
|
self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
|
||||||
|
|
||||||
if self.use_full_cuda_graph and self.fa_aot_schedule:
|
if self.use_full_cuda_graph and self.fa_aot_schedule:
|
||||||
if self.max_cudagraph_size > 992:
|
|
||||||
# This condition derives from FA3's internal heuristic.
|
|
||||||
# TODO(woosuk): Support larger cudagraph sizes.
|
|
||||||
raise ValueError(
|
|
||||||
"Capture size larger than 992 is not supported for full cuda graph."
|
|
||||||
)
|
|
||||||
|
|
||||||
self.scheduler_metadata = torch.zeros(
|
self.scheduler_metadata = torch.zeros(
|
||||||
vllm_config.scheduler_config.max_num_seqs + 1,
|
vllm_config.scheduler_config.max_num_seqs + 1,
|
||||||
dtype=torch.int32,
|
dtype=torch.int32,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user