diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py
index a41d318c3c2c..9be90442fa80 100755
--- a/vllm/v1/attention/backends/flash_attn.py
+++ b/vllm/v1/attention/backends/flash_attn.py
@@ -244,13 +244,6 @@ class FlashAttentionMetadataBuilder(AttentionMetadataBuilder[FlashAttentionMetad
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
diff --git a/vllm/v1/attention/backends/mla/flashattn_mla.py b/vllm/v1/attention/backends/mla/flashattn_mla.py
index 7b084ae969d9..8a1e79baa87c 100644
--- a/vllm/v1/attention/backends/mla/flashattn_mla.py
+++ b/vllm/v1/attention/backends/mla/flashattn_mla.py
@@ -97,13 +97,6 @@ class FlashAttnMLAMetadataBuilder(MLACommonMetadataBuilder[FlashAttnMLAMetadata]
         self.max_cudagraph_size = self.compilation_config.max_cudagraph_capture_size
 
         if self.use_full_cuda_graph and self.fa_aot_schedule:
-            if self.max_cudagraph_size > 992:
-                # This condition derives from FA3's internal heuristic.
-                # TODO(woosuk): Support larger cudagraph sizes.
-                raise ValueError(
-                    "Capture size larger than 992 is not supported for full cuda graph."
-                )
-
             self.scheduler_metadata = torch.zeros(
                 vllm_config.scheduler_config.max_num_seqs + 1,
                 dtype=torch.int32,
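
For illustration, a minimal standalone sketch of the initialization path both builders share after this change: the 992 cap (which came from FA3's internal heuristic) is gone, and the scheduler-metadata buffer is sized only by `max_num_seqs`. The stand-in config classes and constructor arguments below are hypothetical simplifications; the real builders take the full `vllm_config` and more arguments.

```python
import torch


class _FakeSchedulerConfig:
    # Hypothetical stand-in for vllm_config.scheduler_config.
    max_num_seqs = 256


class _FakeCompilationConfig:
    # Hypothetical stand-in; 1024 > 992 no longer raises after this diff.
    max_cudagraph_capture_size = 1024


class MetadataBuilderSketch:
    """Sketch of the post-change init path shared by both builders."""

    def __init__(self, scheduler_config, compilation_config,
                 use_full_cuda_graph=True, aot_schedule=True, device="cpu"):
        self.max_cudagraph_size = compilation_config.max_cudagraph_capture_size

        if use_full_cuda_graph and aot_schedule:
            # Previously: raise ValueError if max_cudagraph_size > 992.
            # That cap is removed; only the persistent buffer is allocated.
            self.scheduler_metadata = torch.zeros(
                scheduler_config.max_num_seqs + 1,
                dtype=torch.int32,
                device=device,
            )


builder = MetadataBuilderSketch(_FakeSchedulerConfig(), _FakeCompilationConfig())
print(builder.scheduler_metadata.shape)  # torch.Size([257])
```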