From 8226dd56bf64f80e04f2aa90c97401d8783aa2bf Mon Sep 17 00:00:00 2001
From: Tao He
Date: Sat, 13 Sep 2025 06:31:32 +0800
Subject: [PATCH] [Qwen3Next] Fixes the cuda graph capture conditions under
 large batch sizes (#24660) (#24667)

Signed-off-by: Tao He
---
 vllm/v1/attention/backends/gdn_attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 12233af057b04..74eb9ae9d3254 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder(
         # prepare tensors for cudagraph
         if (self.use_full_cuda_graph and num_prefills == 0
                 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+                and num_spec_decodes <= self.decode_cudagraph_max_bs
+                and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
             num_total_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
             batch_size = num_total_tokens // (self.num_spec + 1)
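
Why the extra guard matters, as a minimal standalone sketch (not vLLM's actual
code): the arithmetic at the end of the hunk, batch_size = num_total_tokens //
(self.num_spec + 1), suggests each speculative-decode request contributes
num_spec + 1 tokens, so num_actual_tokens grows roughly num_spec + 1 times
faster than num_spec_decodes. The variable names below mirror the patch; the
concrete values are illustrative assumptions.

    # Sketch of the capture condition under assumed values.
    num_spec = 3                    # speculative tokens per request (assumed)
    decode_cudagraph_max_bs = 512   # largest captured CUDA graph size (assumed)
    num_spec_decodes = 200          # spec-decode requests in the batch (assumed)

    # Each spec-decode request carries num_spec + 1 tokens.
    num_actual_tokens = num_spec_decodes * (num_spec + 1)    # 800

    # Old condition bounded only the request count, so it passes here...
    old_ok = num_spec_decodes <= decode_cudagraph_max_bs     # True (200 <= 512)

    # ...even though the token count handed to pad_for_cudagraph() exceeds
    # the largest captured size, which is the large-batch failure mode the
    # patch targets.
    overflow = num_actual_tokens > decode_cudagraph_max_bs   # True (800 > 512)

    # Patched condition also bounds the token count, so oversized batches
    # skip the cudagraph path instead of padding past the captured maximum.
    new_ok = (num_spec_decodes <= decode_cudagraph_max_bs
              and num_actual_tokens <= decode_cudagraph_max_bs)  # False

With the additional check, a batch like the one above simply takes the
non-cudagraph path; only batches whose total token count fits within the
captured sizes are padded by pad_for_cudagraph().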