From 8226dd56bf64f80e04f2aa90c97401d8783aa2bf Mon Sep 17 00:00:00 2001
From: Tao He
Date: Sat, 13 Sep 2025 06:31:32 +0800
Subject: [PATCH] [Qwen3Next] Fixes the cuda graph capture conditions under
 large batch sizes (#24660) (#24667)

Signed-off-by: Tao He
---
 vllm/v1/attention/backends/gdn_attn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 12233af057b04..74eb9ae9d3254 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -209,7 +209,8 @@ class GDNAttentionMetadataBuilder(
         # prepare tensors for cudagraph
         if (self.use_full_cuda_graph and num_prefills == 0
                 and num_decodes == 0
-                and num_spec_decodes <= self.decode_cudagraph_max_bs):
+                and num_spec_decodes <= self.decode_cudagraph_max_bs
+                and m.num_actual_tokens <= self.decode_cudagraph_max_bs):
             num_total_tokens = self.vllm_config.pad_for_cudagraph(
                 m.num_actual_tokens)
             batch_size = num_total_tokens // (self.num_spec + 1)
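
Why the extra guard matters, as a minimal standalone sketch (not vLLM's actual
code): the arithmetic at the end of the hunk, batch_size = num_total_tokens //
(self.num_spec + 1), suggests each speculative-decode request contributes
num_spec + 1 tokens, so num_actual_tokens grows roughly num_spec + 1 times
faster than num_spec_decodes. The variable names below mirror the patch; the
concrete values are illustrative assumptions.

    # Sketch of the capture condition under assumed values.
    num_spec = 3                    # speculative tokens per request (assumed)
    decode_cudagraph_max_bs = 512   # largest captured CUDA graph size (assumed)
    num_spec_decodes = 200          # spec-decode requests in the batch (assumed)

    # Each spec-decode request carries num_spec + 1 tokens.
    num_actual_tokens = num_spec_decodes * (num_spec + 1)    # 800

    # Old condition bounded only the request count, so it passes here...
    old_ok = num_spec_decodes <= decode_cudagraph_max_bs     # True (200 <= 512)

    # ...even though the token count handed to pad_for_cudagraph() exceeds
    # the largest captured size, which is the large-batch failure mode the
    # patch targets.
    overflow = num_actual_tokens > decode_cudagraph_max_bs   # True (800 > 512)

    # Patched condition also bounds the token count, so oversized batches
    # skip the cudagraph path instead of padding past the captured maximum.
    new_ok = (num_spec_decodes <= decode_cudagraph_max_bs
              and num_actual_tokens <= decode_cudagraph_max_bs)  # False

With the additional check, a batch like the one above simply takes the
non-cudagraph path; only batches whose total token count fits within the
captured sizes are padded by pad_for_cudagraph().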