From add1b9d3dec4a6d1b404f5793a210ff77482b7ae Mon Sep 17 00:00:00 2001
From: drslark <96540755+drslark@users.noreply.github.com>
Date: Sun, 14 Dec 2025 17:32:16 +0800
Subject: [PATCH] [main][BugFix] Fixed an accuracy bug of Qwen3-next-MTP when
 batched inferring (#30632)

Signed-off-by: drslark <96540755+drslark@users.noreply.github.com>
---
 vllm/v1/attention/backends/gdn_attn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py
index 3a2f92d9921c3..ace2cbb0564c8 100644
--- a/vllm/v1/attention/backends/gdn_attn.py
+++ b/vllm/v1/attention/backends/gdn_attn.py
@@ -211,7 +211,7 @@ class GDNAttentionMetadataBuilder(AttentionMetadataBuilder[GDNAttentionMetadata]
             spec_token_masks = torch.repeat_interleave(
                 spec_sequence_masks, query_lens
             )
-            index = torch.argsort(spec_token_masks)
+            index = torch.argsort(spec_token_masks, stable=True)
             num_non_spec_tokens = num_prefill_tokens + num_decode_tokens
             non_spec_token_indx = index[:num_non_spec_tokens]
             spec_token_indx = index[num_non_spec_tokens:]