From 6a3f0a5abc1509a31fa8cc2a34fc1b9df474abad Mon Sep 17 00:00:00 2001
From: bk-201
Date: Sat, 13 Dec 2025 16:45:54 +0000
Subject: [PATCH] fix the issue with the MM token count

Signed-off-by: bk-201
---
 vllm/lora/layers/base_linear.py    | 4 ++--
 vllm/v1/worker/gpu_model_runner.py | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 0ed3508510914..fc79aec5d650f 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
     def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
-        # Store original shape for later reshaping
         original_shape = output.shape if output.ndim == 3 else None
 
         # In transformers backend, x and output have extra batch dimension like
@@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         if not current_platform.can_update_inplace():
             output = lora_output
 
-        # Restore original shape if it was flattened
+        # Reshape the flattened output back to its original shape,
+        # as some MM encoders cannot handle flattened inputs.
         if original_shape is not None:
             output = output.reshape(original_shape)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 17f77000f5305..d4f214a20595c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2157,8 +2157,13 @@ class GPUModelRunner(
                 req_idx = self.input_batch.req_id_to_index[req_id]
                 lora_id = int(self.input_batch.request_lora_mapping[req_idx])
 
+                # Prefer pos_info.is_embed to count actual MM embedding tokens.
+                # pos_info.length may overcount (e.g., special tokens in Qwen-VL).
+                # Fall back to length if is_embed is None.
                 num_tokens = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                     pos_info.length
+                    if pos_info.is_embed is None
+                    else pos_info.is_embed.sum()
                 )
                 prompt_lora_mapping.append(lora_id)
                 token_lora_mapping.extend([lora_id] * num_tokens)