From 6a3f0a5abc1509a31fa8cc2a34fc1b9df474abad Mon Sep 17 00:00:00 2001
From: bk-201
Date: Sat, 13 Dec 2025 16:45:54 +0000
Subject: [PATCH] fix the issue with the MM token count

Signed-off-by: bk-201
---
 vllm/lora/layers/base_linear.py    | 4 ++--
 vllm/v1/worker/gpu_model_runner.py | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 0ed3508510914..fc79aec5d650f 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
     def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
-        # Store original shape for later reshaping
         original_shape = output.shape if output.ndim == 3 else None
 
         # In transformers backend, x and output have extra batch dimension like
@@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         if not current_platform.can_update_inplace():
             output = lora_output
 
-        # Restore original shape if it was flattened
+        # Reshape the flattened output back to its original shape,
+        # as some MM encoders cannot handle flattened inputs.
         if original_shape is not None:
             output = output.reshape(original_shape)
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 17f77000f5305..d4f214a20595c 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2157,8 +2157,13 @@ class GPUModelRunner(
                 req_idx = self.input_batch.req_id_to_index[req_id]
                 lora_id = int(self.input_batch.request_lora_mapping[req_idx])
 
+                # Prefer pos_info.is_embed to count actual MM embedding tokens.
+                # pos_info.length may overcount (e.g., special tokens in Qwen-VL).
+                # Fall back to length if is_embed is None.
                 num_tokens = self.info.get_num_mm_encoder_tokens(  # type: ignore[attr-defined]
                     pos_info.length
+                    if pos_info.is_embed is None
+                    else pos_info.is_embed.sum()
                 )
                 prompt_lora_mapping.append(lora_id)
                 token_lora_mapping.extend([lora_id] * num_tokens)