Fix incorrect MM (multimodal) token count

Signed-off-by: bk-201 <joy25810@foxmail.com>
This commit is contained in:
bk-201 2025-12-13 16:45:54 +00:00
parent 35acd22a5d
commit 6a3f0a5abc
2 changed files with 7 additions and 2 deletions

View File

@@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
# Store original shape for later reshaping
original_shape = output.shape if output.ndim == 3 else None
# In transformers backend, x and output have extra batch dimension like
@@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
if not current_platform.can_update_inplace():
output = lora_output
# Restore original shape if it was flattened
# Reshape the flattened output back to its original shape,
# as some MM encoders cannot handle flattened inputs.
if original_shape is not None:
output = output.reshape(original_shape)

View File

@@ -2157,8 +2157,13 @@ class GPUModelRunner(
req_idx = self.input_batch.req_id_to_index[req_id]
lora_id = int(self.input_batch.request_lora_mapping[req_idx])
# Prefer pos_info.is_embed to count actual MM embedding tokens.
# pos_info.length may overcount (e.g., special tokens in Qwen-VL).
# Fall back to length if is_embed is None.
num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
pos_info.length
if pos_info.is_embed is None
else pos_info.is_embed.sum()
)
prompt_lora_mapping.append(lora_id)
token_lora_mapping.extend([lora_id] * num_tokens)