mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-03-27 09:41:25 +08:00
fix the issue with the MM token count
Signed-off-by: bk-201 <joy25810@foxmail.com>
This commit is contained in:
parent
35acd22a5d
commit
6a3f0a5abc
@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
|
||||
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
|
||||
|
||||
# Store original shape for later reshaping
|
||||
original_shape = output.shape if output.ndim == 3 else None
|
||||
|
||||
# In transformers backend, x and output have extra batch dimension like
|
||||
@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
|
||||
if not current_platform.can_update_inplace():
|
||||
output = lora_output
|
||||
|
||||
# Restore original shape if it was flattened
|
||||
# Reshape the flattened output back to its original shape,
|
||||
# as some MM encoders cannot handle flattened inputs.
|
||||
if original_shape is not None:
|
||||
output = output.reshape(original_shape)
|
||||
|
||||
|
||||
@ -2157,8 +2157,13 @@ class GPUModelRunner(
|
||||
req_idx = self.input_batch.req_id_to_index[req_id]
|
||||
lora_id = int(self.input_batch.request_lora_mapping[req_idx])
|
||||
|
||||
# Prefer pos_info.is_embed to count actual MM embedding tokens.
|
||||
# pos_info.length may overcount (e.g., special tokens in Qwen-VL).
|
||||
# Fall back to length if is_embed is None.
|
||||
num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
|
||||
pos_info.length
|
||||
if pos_info.is_embed is None
|
||||
else pos_info.is_embed.sum()
|
||||
)
|
||||
prompt_lora_mapping.append(lora_id)
|
||||
token_lora_mapping.extend([lora_id] * num_tokens)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user