Fix incorrect MM (multimodal) token count

Signed-off-by: bk-201 <joy25810@foxmail.com>
This commit is contained in:
bk-201 2025-12-13 16:45:54 +00:00
parent 35acd22a5d
commit 6a3f0a5abc
2 changed files with 7 additions and 2 deletions

View File

@@ -122,7 +122,6 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
# Store original shape for later reshaping
original_shape = output.shape if output.ndim == 3 else None
# In transformers backend, x and output have extra batch dimension like
@@ -138,7 +137,8 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
if not current_platform.can_update_inplace():
output = lora_output
# Restore original shape if it was flattened
# Reshape the flattened output back to its original shape,
# as some MM encoders cannot handle flattened inputs.
if original_shape is not None:
output = output.reshape(original_shape)

View File

@@ -2157,8 +2157,13 @@ class GPUModelRunner(
req_idx = self.input_batch.req_id_to_index[req_id]
lora_id = int(self.input_batch.request_lora_mapping[req_idx])
# Prefer pos_info.is_embed to count actual MM embedding tokens.
# pos_info.length may overcount (e.g., special tokens in Qwen-VL).
# Fall back to length if is_embed is None.
num_tokens = self.info.get_num_mm_encoder_tokens( # type: ignore[attr-defined]
pos_info.length
if pos_info.is_embed is None
else pos_info.is_embed.sum()
)
prompt_lora_mapping.append(lora_id)
token_lora_mapping.extend([lora_id] * num_tokens)