diff --git a/vllm/lora/ops/torch_ops/lora_ops.py b/vllm/lora/ops/torch_ops/lora_ops.py index af79f98415cbc..ab65faceb2c10 100644 --- a/vllm/lora/ops/torch_ops/lora_ops.py +++ b/vllm/lora/ops/torch_ops/lora_ops.py @@ -36,10 +36,13 @@ def bgmv_expand(inputs: torch.Tensor, if outputs.shape[0] == 1 and output_tensor.shape[0] != 1: limit = 1 + # LoRA adapter and model may add different amounts of padding to output + common_len = min(outputs.shape[1], output_tensor.shape[1]) + if add_inputs: - output_tensor[:, :outputs.shape[1]] += outputs[:limit, :] + output_tensor[:, :common_len] += outputs[:limit, :common_len] else: - output_tensor[:, :outputs.shape[1]] = outputs[:limit, :] + output_tensor[:, :common_len] = outputs[:limit, :common_len] def sgmv_shrink(