[Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2026-07-21 03:57:09 +08:00 · 2024-12-03 12:10:29 +08:00 · 2024-12-03 12:10:29 +08:00 · a4cf256159
commit a4cf256159
parent d746268e92
2 changed files with 1 addition and 9 deletions
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -481,7 +481,6 @@ steps:
 - label: LoRA TP Test (Distributed)
  num_gpus: 4
-  soft_fail: true
  source_file_dependencies:
  - vllm/lora
  - tests/lora
--- a/vllm/lora/fully_sharded_layers.py
+++ b/vllm/lora/fully_sharded_layers.py
@@ -77,13 +77,6 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
                                       add_input=True)
        # now have column partitioned output
-        if self.bias_stacked is not None:
-            self.bias_stacked = self.bias_stacked.view(
-                -1, self.bias_stacked.shape[-1])
-            self.bias_stacked = self.bias_stacked[
-                self.punica_wrapper.token_lora_indices]
-            output += self.bias_stacked
        output = output.view(*out_orig_shape)
        return output
@@ -222,7 +215,7 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
        self.punica_wrapper.add_expand(output,
                                       buffer,
                                       self.lora_b_stacked,
-                                       self.bias_all,
+                                       self.bias_stacked,
                                       add_input=True)
        # now have column partitioned output
        output = output.view(*out_orig_shape)