[Bugfix] Fix QKVParallelLinearWithShardedLora bias bug (#10844)

Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jee Jee Li 2024-12-03 12:10:29 +08:00 committed by GitHub
parent d746268e92
commit a4cf256159
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 1 addition and 9 deletions

View File

@@ -481,7 +481,6 @@ steps:
- label: LoRA TP Test (Distributed)
num_gpus: 4
soft_fail: true
source_file_dependencies:
- vllm/lora
- tests/lora

View File

@ -77,13 +77,6 @@ class ColumnParallelLinearWithShardedLoRA(ColumnParallelLinearWithLoRA):
add_input=True)
# now have column partitioned output
if self.bias_stacked is not None:
self.bias_stacked = self.bias_stacked.view(
-1, self.bias_stacked.shape[-1])
self.bias_stacked = self.bias_stacked[
self.punica_wrapper.token_lora_indices]
output += self.bias_stacked
output = output.view(*out_orig_shape)
return output
@@ -222,7 +215,7 @@ class QKVParallelLinearWithShardedLora(QKVParallelLinearWithLora):
self.punica_wrapper.add_expand(output,
buffer,
self.lora_b_stacked,
self.bias_all,
self.bias_stacked,
add_input=True)
# now have column partitioned output
output = output.view(*out_orig_shape)