[Bugfix] Fix fully sharded LoRAs with Mixtral (#11390)
Signed-off-by: Jason Greene <jason.greene@redhat.com>
This commit is contained in:
parent 72d9c316d3
commit f1d1bf6288
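For reference, the user-facing switch this commit fixes is the fully_sharded_loras engine argument. A minimal offline-inference sketch of enabling it for a Mixtral LoRA follows; the model id, adapter path, and prompt are placeholders for illustration and are not taken from this commit:

import vllm
from vllm.lora.request import LoRARequest

# Placeholder model id and adapter path, used purely for illustration.
llm = vllm.LLM(
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    enable_lora=True,
    fully_sharded_loras=True,   # the setting this commit fixes for Mixtral
    tensor_parallel_size=4,
    max_loras=4,
    max_lora_rank=32,
)

outputs = llm.generate(
    ["Write a haiku about mixtures of experts."],
    lora_request=LoRARequest("my-mixtral-adapter", 1, "/path/to/lora/adapter"),
)
print(outputs[0].outputs[0].text)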
@@ -62,8 +62,9 @@ def test_mixtral_lora(mixtral_lora_files, tp_size):
 
 
 @pytest.mark.parametrize("tp_size", [4])
+@pytest.mark.parametrize("fully_shard", [True, False])
 def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
-                                         tp_size):
+                                         tp_size, fully_shard):
     """This LoRA model has all supported Mixtral target modules"""
 
     if torch.cuda.device_count() < tp_size:
@@ -82,6 +83,7 @@ def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
         max_loras=4,
         distributed_executor_backend="ray",
         tensor_parallel_size=tp_size,
+        fully_sharded_loras=fully_shard,
         max_lora_rank=32,
     )
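The two test hunks above fit together roughly as sketched below. The full test body is not shown in this diff, so MODEL_PATH, the skip message, and enable_lora=True are assumptions added only to make the sketch self-contained; mixtral_lora_files_all_target_modules is the pytest fixture referenced in the real test signature.

import pytest
import torch
import vllm

MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # assumed; not shown in the diff


@pytest.mark.parametrize("tp_size", [4])
@pytest.mark.parametrize("fully_shard", [True, False])
def test_mixtral_lora_all_target_modules(mixtral_lora_files_all_target_modules,
                                         tp_size, fully_shard):
    """This LoRA model has all supported Mixtral target modules"""

    if torch.cuda.device_count() < tp_size:
        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")

    llm = vllm.LLM(
        MODEL_PATH,
        enable_lora=True,                    # assumed; required for any LoRA run
        max_loras=4,
        distributed_executor_backend="ray",
        tensor_parallel_size=tp_size,
        fully_sharded_loras=fully_shard,     # the new knob under test
        max_lora_rank=32,
    )
    # The rest of the test (not shown in this diff) generates with the
    # adapter in mixtral_lora_files_all_target_modules and checks the output.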
@@ -425,8 +425,9 @@ class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
                        if self.base_layer.skip_bias_add else None)
         return output, output_bias
 
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
     @classmethod
-    @_not_fully_sharded_can_replace
     def can_replace_layer(
         cls,
         source_layer: nn.Module,
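As reconstructed above, the fix drops _not_fully_sharded_can_replace from ReplicatedLinearWithLoRA.can_replace_layer. The decorator's job, roughly sketched below (a simplification, not the exact vLLM source), is to AND the wrapped predicate with "fully sharded LoRAs are disabled", so that the non-sharded LoRA layer classes stand aside when their fully sharded counterparts should be used instead.

import functools


def _not_fully_sharded_can_replace(can_replace):
    # Simplified sketch: in addition to whatever the wrapped predicate
    # already checks, require that fully sharded LoRAs are *disabled*.
    @functools.wraps(can_replace)
    def dec(*args, **kwargs):
        not_fully_sharded = not kwargs["lora_config"].fully_sharded_loras
        return can_replace(*args, **kwargs) and not_fully_sharded

    return dec

ReplicatedLinear has no fully sharded counterpart: it is already copied whole onto every GPU. With the decorator in place, the layer was therefore never replaced when fully_sharded_loras=True, which appears to be what broke Mixtral's LoRA path; removing it lets ReplicatedLinearWithLoRA take over in both modes.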