[V1] LoRA - Enable more V1 tests (#14315)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
parent f5f7f00cd9
commit 3dbd2d813a
@@ -55,7 +55,6 @@ def v1(run_with_both_engines_lora):
     pass
 
 
-@pytest.mark.skip_v1
 @fork_new_process_for_each_test
 def test_chatglm3_lora(chatglm3_lora_files):
     llm = vllm.LLM(MODEL_PATH,
@@ -75,7 +74,6 @@ def test_chatglm3_lora(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4(chatglm3_lora_files):
@@ -97,7 +95,6 @@ def test_chatglm3_lora_tp4(chatglm3_lora_files):
         assert output2[i] == EXPECTED_LORA_OUTPUT[i]
 
 
-@pytest.mark.skip_v1
 @multi_gpu_test(num_gpus=4)
 @fork_new_process_for_each_test
 def test_chatglm3_lora_tp4_fully_sharded_loras(chatglm3_lora_files):
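The three hunks above drop `@pytest.mark.skip_v1` so the ChatGLM3 LoRA tests (single GPU, TP4, and TP4 fully sharded) now also run on the V1 engine. For readers unfamiliar with the marker: a custom skip mark like this is typically registered and enforced from a conftest.py. The sketch below is illustrative only, not vLLM's actual implementation; the VLLM_USE_V1 gating is an assumption carried over from the rest of this commit.

```python
# Hypothetical conftest.py sketch -- the real skip_v1 definition lives in
# vLLM's test infrastructure and may differ.
import os

import pytest


def pytest_configure(config):
    # Register the marker so pytest does not warn about unknown marks.
    config.addinivalue_line(
        "markers", "skip_v1: skip this test when running on the V1 engine")


def pytest_collection_modifyitems(config, items):
    # Assumption: VLLM_USE_V1=1 selects the V1 engine, as in this commit.
    if os.environ.get("VLLM_USE_V1") != "1":
        return
    skip_marker = pytest.mark.skip(reason="not yet supported on V1")
    for item in items:
        if "skip_v1" in item.keywords:
            item.add_marker(skip_marker)
```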
@@ -10,6 +10,14 @@ from vllm.platforms import current_platform
 
 MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int,
               prompts: list[str]) -> list[str]:
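This hunk, and the matching ones below, add an autouse `v1` fixture whose only job is to pull in `run_with_both_engines_lora`, so every test in the module executes under both engines. That fixture is defined elsewhere in vLLM's test suite and is not part of this diff; a minimal sketch of the idea, with the parametrization over VLLM_USE_V1 as an assumption, might look like:

```python
# Hypothetical conftest.py sketch of run_with_both_engines_lora; the
# actual vLLM definition may differ.
import pytest


@pytest.fixture(params=[False, True], ids=["v0", "v1"])
def run_with_both_engines_lora(request, monkeypatch):
    # Each test that pulls in this fixture runs twice: once per engine.
    use_v1 = request.param
    monkeypatch.setenv("VLLM_USE_V1", "1" if use_v1 else "0")
    yield use_v1
```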
@@ -12,6 +12,14 @@ from vllm.lora.request import LoRARequest
 from vllm.platforms import current_platform
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @dataclass
 class TestConfig:
     model_path: str
@@ -4,6 +4,7 @@ import shutil
 from os import path
 from tempfile import TemporaryDirectory
 
+import pytest
 import torch
 from huggingface_hub import snapshot_download
 from safetensors.torch import load_file, save_file
@@ -21,6 +22,14 @@ VLLM_PLACEHOLDER = "<|reserved_special_token_0|>"
 PROMPT = "Tell me about a Fool's mate move in 20 words. Provide the moves!"
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 def llama3_1_8b_chess_lora_path():
     return snapshot_download(
         repo_id="mkopecki/chess-lora-adapter-llama-3.1-8b")
@@ -3,18 +3,45 @@
 import os
 import random
 import tempfile
+from typing import Union
 from unittest.mock import patch
 
+import pytest
+
+import vllm.envs as envs
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VllmConfig)
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
+from vllm.v1.worker.gpu_worker import Worker as V1Worker
 from vllm.worker.worker import Worker
 
 
+@pytest.fixture(autouse=True)
+def v1(run_with_both_engines_lora):
+    # Simple autouse wrapper to run both engines for each test
+    # This can be promoted up to conftest.py to run for every
+    # test in a package
+    pass
+
+
 @patch.dict(os.environ, {"RANK": "0"})
 def test_worker_apply_lora(sql_lora_files):
+
+    def set_active_loras(worker: Union[Worker, V1Worker],
+                         lora_requests: list[LoRARequest]):
+        lora_mapping = LoRAMapping([], [])
+        if isinstance(worker, Worker):
+            # v0 case
+            worker.model_runner.set_active_loras(lora_requests, lora_mapping)
+        else:
+            # v1 case
+            worker.model_runner.lora_manager.set_active_adapters(
+                lora_requests, lora_mapping)
+
+    worker_cls = V1Worker if envs.VLLM_USE_V1 else Worker
+
     vllm_config = VllmConfig(
         model_config=ModelConfig(
             "meta-llama/Llama-2-7b-hf",
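The new `set_active_loras` helper exists because the two worker generations expose different activation entry points: the v0 model runner has `set_active_loras`, while the v1 model runner goes through `lora_manager.set_active_adapters`. The self-contained toy below mirrors that dispatch pattern; the class and method names are stand-ins, not vLLM APIs.

```python
# Toy sketch of the "pick worker class by engine flag, dispatch on the
# concrete type" pattern used in the test above. Stand-in names only.
import os
from typing import Union


class LegacyWorker:  # stands in for vllm.worker.worker.Worker (v0)
    def set_active_loras(self, ids: list[int]) -> None:
        print("v0 entry point:", ids)


class EngineV1Worker:  # stands in for vllm.v1.worker.gpu_worker.Worker
    def set_active_adapters(self, ids: list[int]) -> None:
        print("v1 entry point:", ids)


def activate(worker: Union[LegacyWorker, EngineV1Worker],
             ids: list[int]) -> None:
    # Branch on the concrete type, as the test helper does.
    if isinstance(worker, LegacyWorker):
        worker.set_active_loras(ids)
    else:
        worker.set_active_adapters(ids)


worker = (EngineV1Worker()
          if os.environ.get("VLLM_USE_V1") == "1" else LegacyWorker())
activate(worker, [1, 2, 3])
```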
@@ -40,16 +67,17 @@ def test_worker_apply_lora(sql_lora_files):
         lora_config=LoRAConfig(max_lora_rank=8, max_cpu_loras=32,
                                max_loras=32),
     )
-    worker = Worker(
+    worker = worker_cls(
         vllm_config=vllm_config,
         local_rank=0,
         rank=0,
         distributed_init_method=f"file://{tempfile.mkstemp()[1]}",
     )
 
     worker.init_device()
     worker.load_model()
 
-    worker.model_runner.set_active_loras([], LoRAMapping([], []))
+    set_active_loras(worker, [])
     assert worker.list_loras() == set()
 
     n_loras = 32
@@ -57,7 +85,7 @@ def test_worker_apply_lora(sql_lora_files):
         LoRARequest(str(i + 1), i + 1, sql_lora_files) for i in range(n_loras)
     ]
 
-    worker.model_runner.set_active_loras(lora_requests, LoRAMapping([], []))
+    set_active_loras(worker, lora_requests)
     assert worker.list_loras() == {
         lora_request.lora_int_id
         for lora_request in lora_requests
@@ -69,8 +97,7 @@ def test_worker_apply_lora(sql_lora_files):
                                             k=random.randint(1, n_loras))
     random.shuffle(iter_lora_requests)
     iter_lora_requests = iter_lora_requests[:-random.randint(0, n_loras)]
-    worker.model_runner.set_active_loras(iter_lora_requests,
-                                         LoRAMapping([], []))
+    set_active_loras(worker, lora_requests)
     assert worker.list_loras().issuperset(
         {lora_request.lora_int_id
          for lora_request in iter_lora_requests})
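One detail worth noting in the final hunk: the assertion uses `issuperset` rather than equality, plausibly because the worker (configured with max_loras=32 and max_cpu_loras=32) can keep adapters from earlier iterations resident, so the freshly sampled `iter_lora_requests` is only a lower bound on what `list_loras()` returns. A toy model of that retention behavior, with hypothetical names and simplified eviction:

```python
# Toy sketch only: models "previously activated adapters may remain
# resident", which is why the test asserts issuperset, not equality.
resident: set[int] = set()
MAX_LORAS = 32  # mirrors LoRAConfig(max_loras=32) in this test


def activate(ids: set[int]) -> None:
    resident.update(ids)
    while len(resident) > MAX_LORAS:
        resident.pop()  # hypothetical eviction; real policy differs


activate({1, 2, 3})  # an earlier iteration activates three adapters
activate({2})        # a later iteration activates a subset
assert resident.issuperset({2})  # equality with {2} would be too strict
```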