[V1] LoRA - Enable Serving Usecase (#12883)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com> Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2026-06-30 04:47:12 +08:00 · 2025-02-14 11:51:12 +05:30 · 2025-02-14 11:51:12 +05:30 · cbc40128eb
commit cbc40128eb
parent f0b2da72a8
7 changed files with 210 additions and 7 deletions
--- a/tests/lora/test_add_lora.py
+++ b/tests/lora/test_add_lora.py
@ -0,0 +1,165 @@
 # SPDX-License-Identifier: Apache-2.0
 import asyncio
 import time
 from pathlib import Path
 from typing import List
 import pytest
 from huggingface_hub import snapshot_download
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.inputs import TextPrompt
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.utils import merge_async_iterators
 MODEL_PATH = "meta-llama/Llama-2-7b-hf"
 LORA_MODULE_DOWNLOAD_PATH = None  # Populated by download_and_prepare_lora_module() #noqa
 LORA_RANK = 8
 DEFAULT_MAX_LORAS = 16 * 3
 def download_and_prepare_lora_module():
    """
    Download the test LoRA adapter and strip its tokenizer files.

    Request submission is expensive when the LoRA adapters have their own
    tokenizers. This is because, for each request with a new LoRA adapter ID,
    the front-end loads the tokenizer from disk.
    In this test, as we are comparing request processing times, we want to
    minimize any extra activity. To this effect, we download the LoRA
    adapter and remove all the tokenizer files, so the engine will default
    to the base model tokenizer.

    Side effect: sets the module-level LORA_MODULE_DOWNLOAD_PATH to the
    directory returned by snapshot_download().
    """
    global LORA_MODULE_DOWNLOAD_PATH
    LORA_MODULE_HF_PATH = "yard1/llama-2-7b-sql-lora-test"
    LORA_MODULE_DOWNLOAD_PATH = snapshot_download(repo_id=LORA_MODULE_HF_PATH)
    tokenizer_files = [
        'added_tokens.json', 'tokenizer_config.json', 'tokenizer.json',
        'tokenizer.model'
    ]
    for tokenizer_file in tokenizer_files:
        del_path = Path(LORA_MODULE_DOWNLOAD_PATH) / tokenizer_file
        try:
            del_path.unlink()
        except FileNotFoundError:
            # The goal is only "no tokenizer files present"; a file that is
            # already absent (e.g. removed by an earlier invocation or never
            # shipped with the adapter) must not fail the test setup.
            continue
@pytest.fixture(autouse=True)
 def v1(run_with_both_engines_lora):
    # Simple autouse wrapper to run both engines for each test
    # This can be promoted up to conftest.py to run for every
    # test in a package
    pass
 def get_lora_requests() -> List[LoRARequest]:
    """
    Build one LoRARequest per adapter ID in 1..DEFAULT_MAX_LORAS.

    Every request points at the same on-disk adapter
    (LORA_MODULE_DOWNLOAD_PATH); only the name and integer ID differ.
    """
    requests: List[LoRARequest] = []
    for adapter_id in range(1, DEFAULT_MAX_LORAS + 1):
        requests.append(
            LoRARequest(lora_name=f"{adapter_id}",
                        lora_int_id=adapter_id,
                        lora_path=LORA_MODULE_DOWNLOAD_PATH))
    return requests
 async def requests_processing_time(llm,
                                    lora_requests: List[LoRARequest]) -> float:
    """
    Submit one single-token request per LoRA request and time the batch.

    Returns the elapsed wall-clock seconds (time.perf_counter) from just
    before the first generate() call until every merged output stream has
    been fully consumed.
    """
    # Greedy, exactly one generated token per request.
    params = SamplingParams(n=1,
                            temperature=0.0,
                            top_p=1.0,
                            ignore_eos=True,
                            max_tokens=1)

    def _submit(req: LoRARequest):
        # One generate() call per adapter, keyed by the adapter's integer ID.
        adapter_id = req.lora_int_id
        return llm.generate(
            prompt=TextPrompt(prompt=f"hello {adapter_id}",
                              multi_modal_data=None),  # type: ignore
            sampling_params=params,
            lora_request=req,
            request_id=f"test{adapter_id}")

    t_begin = time.perf_counter()
    streams = [_submit(req) for req in lora_requests]
    # Drain all streams; the outputs themselves are not inspected.
    async for _idx, _output in merge_async_iterators(*streams):
        pass
    t_finish = time.perf_counter()
    return t_finish - t_begin
@pytest.mark.asyncio
 async def test_add_lora():
    """ 
    The add_lora function is used to pre-load some LoRA adapters into the
    engine in anticipation of future requests using these adapters. To test
    this functionality, we use the async engine to process some requests - We
    do it twice, once with add_lora() pre-loading and once without.
    We measure the request processing time in both cases and expect the time 
    to be lesser in the case with add_lora() calls.
    """
    download_and_prepare_lora_module()
    lora_requests: List[LoRARequest] = get_lora_requests()
    max_loras = len(set([lr.lora_int_id for lr in lora_requests]))
    # Create engine in eager-mode. Due to high max_loras, the CI can
    # OOM during cuda-graph capture.
    engine_args = AsyncEngineArgs(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=max_loras,
        max_lora_rank=LORA_RANK,
        max_model_len=128,
        gpu_memory_utilization=0.8,  #avoid OOM
        enforce_eager=True)
    # The run_with_both_engines_lora fixture sets up the `VLLM_USE_V1`
    # environment variable. reload vllm.enging.async_llm_engine as
    # vllm.engine.async_llm_engine.AsyncLLMEgnine changes depending on the
    # env var.
    import importlib
    import vllm.engine.async_llm_engine
    importlib.reload(vllm.engine.async_llm_engine)
    from vllm.entrypoints.openai.api_server import (
        build_async_engine_client_from_engine_args)
    # split lora_requests into 3 parts
    part_size = len(lora_requests) // 3
    dummy_run_requests = lora_requests[:part_size]
    warmup_run_requests = lora_requests[part_size:part_size * 2]
    cold_run_requests = lora_requests[part_size * 2:]
    async with build_async_engine_client_from_engine_args(engine_args) as llm:
        # Dummy run - So any 1-time functionality like triton kernel compilation
        # is complete here.
        await requests_processing_time(llm, dummy_run_requests)
        # Run with warmup
        for lr in warmup_run_requests:
            await llm.add_lora(lr)
        # Wait for the add_lora function to complete on the server side.
        await asyncio.sleep(30)
        time_with_add_lora = await requests_processing_time(
            llm, warmup_run_requests)
        # Run without any warmup
        time_cold_start = await requests_processing_time(
            llm, cold_run_requests)
    print(f"time hot-start {time_with_add_lora} vs "
          f"time cold-start {time_cold_start} ")
    assert time_with_add_lora < time_cold_start, (
        f"time_with_add_lora={time_with_add_lora}, "
        f"time_cold_start={time_cold_start}"
        "The engine request processing time with LoRA pre-loading "
        "must be less than the version that does on-demand LoRA loading.")
--- a/vllm/v1/engine/__init__.py
+++ b/vllm/v1/engine/__init__.py
@ -134,3 +134,4 @@ class EngineCoreRequestType(enum.Enum):
    ABORT = b'\x01'
    PROFILE = b'\x02'
    RESET_PREFIX_CACHE = b'\x03'
    ADD_LORA = b'\x04'
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@ -361,6 +361,10 @@ class AsyncLLM(EngineClient):
    async def reset_prefix_cache(self) -> None:
        """Await the engine core client's ``reset_prefix_cache_async()``."""
        core_client = self.engine_core
        await core_client.reset_prefix_cache_async()
    async def add_lora(self, lora_request: LoRARequest) -> None:
        """Forward ``lora_request`` to the engine core so the adapter is
        loaded ahead of future requests that use it."""
        core_client = self.engine_core
        await core_client.add_lora_async(lora_request)
    @property
    def is_running(self) -> bool:
        """Running-state flag; this implementation unconditionally
        reports ``True``."""
        running = True
        return running
@ -376,7 +380,3 @@ class AsyncLLM(EngineClient):
    @property
    def dead_error(self) -> BaseException:
        """Placeholder: hands back a fresh, argument-less ``Exception``."""
        # TODO: implement
        placeholder = Exception()
        return placeholder
    # NOTE(review): the preceding hunk header (-376,7 +380,3) suggests these
    # three lines are the pre-change stub that the commit deletes, shown here
    # without its "-" markers - confirm against the repository before treating
    # this as live code.
    async def add_lora(self, lora_request: LoRARequest) -> None:
        """Load a new LoRA adapter into the engine for future requests.

        As written, this stub performs no loading: it always raises
        NotImplementedError.
        """
        raise NotImplementedError("LoRA not yet supported in V1")
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@ -13,6 +13,7 @@ import zmq.asyncio
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.config import (
    maybe_register_config_serialize_by_value)
 from vllm.utils import get_exception_traceback, zmq_socket_ctx
@ -146,6 +147,9 @@ class EngineCore:
    def reset_prefix_cache(self):
        """Delegate the prefix-cache reset to the scheduler."""
        scheduler = self.scheduler
        scheduler.reset_prefix_cache()
    def add_lora(self, lora_request: LoRARequest) -> None:
        """Pass ``lora_request`` to the model executor's ``add_lora``; the
        executor's return value is discarded."""
        executor = self.model_executor
        executor.add_lora(lora_request)
 class EngineCoreProc(EngineCore):
    """ZMQ-wrapper for running EngineCore in background process."""
@ -262,12 +266,15 @@ class EngineCoreProc(EngineCore):
            self.reset_prefix_cache()
        elif request_type == EngineCoreRequestType.PROFILE:
            self.model_executor.profile(request)
        elif request_type == EngineCoreRequestType.ADD_LORA:
            self.model_executor.add_lora(request)
    def process_input_socket(self, input_path: str):
        """Input socket IO thread."""
        # Msgpack serialization decoding.
        add_request_decoder = MsgpackDecoder(EngineCoreRequest)
        add_lora_decoder = MsgpackDecoder(LoRARequest)
        generic_decoder = MsgpackDecoder()
        with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
@ -277,9 +284,14 @@ class EngineCoreProc(EngineCore):
                request_type = EngineCoreRequestType(bytes(type_frame.buffer))
                # Deserialize the request data.
-                decoder = add_request_decoder if (
+                decoder = None
-                    request_type
+                if request_type == EngineCoreRequestType.ADD:
-                    == EngineCoreRequestType.ADD) else generic_decoder
+                    decoder = add_request_decoder
                elif request_type == EngineCoreRequestType.ADD_LORA:
                    decoder = add_lora_decoder
                else:
                    decoder = generic_decoder
                request = decoder.decode(data_frame.buffer)
                # Push to input queue for core busy loop.
--- a/vllm/v1/engine/core_client.py
+++ b/vllm/v1/engine/core_client.py
@ -12,6 +12,7 @@ import zmq.asyncio
 from vllm.config import VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.utils import (get_open_zmq_ipc_path, kill_process_tree,
                        make_zmq_socket)
 from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest,
@ -77,6 +78,9 @@ class EngineCoreClient(ABC):
    def abort_requests(self, request_ids: List[str]) -> None:
        """Base-client stub; always raises NotImplementedError."""
        raise NotImplementedError()
    def add_lora(self, lora_request: LoRARequest) -> None:
        """Base-client stub for loading a LoRA adapter; always raises
        NotImplementedError."""
        raise NotImplementedError()
    async def get_output_async(self) -> EngineCoreOutputs:
        """Base-client stub; awaiting it always raises
        NotImplementedError."""
        raise NotImplementedError()
@ -92,6 +96,9 @@ class EngineCoreClient(ABC):
    async def abort_requests_async(self, request_ids: List[str]) -> None:
        """Base-client stub; awaiting it always raises
        NotImplementedError."""
        raise NotImplementedError()
    async def add_lora_async(self, lora_request: LoRARequest) -> None:
        """Base-client stub for asynchronously loading a LoRA adapter;
        awaiting it always raises NotImplementedError."""
        raise NotImplementedError()
 class InprocClient(EngineCoreClient):
    """
@ -125,6 +132,9 @@ class InprocClient(EngineCoreClient):
    def reset_prefix_cache(self) -> None:
        """Call ``reset_prefix_cache()`` directly on the held engine core."""
        core = self.engine_core
        core.reset_prefix_cache()
    def add_lora(self, lora_request: LoRARequest) -> None:
        """Call ``add_lora(lora_request)`` directly on the held engine
        core; nothing is returned to the caller."""
        core = self.engine_core
        core.add_lora(lora_request)
 class MPClient(EngineCoreClient):
    """
@ -242,6 +252,9 @@ class SyncMPClient(MPClient):
    def reset_prefix_cache(self) -> None:
        """Send a RESET_PREFIX_CACHE message with no payload through
        ``_send_input``."""
        request_type = EngineCoreRequestType.RESET_PREFIX_CACHE
        self._send_input(request_type, None)
    def add_lora(self, lora_request: LoRARequest) -> None:
        """Send ``lora_request`` through ``_send_input`` tagged as an
        ADD_LORA message."""
        request_type = EngineCoreRequestType.ADD_LORA
        self._send_input(request_type, lora_request)
 class AsyncMPClient(MPClient):
    """Asyncio-compatible client for multi-proc EngineCore."""
@ -295,3 +308,6 @@ class AsyncMPClient(MPClient):
    async def reset_prefix_cache_async(self) -> None:
        """Await ``_send_input`` with a RESET_PREFIX_CACHE message and no
        payload."""
        request_type = EngineCoreRequestType.RESET_PREFIX_CACHE
        await self._send_input(request_type, None)
    async def add_lora_async(self, lora_request: LoRARequest) -> None:
        """Await ``_send_input`` with ``lora_request`` tagged as an
        ADD_LORA message."""
        request_type = EngineCoreRequestType.ADD_LORA
        await self._send_input(request_type, lora_request)
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@ -15,6 +15,7 @@ from vllm.distributed import (ensure_model_parallel_initialized,
                              init_distributed_environment,
                              set_custom_all_reduce)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed
 from vllm.platforms import current_platform
 from vllm.utils import GiB_bytes
@ -234,6 +235,9 @@ class Worker(WorkerBase):
        else:
            self.profiler.stop()
    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Hand ``lora_request`` to the model runner and return whatever
        its ``add_lora`` returns."""
        outcome = self.model_runner.add_lora(lora_request)
        return outcome
    def check_health(self) -> None:
        """No-op health probe.

        The worker is considered healthy for as long as it is running, so
        there is nothing to check and nothing is raised.
        """
        return None
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@ -127,3 +127,8 @@ class LoRAModelRunnerMixin:
            # __exit__ code
            self.lora_manager.remove_all_adapters()
    def add_lora(self, lora_request: LoRARequest) -> bool:
        """Register ``lora_request`` with the LoRA manager.

        Returns:
            Whatever ``lora_manager.add_adapter(lora_request)`` returns.

        Raises:
            RuntimeError: if ``self.lora_manager`` is falsy (LoRA disabled).
        """
        manager = self.lora_manager
        if manager:
            return manager.add_adapter(lora_request)
        raise RuntimeError("LoRA is not enabled.")