Move LoRAConfig from config/__init__.py to config/lora.py (#24644)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor 2025-09-11 12:01:38 +01:00 committed by GitHub
parent d6249d0699
commit 5f5271f1ee
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
25 changed files with 167 additions and 137 deletions

View File

@ -10,7 +10,8 @@ import pytest # noqa
import torch import torch
from torch import Use # noqa from torch import Use # noqa
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.core.interfaces import AllocStatus from vllm.core.interfaces import AllocStatus
from vllm.core.scheduler import Scheduler, SchedulingBudget from vllm.core.scheduler import Scheduler, SchedulingBudget
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest

View File

@ -11,7 +11,7 @@ import pytest
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
# yapf conflicts with isort for this block # yapf conflicts with isort for this block
# yapf: disable # yapf: disable
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA, from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,

View File

@ -3,8 +3,8 @@
import pytest import pytest
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
VllmConfig) from vllm.config.lora import LoRAConfig
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs

View File

@ -8,7 +8,7 @@ import torch
from safetensors.torch import load_file from safetensors.torch import load_file
from torch import nn from torch import nn
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.lora.layers import (ColumnParallelLinearWithLoRA, from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
MergedColumnParallelLinearWithLoRA, MergedColumnParallelLinearWithLoRA,
RowParallelLinearWithLoRA) RowParallelLinearWithLoRA)

View File

@ -7,7 +7,7 @@ import shutil
import pytest import pytest
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.lora.peft_helper import PEFTHelper from vllm.lora.peft_helper import PEFTHelper
ERROR_CASES = [ ERROR_CASES = [

View File

@ -6,9 +6,10 @@ import random
import tempfile import tempfile
from unittest.mock import patch from unittest.mock import patch
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig, from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
ParallelConfig, SchedulerConfig, VllmConfig) ParallelConfig, SchedulerConfig, VllmConfig)
from vllm.config.load import LoadConfig from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.models import LoRAMapping from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.v1.worker.gpu_worker import Worker from vllm.v1.worker.gpu_worker import Worker

View File

@ -36,6 +36,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel,
from vllm.config.kv_events import KVEventsConfig from vllm.config.kv_events import KVEventsConfig
from vllm.config.kv_transfer import KVTransferConfig from vllm.config.kv_transfer import KVTransferConfig
from vllm.config.load import LoadConfig from vllm.config.load import LoadConfig
from vllm.config.lora import LoRAConfig
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig, from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
ParallelConfig) ParallelConfig)
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@ -2400,116 +2401,6 @@ class SpeculativeConfig:
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})" return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
LoRADType = Literal["auto", "float16", "bfloat16"]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: int = 16
"""Max LoRA rank."""
max_loras: int = 1
"""Max number of LoRAs in a single batch."""
fully_sharded_loras: bool = False
"""By default, only half of the LoRA computation is sharded with tensor
parallelism. Enabling this will use the fully sharded layers. At high
sequence length, max rank or tensor parallel size, this is likely faster.
"""
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
"""Data type for LoRA. If auto, will default to base model dtype."""
lora_extra_vocab_size: int = 256
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
LoRA adapter. Will be removed in v0.12.0."""
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
default_mm_loras: Optional[dict[str, str]] = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
model always expects a LoRA to be active when a given modality is present.
Note that currently, if a request provides multiple additional
modalities, each of which have their own LoRA, we do NOT apply
default_mm_loras because we currently only support one lora adapter
per prompt. When run in offline mode, the lora IDs for n modalities
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
bias_enabled: bool = False
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
removed in v0.12.0."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.lora_vocab_padding_size)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
# Deprecation warning for lora_extra_vocab_size
logger.warning(
"`lora_extra_vocab_size` is deprecated and will be removed "
"in v0.12.0. Additional vocabulary support for "
"LoRA adapters is being phased out.")
# Deprecation warning for enable_lora_bias
if self.bias_enabled:
logger.warning("`enable_lora_bias` is deprecated "
"and will be removed in v0.12.0.")
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
f"max_lora_rank ({self.max_lora_rank}) must be one of "
f"{possible_max_ranks}.")
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
raise ValueError(
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
f"must be one of {possible_lora_extra_vocab_size}.")
if self.max_loras < 1:
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
raise ValueError(
"V0 LoRA does not support CPU offload, please use V1.")
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)
@config @config
@dataclass @dataclass
class MultiModalConfig: class MultiModalConfig:

132
vllm/config/lora.py Normal file
View File

@ -0,0 +1,132 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
import torch
from pydantic import ConfigDict
from pydantic.dataclasses import dataclass
import vllm.envs as envs
from vllm.config.utils import config
from vllm.logger import init_logger
from vllm.platforms import current_platform
if TYPE_CHECKING:
from vllm.config import ModelConfig
from vllm.config.cache import CacheConfig
else:
ModelConfig = Any
CacheConfig = Any
logger = init_logger(__name__)
LoRADType = Literal["auto", "float16", "bfloat16"]
@config
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
class LoRAConfig:
"""Configuration for LoRA."""
max_lora_rank: int = 16
"""Max LoRA rank."""
max_loras: int = 1
"""Max number of LoRAs in a single batch."""
fully_sharded_loras: bool = False
"""By default, only half of the LoRA computation is sharded with tensor
parallelism. Enabling this will use the fully sharded layers. At high
sequence length, max rank or tensor parallel size, this is likely faster.
"""
max_cpu_loras: Optional[int] = None
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
`max_loras`."""
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
"""Data type for LoRA. If auto, will default to base model dtype."""
lora_extra_vocab_size: int = 256
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
LoRA adapter. Will be removed in v0.12.0."""
lora_vocab_padding_size: ClassVar[int] = current_platform\
.get_lora_vocab_padding_size()
default_mm_loras: Optional[dict[str, str]] = None
"""Dictionary mapping specific modalities to LoRA model paths; this field
is only applicable to multimodal models and should be leveraged when a
model always expects a LoRA to be active when a given modality is present.
Note that currently, if a request provides multiple additional
modalities, each of which have their own LoRA, we do NOT apply
default_mm_loras because we currently only support one lora adapter
per prompt. When run in offline mode, the lora IDs for n modalities
will be automatically assigned to 1-n with the names of the modalities
in alphabetic order."""
bias_enabled: bool = False
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
removed in v0.12.0."""
def compute_hash(self) -> str:
"""
WARNING: Whenever a new field is added to this config,
ensure that it is included in the factors list if
it affects the computation graph.
Provide a hash that uniquely identifies all the configs
that affect the structure of the computation
graph from input ids/embeddings to the final hidden states,
excluding anything before input ids/embeddings and after
the final hidden states.
"""
factors: list[Any] = []
factors.append(self.max_lora_rank)
factors.append(self.max_loras)
factors.append(self.fully_sharded_loras)
factors.append(self.lora_dtype)
factors.append(self.lora_extra_vocab_size)
factors.append(self.lora_vocab_padding_size)
factors.append(self.bias_enabled)
hash_str = hashlib.md5(str(factors).encode(),
usedforsecurity=False).hexdigest()
return hash_str
def __post_init__(self):
# Deprecation warning for lora_extra_vocab_size
logger.warning(
"`lora_extra_vocab_size` is deprecated and will be removed "
"in v0.12.0. Additional vocabulary support for "
"LoRA adapters is being phased out.")
# Deprecation warning for enable_lora_bias
if self.bias_enabled:
logger.warning("`enable_lora_bias` is deprecated "
"and will be removed in v0.12.0.")
# Setting the maximum rank to 512 should be able to satisfy the vast
# majority of applications.
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
possible_lora_extra_vocab_size = (256, 512)
if self.max_lora_rank not in possible_max_ranks:
raise ValueError(
f"max_lora_rank ({self.max_lora_rank}) must be one of "
f"{possible_max_ranks}.")
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
raise ValueError(
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
f"must be one of {possible_lora_extra_vocab_size}.")
if self.max_loras < 1:
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
if self.max_cpu_loras is None:
self.max_cpu_loras = self.max_loras
elif self.max_cpu_loras < self.max_loras:
raise ValueError(
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
f"max_loras ({self.max_loras})")
def verify_with_cache_config(self, cache_config: CacheConfig):
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
raise ValueError(
"V0 LoRA does not support CPU offload, please use V1.")
def verify_with_model_config(self, model_config: ModelConfig):
if self.lora_dtype in (None, "auto"):
self.lora_dtype = model_config.dtype
elif isinstance(self.lora_dtype, str):
self.lora_dtype = getattr(torch, self.lora_dtype)

View File

@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
from typing import Sequence as GenericSequence from typing import Sequence as GenericSequence
from typing import Set, Tuple, Union from typing import Set, Tuple, Union
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig from vllm.config import CacheConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.core.interfaces import AllocStatus, BlockSpaceManager from vllm.core.interfaces import AllocStatus, BlockSpaceManager
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest

View File

@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
from weakref import ReferenceType from weakref import ReferenceType
import vllm.envs as envs import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
ParallelConfig, SchedulerConfig, VllmConfig) SchedulerConfig, VllmConfig)
from vllm.config.lora import LoRAConfig
from vllm.core.scheduler import SchedulerOutputs from vllm.core.scheduler import SchedulerOutputs
from vllm.engine.arg_utils import AsyncEngineArgs from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_timeout import asyncio_timeout from vllm.engine.async_timeout import asyncio_timeout

View File

@ -16,9 +16,9 @@ import torch
from typing_extensions import TypeVar from typing_extensions import TypeVar
import vllm.envs as envs import vllm.envs as envs
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
ObservabilityConfig, ParallelConfig, SchedulerConfig, ParallelConfig, SchedulerConfig, VllmConfig)
VllmConfig) from vllm.config.lora import LoRAConfig
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics_types import StatLoggerBase, Stats from vllm.engine.metrics_types import StatLoggerBase, Stats

View File

@ -7,7 +7,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
if TYPE_CHECKING: if TYPE_CHECKING:
from vllm.lora.punica_wrapper import PunicaWrapperBase from vllm.lora.punica_wrapper import PunicaWrapperBase

View File

@ -6,7 +6,7 @@ from typing import Optional, cast
import torch import torch
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.distributed.utils import divide from vllm.distributed.utils import divide
# yapf: disable # yapf: disable
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,

View File

@ -7,7 +7,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.distributed import (get_tensor_model_parallel_rank, from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
tensor_model_parallel_all_gather) tensor_model_parallel_all_gather)

View File

@ -8,7 +8,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.distributed import (get_tensor_model_parallel_rank, from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size) get_tensor_model_parallel_world_size)
from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.logits_processor import LogitsProcessor

View File

@ -7,7 +7,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.linear import ReplicatedLinear
from .base_linear import BaseLinearLayerWithLoRA from .base_linear import BaseLinearLayerWithLoRA

View File

@ -7,7 +7,7 @@ import torch
import torch.nn as nn import torch.nn as nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.distributed import (get_tensor_model_parallel_rank, from vllm.distributed import (get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size, get_tensor_model_parallel_world_size,
split_tensor_along_last_dim, split_tensor_along_last_dim,

View File

@ -8,7 +8,7 @@ import torch.nn as nn
import torch.nn.functional as F import torch.nn.functional as F
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.model_executor.layers.vocab_parallel_embedding import ( from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding) VocabParallelEmbedding)
from vllm.platforms import current_platform from vllm.platforms import current_platform

View File

@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter, from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
get_adapter, list_adapters, get_adapter, list_adapters,
remove_adapter, set_adapter_mapping) remove_adapter, set_adapter_mapping)
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights

View File

@ -9,7 +9,7 @@ import os
from dataclasses import MISSING, dataclass, field, fields from dataclasses import MISSING, dataclass, field, fields
from typing import Literal, Optional, Union from typing import Literal, Optional, Union
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

View File

@ -11,7 +11,7 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
from torch import nn from torch import nn
from transformers import PretrainedConfig from transformers import PretrainedConfig
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
# being imported for _all_lora_classes below # being imported for _all_lora_classes below
# yapf conflicts with isort for this block # yapf conflicts with isort for this block

View File

@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker,
list_adapters_worker, list_adapters_worker,
set_active_adapters_worker) set_active_adapters_worker)
from vllm.adapter_commons.worker_manager import AbstractWorkerManager from vllm.adapter_commons.worker_manager import AbstractWorkerManager
from vllm.config import LoRAConfig from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.models import (LoRAModel, LoRAModelManager, from vllm.lora.models import (LoRAModel, LoRAModelManager,
LRUCacheLoRAModelManager, create_lora_manager) LRUCacheLoRAModelManager, create_lora_manager)

View File

@ -29,7 +29,8 @@ from transformers import BartConfig
from transformers.utils import logging from transformers.utils import logging
from vllm.attention import Attention, AttentionType from vllm.attention import Attention, AttentionType
from vllm.config import CacheConfig, LoRAConfig, VllmConfig from vllm.config import CacheConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.distributed import get_tensor_model_parallel_world_size from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.activation import get_act_fn
from vllm.model_executor.layers.linear import (ColumnParallelLinear, from vllm.model_executor.layers.linear import (ColumnParallelLinear,

View File

@ -5,7 +5,8 @@ from typing import Optional
from typing_extensions import assert_never from typing_extensions import assert_never
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig from vllm.config import ModelConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens, from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
get_lora_tokenizer, get_lora_tokenizer,

View File

@ -11,7 +11,8 @@ import numpy as np
import torch import torch
import torch.nn as nn import torch.nn as nn
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig from vllm.config import ModelConfig, SchedulerConfig
from vllm.config.lora import LoRAConfig
from vllm.logger import init_logger from vllm.logger import init_logger
from vllm.lora.layers import LoRAMapping from vllm.lora.layers import LoRAMapping
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest