From 5f5271f1ee2ef281ed8ecc66b126aa34964e797d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:01:38 +0100
Subject: [PATCH] Move `LoRAConfig` from `config/__init__.py` to
 `config/lora.py` (#24644)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/core/test_scheduler.py                 |   3 +-
 tests/lora/test_layers.py                    |   2 +-
 tests/lora/test_lora_allowed_token_ids.py    |   4 +-
 tests/lora/test_lora_manager.py              |   2 +-
 tests/lora/test_peft_helper.py               |   2 +-
 tests/lora/test_worker.py                    |   3 +-
 vllm/config/__init__.py                      | 111 +---------------
 vllm/config/lora.py                          | 132 +++++++++++++++++++
 vllm/core/scheduler.py                       |   3 +-
 vllm/engine/async_llm_engine.py              |   5 +-
 vllm/engine/llm_engine.py                    |   6 +-
 vllm/lora/layers/base.py                     |   2 +-
 vllm/lora/layers/base_linear.py              |   2 +-
 vllm/lora/layers/column_parallel_linear.py   |   2 +-
 vllm/lora/layers/logits_processor.py         |   2 +-
 vllm/lora/layers/replicated_linear.py        |   2 +-
 vllm/lora/layers/row_parallel_linear.py      |   2 +-
 vllm/lora/layers/vocal_parallel_embedding.py |   2 +-
 vllm/lora/models.py                          |   2 +-
 vllm/lora/peft_helper.py                     |   2 +-
 vllm/lora/utils.py                           |   2 +-
 vllm/lora/worker_manager.py                  |   2 +-
 vllm/model_executor/models/bart.py           |   3 +-
 vllm/transformers_utils/tokenizer_group.py   |   3 +-
 vllm/v1/worker/lora_model_runner_mixin.py    |   3 +-
 25 files changed, 167 insertions(+), 137 deletions(-)
 create mode 100644 vllm/config/lora.py

diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index e1a840bb15039..86e08328c43b0 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -10,7 +10,8 @@ import pytest  # noqa
 import torch
 from torch import Use  # noqa
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index b0038a28ed89d..6735b7cd9e436 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -11,7 +11,7 @@ import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py
index e77eae70445db..be6409000ae77 100644
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         VllmConfig)
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index c9ab32edc7f32..a5802c108c6be 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index df8696cf58e0f..ffffb5d8eab90 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -7,7 +7,7 @@ import shutil
 
 import pytest
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.peft_helper import PEFTHelper
 
 ERROR_CASES = [
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 02bfe0bf914a9..9c47abf8f4dce 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,9 +6,10 @@ import random
 import tempfile
 from unittest.mock import patch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
 from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 2ead8f5a3741d..8a75b28f38a54 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -36,6 +36,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel,
 from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@@ -2400,116 +2401,6 @@ class SpeculativeConfig:
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
 
 
-LoRADType = Literal["auto", "float16", "bfloat16"]
-
-
-@config
-@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
-class LoRAConfig:
-    """Configuration for LoRA."""
-
-    max_lora_rank: int = 16
-    """Max LoRA rank."""
-    max_loras: int = 1
-    """Max number of LoRAs in a single batch."""
-    fully_sharded_loras: bool = False
-    """By default, only half of the LoRA computation is sharded with tensor
-    parallelism. Enabling this will use the fully sharded layers. At high
-    sequence length, max rank or tensor parallel size, this is likely faster.
-    """
-    max_cpu_loras: Optional[int] = None
-    """Maximum number of LoRAs to store in CPU memory. Must be >= than
-    `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
-    """Data type for LoRA. If auto, will default to base model dtype."""
-    lora_extra_vocab_size: int = 256
-    """(Deprecated) Maximum size of extra vocabulary that can be present in a
-    LoRA adapter. Will be removed in v0.12.0."""
-    lora_vocab_padding_size: ClassVar[int] = current_platform\
-        .get_lora_vocab_padding_size()
-    default_mm_loras: Optional[dict[str, str]] = None
-    """Dictionary mapping specific modalities to LoRA model paths; this field
-    is only applicable to multimodal models and should be leveraged when a
-    model always expects a LoRA to be active when a given modality is present.
-    Note that currently, if a request provides multiple additional
-    modalities, each of which have their own LoRA, we do NOT apply
-    default_mm_loras because we currently only support one lora adapter
-    per prompt. When run in offline mode, the lora IDs for n modalities
-    will be automatically assigned to 1-n with the names of the modalities
-    in alphabetic order."""
-    bias_enabled: bool = False
-    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
-    removed in v0.12.0."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        factors: list[Any] = []
-        factors.append(self.max_lora_rank)
-        factors.append(self.max_loras)
-        factors.append(self.fully_sharded_loras)
-        factors.append(self.lora_dtype)
-        factors.append(self.lora_extra_vocab_size)
-        factors.append(self.lora_vocab_padding_size)
-        factors.append(self.bias_enabled)
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        # Deprecation warning for lora_extra_vocab_size
-        logger.warning(
-            "`lora_extra_vocab_size` is deprecated and will be removed "
-            "in v0.12.0. Additional vocabulary support for "
-            "LoRA adapters is being phased out.")
-
-        # Deprecation warning for enable_lora_bias
-        if self.bias_enabled:
-            logger.warning("`enable_lora_bias` is deprecated "
-                           "and will be removed in v0.12.0.")
-
-        # Setting the maximum rank to 512 should be able to satisfy the vast
-        # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
-        possible_lora_extra_vocab_size = (256, 512)
-        if self.max_lora_rank not in possible_max_ranks:
-            raise ValueError(
-                f"max_lora_rank ({self.max_lora_rank}) must be one of "
-                f"{possible_max_ranks}.")
-        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
-            raise ValueError(
-                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
-                f"must be one of {possible_lora_extra_vocab_size}.")
-        if self.max_loras < 1:
-            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
-        if self.max_cpu_loras is None:
-            self.max_cpu_loras = self.max_loras
-        elif self.max_cpu_loras < self.max_loras:
-            raise ValueError(
-                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_loras ({self.max_loras})")
-
-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError(
-                "V0 LoRA does not support CPU offload, please use V1.")
-
-    def verify_with_model_config(self, model_config: ModelConfig):
-        if self.lora_dtype in (None, "auto"):
-            self.lora_dtype = model_config.dtype
-        elif isinstance(self.lora_dtype, str):
-            self.lora_dtype = getattr(torch, self.lora_dtype)
-
-
 @config
 @dataclass
 class MultiModalConfig:
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
new file mode 100644
index 0000000000000..3fe28f5dad4fa
--- /dev/null
+++ b/vllm/config/lora.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+
+import torch
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+import vllm.envs as envs
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.config.cache import CacheConfig
+else:
+    ModelConfig = Any
+    CacheConfig = Any
+
+logger = init_logger(__name__)
+
+LoRADType = Literal["auto", "float16", "bfloat16"]
+
+
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
+    """Configuration for LoRA."""
+
+    max_lora_rank: int = 16
+    """Max LoRA rank."""
+    max_loras: int = 1
+    """Max number of LoRAs in a single batch."""
+    fully_sharded_loras: bool = False
+    """By default, only half of the LoRA computation is sharded with tensor
+    parallelism. Enabling this will use the fully sharded layers. At high
+    sequence length, max rank or tensor parallel size, this is likely faster.
+    """
+    max_cpu_loras: Optional[int] = None
+    """Maximum number of LoRAs to store in CPU memory. Must be >= than
+    `max_loras`."""
+    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    """Data type for LoRA. If auto, will default to base model dtype."""
+    lora_extra_vocab_size: int = 256
+    """(Deprecated) Maximum size of extra vocabulary that can be present in a
+    LoRA adapter. Will be removed in v0.12.0."""
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
+    default_mm_loras: Optional[dict[str, str]] = None
+    """Dictionary mapping specific modalities to LoRA model paths; this field
+    is only applicable to multimodal models and should be leveraged when a
+    model always expects a LoRA to be active when a given modality is present.
+    Note that currently, if a request provides multiple additional
+    modalities, each of which have their own LoRA, we do NOT apply
+    default_mm_loras because we currently only support one lora adapter
+    per prompt. When run in offline mode, the lora IDs for n modalities
+    will be automatically assigned to 1-n with the names of the modalities
+    in alphabetic order."""
+    bias_enabled: bool = False
+    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
+    removed in v0.12.0."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        factors: list[Any] = []
+        factors.append(self.max_lora_rank)
+        factors.append(self.max_loras)
+        factors.append(self.fully_sharded_loras)
+        factors.append(self.lora_dtype)
+        factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
+        factors.append(self.bias_enabled)
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        # Deprecation warning for lora_extra_vocab_size
+        logger.warning(
+            "`lora_extra_vocab_size` is deprecated and will be removed "
+            "in v0.12.0. Additional vocabulary support for "
+            "LoRA adapters is being phased out.")
+
+        # Deprecation warning for enable_lora_bias
+        if self.bias_enabled:
+            logger.warning("`enable_lora_bias` is deprecated "
+                           "and will be removed in v0.12.0.")
+
+        # Setting the maximum rank to 512 should be able to satisfy the vast
+        # majority of applications.
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
+        possible_lora_extra_vocab_size = (256, 512)
+        if self.max_lora_rank not in possible_max_ranks:
+            raise ValueError(
+                f"max_lora_rank ({self.max_lora_rank}) must be one of "
+                f"{possible_max_ranks}.")
+        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
+            raise ValueError(
+                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
+                f"must be one of {possible_lora_extra_vocab_size}.")
+        if self.max_loras < 1:
+            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+        if self.max_cpu_loras is None:
+            self.max_cpu_loras = self.max_loras
+        elif self.max_cpu_loras < self.max_loras:
+            raise ValueError(
+                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+                f"max_loras ({self.max_loras})")
+
+    def verify_with_cache_config(self, cache_config: CacheConfig):
+        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
+            raise ValueError(
+                "V0 LoRA does not support CPU offload, please use V1.")
+
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.lora_dtype in (None, "auto"):
+            self.lora_dtype = model_config.dtype
+        elif isinstance(self.lora_dtype, str):
+            self.lora_dtype = getattr(torch, self.lora_dtype)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index d7864293e9647..92ebad778ea4b 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6010a4647a0af..c53ece18964cb 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
 from weakref import ReferenceType
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 47f56e58130fa..3462142d9fb91 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -16,9 +16,9 @@ import torch
 from typing_extensions import TypeVar
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
-                         VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase, Stats
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
index 0e759d5d5719b..a80a033e39b40 100644
--- a/vllm/lora/layers/base.py
+++ b/vllm/lora/layers/base.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 
 if TYPE_CHECKING:
     from vllm.lora.punica_wrapper import PunicaWrapperBase
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 4e062971d9188..85a1f86ce6bf2 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -6,7 +6,7 @@ from typing import Optional, cast
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed.utils import divide
 # yapf: disable
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index d2f8e05554c84..658fd23165da0 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_gather)
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index db974147ccca7..a50dcfa748f2f 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py
index db922a02d40b0..3356297c1537a 100644
--- a/vllm/lora/layers/replicated_linear.py
+++ b/vllm/lora/layers/replicated_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
 
 from .base_linear import BaseLinearLayerWithLoRA
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index bf1d9ae374f48..18ef6fd1ddd78 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index 192e154fe56a6..4d6218d970977 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.platforms import current_platform
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 3072047a2606c..7712438054914 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
 from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
                                         get_adapter, list_adapters,
                                         remove_adapter, set_adapter_mapping)
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 8b8e5cb7d5fae..dc7249c386021 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -9,7 +9,7 @@ import os
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 2b05a2cf4d40c..10ba390bffd9e 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -11,7 +11,7 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 # being imported for _all_lora_classes below
 # yapf conflicts with isort for this block
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 248d2954f1ef4..3a807b1e161d2 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker,
                                         list_adapters_worker,
                                         set_active_adapters_worker)
 from vllm.adapter_commons.worker_manager import AbstractWorkerManager
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 32551d8102f32..fd4d820a01e9d 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -29,7 +29,8 @@ from transformers import BartConfig
 from transformers.utils import logging
 
 from vllm.attention import Attention, AttentionType
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py
index ae8220f9b9dc5..6b519cccd3cc6 100644
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -5,7 +5,8 @@ from typing import Optional
 
 from typing_extensions import assert_never
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
                                                get_lora_tokenizer,
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 4b5f27d27541b..f2ebd5e10210b 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -11,7 +11,8 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
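
Usage note (not part of the patch): a minimal sketch of the relocated config, assuming a vLLM tree that already contains this commit; the field values below are arbitrary examples. Because `vllm/config/__init__.py` now re-exports `LoRAConfig` from `vllm.config.lora`, both import paths resolve to the same class.

```python
# Illustrative sketch only -- assumes a vLLM checkout that includes this commit.
from vllm.config import LoRAConfig as ReexportedLoRAConfig  # still re-exported
from vllm.config.lora import LoRAConfig  # new canonical location

# The package __init__ re-exports the same class object.
assert LoRAConfig is ReexportedLoRAConfig

# Example values; __post_init__ restricts max_lora_rank to
# (8, 16, 32, 64, 128, 256, 320, 512) and requires max_loras >= 1.
lora_config = LoRAConfig(max_lora_rank=16, max_loras=2)
print(lora_config.max_cpu_loras)   # defaults to max_loras when unset
print(lora_config.compute_hash())  # md5 over the graph-affecting fields
```

Downstream code that imports `LoRAConfig` from `vllm.config` keeps working; only the canonical definition moves to `vllm/config/lora.py`.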