From 5f5271f1ee2ef281ed8ecc66b126aa34964e797d Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Thu, 11 Sep 2025 12:01:38 +0100
Subject: [PATCH] Move `LoRAConfig` from `config/__init__.py` to
 `config/lora.py` (#24644)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 tests/core/test_scheduler.py                 |   3 +-
 tests/lora/test_layers.py                    |   2 +-
 tests/lora/test_lora_allowed_token_ids.py    |   4 +-
 tests/lora/test_lora_manager.py              |   2 +-
 tests/lora/test_peft_helper.py               |   2 +-
 tests/lora/test_worker.py                    |   3 +-
 vllm/config/__init__.py                      | 111 +---------------
 vllm/config/lora.py                          | 132 +++++++++++++++++++
 vllm/core/scheduler.py                       |   3 +-
 vllm/engine/async_llm_engine.py              |   5 +-
 vllm/engine/llm_engine.py                    |   6 +-
 vllm/lora/layers/base.py                     |   2 +-
 vllm/lora/layers/base_linear.py              |   2 +-
 vllm/lora/layers/column_parallel_linear.py   |   2 +-
 vllm/lora/layers/logits_processor.py         |   2 +-
 vllm/lora/layers/replicated_linear.py        |   2 +-
 vllm/lora/layers/row_parallel_linear.py      |   2 +-
 vllm/lora/layers/vocal_parallel_embedding.py |   2 +-
 vllm/lora/models.py                          |   2 +-
 vllm/lora/peft_helper.py                     |   2 +-
 vllm/lora/utils.py                           |   2 +-
 vllm/lora/worker_manager.py                  |   2 +-
 vllm/model_executor/models/bart.py           |   3 +-
 vllm/transformers_utils/tokenizer_group.py   |   3 +-
 vllm/v1/worker/lora_model_runner_mixin.py    |   3 +-
 25 files changed, 167 insertions(+), 137 deletions(-)
 create mode 100644 vllm/config/lora.py

diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index e1a840bb15039..86e08328c43b0 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -10,7 +10,8 @@ import pytest  # noqa
 import torch
 from torch import Use  # noqa
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus
 from vllm.core.scheduler import Scheduler, SchedulingBudget
 from vllm.lora.request import LoRARequest
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index b0038a28ed89d..6735b7cd9e436 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -11,7 +11,7 @@ import pytest
 import torch
 import torch.nn.functional as F
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
diff --git a/tests/lora/test_lora_allowed_token_ids.py b/tests/lora/test_lora_allowed_token_ids.py
index e77eae70445db..be6409000ae77 100644
--- a/tests/lora/test_lora_allowed_token_ids.py
+++ b/tests/lora/test_lora_allowed_token_ids.py
@@ -3,8 +3,8 @@
 
 import pytest
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
-                         VllmConfig)
+from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py
index c9ab32edc7f32..a5802c108c6be 100644
--- a/tests/lora/test_lora_manager.py
+++ b/tests/lora/test_lora_manager.py
@@ -8,7 +8,7 @@ import torch
 from safetensors.torch import load_file
 from torch import nn
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
                               MergedColumnParallelLinearWithLoRA,
                               RowParallelLinearWithLoRA)
diff --git a/tests/lora/test_peft_helper.py b/tests/lora/test_peft_helper.py
index df8696cf58e0f..ffffb5d8eab90 100644
--- a/tests/lora/test_peft_helper.py
+++ b/tests/lora/test_peft_helper.py
@@ -7,7 +7,7 @@ import shutil
 
 import pytest
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.peft_helper import PEFTHelper
 
 ERROR_CASES = [
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 02bfe0bf914a9..9c47abf8f4dce 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -6,9 +6,10 @@ import random
 import tempfile
 from unittest.mock import patch
 
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                          ParallelConfig, SchedulerConfig, VllmConfig)
 from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.models import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.v1.worker.gpu_worker import Worker
diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py
index 2ead8f5a3741d..8a75b28f38a54 100644
--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -36,6 +36,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel,
 from vllm.config.kv_events import KVEventsConfig
 from vllm.config.kv_transfer import KVTransferConfig
 from vllm.config.load import LoadConfig
+from vllm.config.lora import LoRAConfig
 from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
                                   ParallelConfig)
 from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
@@ -2400,116 +2401,6 @@ class SpeculativeConfig:
         return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
 
 
-LoRADType = Literal["auto", "float16", "bfloat16"]
-
-
-@config
-@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
-class LoRAConfig:
-    """Configuration for LoRA."""
-
-    max_lora_rank: int = 16
-    """Max LoRA rank."""
-    max_loras: int = 1
-    """Max number of LoRAs in a single batch."""
-    fully_sharded_loras: bool = False
-    """By default, only half of the LoRA computation is sharded with tensor
-    parallelism. Enabling this will use the fully sharded layers. At high
-    sequence length, max rank or tensor parallel size, this is likely faster.
-    """
-    max_cpu_loras: Optional[int] = None
-    """Maximum number of LoRAs to store in CPU memory. Must be >= than
-    `max_loras`."""
-    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
-    """Data type for LoRA. If auto, will default to base model dtype."""
-    lora_extra_vocab_size: int = 256
-    """(Deprecated) Maximum size of extra vocabulary that can be present in a
-    LoRA adapter. Will be removed in v0.12.0."""
-    lora_vocab_padding_size: ClassVar[int] = current_platform\
-        .get_lora_vocab_padding_size()
-    default_mm_loras: Optional[dict[str, str]] = None
-    """Dictionary mapping specific modalities to LoRA model paths; this field
-    is only applicable to multimodal models and should be leveraged when a
-    model always expects a LoRA to be active when a given modality is present.
-    Note that currently, if a request provides multiple additional
-    modalities, each of which have their own LoRA, we do NOT apply
-    default_mm_loras because we currently only support one lora adapter
-    per prompt. When run in offline mode, the lora IDs for n modalities
-    will be automatically assigned to 1-n with the names of the modalities
-    in alphabetic order."""
-    bias_enabled: bool = False
-    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
-    removed in v0.12.0."""
-
-    def compute_hash(self) -> str:
-        """
-        WARNING: Whenever a new field is added to this config,
-        ensure that it is included in the factors list if
-        it affects the computation graph.
-
-        Provide a hash that uniquely identifies all the configs
-        that affect the structure of the computation
-        graph from input ids/embeddings to the final hidden states,
-        excluding anything before input ids/embeddings and after
-        the final hidden states.
-        """
-        factors: list[Any] = []
-        factors.append(self.max_lora_rank)
-        factors.append(self.max_loras)
-        factors.append(self.fully_sharded_loras)
-        factors.append(self.lora_dtype)
-        factors.append(self.lora_extra_vocab_size)
-        factors.append(self.lora_vocab_padding_size)
-        factors.append(self.bias_enabled)
-        hash_str = hashlib.md5(str(factors).encode(),
-                               usedforsecurity=False).hexdigest()
-        return hash_str
-
-    def __post_init__(self):
-        # Deprecation warning for lora_extra_vocab_size
-        logger.warning(
-            "`lora_extra_vocab_size` is deprecated and will be removed "
-            "in v0.12.0. Additional vocabulary support for "
-            "LoRA adapters is being phased out.")
-
-        # Deprecation warning for enable_lora_bias
-        if self.bias_enabled:
-            logger.warning("`enable_lora_bias` is deprecated "
-                           "and will be removed in v0.12.0.")
-
-        # Setting the maximum rank to 512 should be able to satisfy the vast
-        # majority of applications.
-        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
-        possible_lora_extra_vocab_size = (256, 512)
-        if self.max_lora_rank not in possible_max_ranks:
-            raise ValueError(
-                f"max_lora_rank ({self.max_lora_rank}) must be one of "
-                f"{possible_max_ranks}.")
-        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
-            raise ValueError(
-                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
-                f"must be one of {possible_lora_extra_vocab_size}.")
-        if self.max_loras < 1:
-            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
-        if self.max_cpu_loras is None:
-            self.max_cpu_loras = self.max_loras
-        elif self.max_cpu_loras < self.max_loras:
-            raise ValueError(
-                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
-                f"max_loras ({self.max_loras})")
-
-    def verify_with_cache_config(self, cache_config: CacheConfig):
-        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
-            raise ValueError(
-                "V0 LoRA does not support CPU offload, please use V1.")
-
-    def verify_with_model_config(self, model_config: ModelConfig):
-        if self.lora_dtype in (None, "auto"):
-            self.lora_dtype = model_config.dtype
-        elif isinstance(self.lora_dtype, str):
-            self.lora_dtype = getattr(torch, self.lora_dtype)
-
-
 @config
 @dataclass
 class MultiModalConfig:
diff --git a/vllm/config/lora.py b/vllm/config/lora.py
new file mode 100644
index 0000000000000..3fe28f5dad4fa
--- /dev/null
+++ b/vllm/config/lora.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import hashlib
+from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
+
+import torch
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+import vllm.envs as envs
+from vllm.config.utils import config
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig
+    from vllm.config.cache import CacheConfig
+else:
+    ModelConfig = Any
+    CacheConfig = Any
+
+logger = init_logger(__name__)
+
+LoRADType = Literal["auto", "float16", "bfloat16"]
+
+
+@config
+@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
+class LoRAConfig:
+    """Configuration for LoRA."""
+
+    max_lora_rank: int = 16
+    """Max LoRA rank."""
+    max_loras: int = 1
+    """Max number of LoRAs in a single batch."""
+    fully_sharded_loras: bool = False
+    """By default, only half of the LoRA computation is sharded with tensor
+    parallelism. Enabling this will use the fully sharded layers. At high
+    sequence length, max rank or tensor parallel size, this is likely faster.
+    """
+    max_cpu_loras: Optional[int] = None
+    """Maximum number of LoRAs to store in CPU memory. Must be >= than
+    `max_loras`."""
+    lora_dtype: Union[torch.dtype, LoRADType] = "auto"
+    """Data type for LoRA. If auto, will default to base model dtype."""
+    lora_extra_vocab_size: int = 256
+    """(Deprecated) Maximum size of extra vocabulary that can be present in a
+    LoRA adapter. Will be removed in v0.12.0."""
+    lora_vocab_padding_size: ClassVar[int] = current_platform\
+        .get_lora_vocab_padding_size()
+    default_mm_loras: Optional[dict[str, str]] = None
+    """Dictionary mapping specific modalities to LoRA model paths; this field
+    is only applicable to multimodal models and should be leveraged when a
+    model always expects a LoRA to be active when a given modality is present.
+    Note that currently, if a request provides multiple additional
+    modalities, each of which have their own LoRA, we do NOT apply
+    default_mm_loras because we currently only support one lora adapter
+    per prompt. When run in offline mode, the lora IDs for n modalities
+    will be automatically assigned to 1-n with the names of the modalities
+    in alphabetic order."""
+    bias_enabled: bool = False
+    """[DEPRECATED] Enable bias for LoRA adapters. This option will be
+    removed in v0.12.0."""
+
+    def compute_hash(self) -> str:
+        """
+        WARNING: Whenever a new field is added to this config,
+        ensure that it is included in the factors list if
+        it affects the computation graph.
+
+        Provide a hash that uniquely identifies all the configs
+        that affect the structure of the computation
+        graph from input ids/embeddings to the final hidden states,
+        excluding anything before input ids/embeddings and after
+        the final hidden states.
+        """
+        factors: list[Any] = []
+        factors.append(self.max_lora_rank)
+        factors.append(self.max_loras)
+        factors.append(self.fully_sharded_loras)
+        factors.append(self.lora_dtype)
+        factors.append(self.lora_extra_vocab_size)
+        factors.append(self.lora_vocab_padding_size)
+        factors.append(self.bias_enabled)
+        hash_str = hashlib.md5(str(factors).encode(),
+                               usedforsecurity=False).hexdigest()
+        return hash_str
+
+    def __post_init__(self):
+        # Deprecation warning for lora_extra_vocab_size
+        logger.warning(
+            "`lora_extra_vocab_size` is deprecated and will be removed "
+            "in v0.12.0. Additional vocabulary support for "
+            "LoRA adapters is being phased out.")
+
+        # Deprecation warning for enable_lora_bias
+        if self.bias_enabled:
+            logger.warning("`enable_lora_bias` is deprecated "
+                           "and will be removed in v0.12.0.")
+
+        # Setting the maximum rank to 512 should be able to satisfy the vast
+        # majority of applications.
+        possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
+        possible_lora_extra_vocab_size = (256, 512)
+        if self.max_lora_rank not in possible_max_ranks:
+            raise ValueError(
+                f"max_lora_rank ({self.max_lora_rank}) must be one of "
+                f"{possible_max_ranks}.")
+        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
+            raise ValueError(
+                f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
+                f"must be one of {possible_lora_extra_vocab_size}.")
+        if self.max_loras < 1:
+            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
+        if self.max_cpu_loras is None:
+            self.max_cpu_loras = self.max_loras
+        elif self.max_cpu_loras < self.max_loras:
+            raise ValueError(
+                f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
+                f"max_loras ({self.max_loras})")
+
+    def verify_with_cache_config(self, cache_config: CacheConfig):
+        if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
+            raise ValueError(
+                "V0 LoRA does not support CPU offload, please use V1.")
+
+    def verify_with_model_config(self, model_config: ModelConfig):
+        if self.lora_dtype in (None, "auto"):
+            self.lora_dtype = model_config.dtype
+        elif isinstance(self.lora_dtype, str):
+            self.lora_dtype = getattr(torch, self.lora_dtype)
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index d7864293e9647..92ebad778ea4b 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Set, Tuple, Union
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 6010a4647a0af..c53ece18964cb 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
 from weakref import ReferenceType
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
+                         SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import SchedulerOutputs
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_timeout import asyncio_timeout
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 47f56e58130fa..3462142d9fb91 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -16,9 +16,9 @@ import torch
 from typing_extensions import TypeVar
 
 import vllm.envs as envs
-from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
-                         ObservabilityConfig, ParallelConfig, SchedulerConfig,
-                         VllmConfig)
+from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
+                         ParallelConfig, SchedulerConfig, VllmConfig)
+from vllm.config.lora import LoRAConfig
 from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.metrics_types import StatLoggerBase, Stats
diff --git a/vllm/lora/layers/base.py b/vllm/lora/layers/base.py
index 0e759d5d5719b..a80a033e39b40 100644
--- a/vllm/lora/layers/base.py
+++ b/vllm/lora/layers/base.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 
 if TYPE_CHECKING:
     from vllm.lora.punica_wrapper import PunicaWrapperBase
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index 4e062971d9188..85a1f86ce6bf2 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -6,7 +6,7 @@ from typing import Optional, cast
 import torch
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed.utils import divide
 # yapf: disable
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
diff --git a/vllm/lora/layers/column_parallel_linear.py b/vllm/lora/layers/column_parallel_linear.py
index d2f8e05554c84..658fd23165da0 100644
--- a/vllm/lora/layers/column_parallel_linear.py
+++ b/vllm/lora/layers/column_parallel_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               tensor_model_parallel_all_gather)
diff --git a/vllm/lora/layers/logits_processor.py b/vllm/lora/layers/logits_processor.py
index db974147ccca7..a50dcfa748f2f 100644
--- a/vllm/lora/layers/logits_processor.py
+++ b/vllm/lora/layers/logits_processor.py
@@ -8,7 +8,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
diff --git a/vllm/lora/layers/replicated_linear.py b/vllm/lora/layers/replicated_linear.py
index db922a02d40b0..3356297c1537a 100644
--- a/vllm/lora/layers/replicated_linear.py
+++ b/vllm/lora/layers/replicated_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.model_executor.layers.linear import ReplicatedLinear
 
 from .base_linear import BaseLinearLayerWithLoRA
diff --git a/vllm/lora/layers/row_parallel_linear.py b/vllm/lora/layers/row_parallel_linear.py
index bf1d9ae374f48..18ef6fd1ddd78 100644
--- a/vllm/lora/layers/row_parallel_linear.py
+++ b/vllm/lora/layers/row_parallel_linear.py
@@ -7,7 +7,7 @@ import torch
 import torch.nn as nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import (get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
diff --git a/vllm/lora/layers/vocal_parallel_embedding.py b/vllm/lora/layers/vocal_parallel_embedding.py
index 192e154fe56a6..4d6218d970977 100644
--- a/vllm/lora/layers/vocal_parallel_embedding.py
+++ b/vllm/lora/layers/vocal_parallel_embedding.py
@@ -8,7 +8,7 @@ import torch.nn as nn
 import torch.nn.functional as F
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.model_executor.layers.vocab_parallel_embedding import (
     VocabParallelEmbedding)
 from vllm.platforms import current_platform
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 3072047a2606c..7712438054914 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
 from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
                                         get_adapter, list_adapters,
                                         remove_adapter, set_adapter_mapping)
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
 from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
diff --git a/vllm/lora/peft_helper.py b/vllm/lora/peft_helper.py
index 8b8e5cb7d5fae..dc7249c386021 100644
--- a/vllm/lora/peft_helper.py
+++ b/vllm/lora/peft_helper.py
@@ -9,7 +9,7 @@ import os
 from dataclasses import MISSING, dataclass, field, fields
 from typing import Literal, Optional, Union
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
 
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index 2b05a2cf4d40c..10ba390bffd9e 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -11,7 +11,7 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
 from torch import nn
 from transformers import PretrainedConfig
 
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 # being imported for _all_lora_classes below
 # yapf conflicts with isort for this block
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 248d2954f1ef4..3a807b1e161d2 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker,
                                         list_adapters_worker,
                                         set_active_adapters_worker)
 from vllm.adapter_commons.worker_manager import AbstractWorkerManager
-from vllm.config import LoRAConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (LoRAModel, LoRAModelManager,
                               LRUCacheLoRAModelManager, create_lora_manager)
diff --git a/vllm/model_executor/models/bart.py b/vllm/model_executor/models/bart.py
index 32551d8102f32..fd4d820a01e9d 100644
--- a/vllm/model_executor/models/bart.py
+++ b/vllm/model_executor/models/bart.py
@@ -29,7 +29,8 @@ from transformers import BartConfig
 from transformers.utils import logging
 
 from vllm.attention import Attention, AttentionType
-from vllm.config import CacheConfig, LoRAConfig, VllmConfig
+from vllm.config import CacheConfig, VllmConfig
+from vllm.config.lora import LoRAConfig
 from vllm.distributed import get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
diff --git a/vllm/transformers_utils/tokenizer_group.py b/vllm/transformers_utils/tokenizer_group.py
index ae8220f9b9dc5..6b519cccd3cc6 100644
--- a/vllm/transformers_utils/tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group.py
@@ -5,7 +5,8 @@ from typing import Optional
 
 from typing_extensions import assert_never
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.lora.request import LoRARequest
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
                                                get_lora_tokenizer,
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 4b5f27d27541b..f2ebd5e10210b 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -11,7 +11,8 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
+from vllm.config import ModelConfig, SchedulerConfig
+from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
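
Usage note (not part of the patch): a minimal sketch of the relocated config, assuming a vLLM tree that already contains this commit; the field values below are arbitrary examples. Because `vllm/config/__init__.py` now re-exports `LoRAConfig` from `vllm.config.lora`, both import paths resolve to the same class.

```python
# Illustrative sketch only -- assumes a vLLM checkout that includes this commit.
from vllm.config import LoRAConfig as ReexportedLoRAConfig  # still re-exported
from vllm.config.lora import LoRAConfig  # new canonical location

# The package __init__ re-exports the same class object.
assert LoRAConfig is ReexportedLoRAConfig

# Example values; __post_init__ restricts max_lora_rank to
# (8, 16, 32, 64, 128, 256, 320, 512) and requires max_loras >= 1.
lora_config = LoRAConfig(max_lora_rank=16, max_loras=2)
print(lora_config.max_cpu_loras)   # defaults to max_loras when unset
print(lora_config.compute_hash())  # md5 over the graph-affecting fields
```

Downstream code that imports `LoRAConfig` from `vllm.config` keeps working; only the canonical definition moves to `vllm/config/lora.py`.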