mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-05-23 22:17:54 +08:00
Move LoRAConfig from config/__init__.py to config/lora.py (#24644)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
parent
d6249d0699
commit
5f5271f1ee
@ -10,7 +10,8 @@ import pytest # noqa
|
|||||||
import torch
|
import torch
|
||||||
from torch import Use # noqa
|
from torch import Use # noqa
|
||||||
|
|
||||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
from vllm.config import CacheConfig, SchedulerConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.core.interfaces import AllocStatus
|
from vllm.core.interfaces import AllocStatus
|
||||||
from vllm.core.scheduler import Scheduler, SchedulingBudget
|
from vllm.core.scheduler import Scheduler, SchedulingBudget
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
|
||||||
|
|||||||
@ -3,8 +3,8 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||||
VllmConfig)
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import torch
|
|||||||
from safetensors.torch import load_file
|
from safetensors.torch import load_file
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
from vllm.lora.layers import (ColumnParallelLinearWithLoRA,
|
||||||
MergedColumnParallelLinearWithLoRA,
|
MergedColumnParallelLinearWithLoRA,
|
||||||
RowParallelLinearWithLoRA)
|
RowParallelLinearWithLoRA)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import shutil
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.peft_helper import PEFTHelper
|
from vllm.lora.peft_helper import PEFTHelper
|
||||||
|
|
||||||
ERROR_CASES = [
|
ERROR_CASES = [
|
||||||
|
|||||||
@ -6,9 +6,10 @@ import random
|
|||||||
import tempfile
|
import tempfile
|
||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
|
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
|
||||||
ParallelConfig, SchedulerConfig, VllmConfig)
|
ParallelConfig, SchedulerConfig, VllmConfig)
|
||||||
from vllm.config.load import LoadConfig
|
from vllm.config.load import LoadConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.models import LoRAMapping
|
from vllm.lora.models import LoRAMapping
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.v1.worker.gpu_worker import Worker
|
from vllm.v1.worker.gpu_worker import Worker
|
||||||
|
|||||||
@ -36,6 +36,7 @@ from vllm.config.compilation import (CompilationConfig, CompilationLevel,
|
|||||||
from vllm.config.kv_events import KVEventsConfig
|
from vllm.config.kv_events import KVEventsConfig
|
||||||
from vllm.config.kv_transfer import KVTransferConfig
|
from vllm.config.kv_transfer import KVTransferConfig
|
||||||
from vllm.config.load import LoadConfig
|
from vllm.config.load import LoadConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
|
from vllm.config.parallel import (DistributedExecutorBackend, EPLBConfig,
|
||||||
ParallelConfig)
|
ParallelConfig)
|
||||||
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
|
from vllm.config.scheduler import SchedulerConfig, SchedulerPolicy
|
||||||
@ -2400,116 +2401,6 @@ class SpeculativeConfig:
|
|||||||
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
return f"SpeculativeConfig({method=}, {model=}, {num_spec_tokens=})"
|
||||||
|
|
||||||
|
|
||||||
LoRADType = Literal["auto", "float16", "bfloat16"]
|
|
||||||
|
|
||||||
|
|
||||||
@config
|
|
||||||
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
|
||||||
class LoRAConfig:
|
|
||||||
"""Configuration for LoRA."""
|
|
||||||
|
|
||||||
max_lora_rank: int = 16
|
|
||||||
"""Max LoRA rank."""
|
|
||||||
max_loras: int = 1
|
|
||||||
"""Max number of LoRAs in a single batch."""
|
|
||||||
fully_sharded_loras: bool = False
|
|
||||||
"""By default, only half of the LoRA computation is sharded with tensor
|
|
||||||
parallelism. Enabling this will use the fully sharded layers. At high
|
|
||||||
sequence length, max rank or tensor parallel size, this is likely faster.
|
|
||||||
"""
|
|
||||||
max_cpu_loras: Optional[int] = None
|
|
||||||
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
|
|
||||||
`max_loras`."""
|
|
||||||
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
|
|
||||||
"""Data type for LoRA. If auto, will default to base model dtype."""
|
|
||||||
lora_extra_vocab_size: int = 256
|
|
||||||
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
|
|
||||||
LoRA adapter. Will be removed in v0.12.0."""
|
|
||||||
lora_vocab_padding_size: ClassVar[int] = current_platform\
|
|
||||||
.get_lora_vocab_padding_size()
|
|
||||||
default_mm_loras: Optional[dict[str, str]] = None
|
|
||||||
"""Dictionary mapping specific modalities to LoRA model paths; this field
|
|
||||||
is only applicable to multimodal models and should be leveraged when a
|
|
||||||
model always expects a LoRA to be active when a given modality is present.
|
|
||||||
Note that currently, if a request provides multiple additional
|
|
||||||
modalities, each of which have their own LoRA, we do NOT apply
|
|
||||||
default_mm_loras because we currently only support one lora adapter
|
|
||||||
per prompt. When run in offline mode, the lora IDs for n modalities
|
|
||||||
will be automatically assigned to 1-n with the names of the modalities
|
|
||||||
in alphabetic order."""
|
|
||||||
bias_enabled: bool = False
|
|
||||||
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
|
|
||||||
removed in v0.12.0."""
|
|
||||||
|
|
||||||
def compute_hash(self) -> str:
|
|
||||||
"""
|
|
||||||
WARNING: Whenever a new field is added to this config,
|
|
||||||
ensure that it is included in the factors list if
|
|
||||||
it affects the computation graph.
|
|
||||||
|
|
||||||
Provide a hash that uniquely identifies all the configs
|
|
||||||
that affect the structure of the computation
|
|
||||||
graph from input ids/embeddings to the final hidden states,
|
|
||||||
excluding anything before input ids/embeddings and after
|
|
||||||
the final hidden states.
|
|
||||||
"""
|
|
||||||
factors: list[Any] = []
|
|
||||||
factors.append(self.max_lora_rank)
|
|
||||||
factors.append(self.max_loras)
|
|
||||||
factors.append(self.fully_sharded_loras)
|
|
||||||
factors.append(self.lora_dtype)
|
|
||||||
factors.append(self.lora_extra_vocab_size)
|
|
||||||
factors.append(self.lora_vocab_padding_size)
|
|
||||||
factors.append(self.bias_enabled)
|
|
||||||
hash_str = hashlib.md5(str(factors).encode(),
|
|
||||||
usedforsecurity=False).hexdigest()
|
|
||||||
return hash_str
|
|
||||||
|
|
||||||
def __post_init__(self):
|
|
||||||
# Deprecation warning for lora_extra_vocab_size
|
|
||||||
logger.warning(
|
|
||||||
"`lora_extra_vocab_size` is deprecated and will be removed "
|
|
||||||
"in v0.12.0. Additional vocabulary support for "
|
|
||||||
"LoRA adapters is being phased out.")
|
|
||||||
|
|
||||||
# Deprecation warning for enable_lora_bias
|
|
||||||
if self.bias_enabled:
|
|
||||||
logger.warning("`enable_lora_bias` is deprecated "
|
|
||||||
"and will be removed in v0.12.0.")
|
|
||||||
|
|
||||||
# Setting the maximum rank to 512 should be able to satisfy the vast
|
|
||||||
# majority of applications.
|
|
||||||
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
|
|
||||||
possible_lora_extra_vocab_size = (256, 512)
|
|
||||||
if self.max_lora_rank not in possible_max_ranks:
|
|
||||||
raise ValueError(
|
|
||||||
f"max_lora_rank ({self.max_lora_rank}) must be one of "
|
|
||||||
f"{possible_max_ranks}.")
|
|
||||||
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
|
|
||||||
raise ValueError(
|
|
||||||
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
|
|
||||||
f"must be one of {possible_lora_extra_vocab_size}.")
|
|
||||||
if self.max_loras < 1:
|
|
||||||
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
|
|
||||||
if self.max_cpu_loras is None:
|
|
||||||
self.max_cpu_loras = self.max_loras
|
|
||||||
elif self.max_cpu_loras < self.max_loras:
|
|
||||||
raise ValueError(
|
|
||||||
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
|
|
||||||
f"max_loras ({self.max_loras})")
|
|
||||||
|
|
||||||
def verify_with_cache_config(self, cache_config: CacheConfig):
|
|
||||||
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
|
|
||||||
raise ValueError(
|
|
||||||
"V0 LoRA does not support CPU offload, please use V1.")
|
|
||||||
|
|
||||||
def verify_with_model_config(self, model_config: ModelConfig):
|
|
||||||
if self.lora_dtype in (None, "auto"):
|
|
||||||
self.lora_dtype = model_config.dtype
|
|
||||||
elif isinstance(self.lora_dtype, str):
|
|
||||||
self.lora_dtype = getattr(torch, self.lora_dtype)
|
|
||||||
|
|
||||||
|
|
||||||
@config
|
@config
|
||||||
@dataclass
|
@dataclass
|
||||||
class MultiModalConfig:
|
class MultiModalConfig:
|
||||||
|
|||||||
132
vllm/config/lora.py
Normal file
132
vllm/config/lora.py
Normal file
@ -0,0 +1,132 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from typing import TYPE_CHECKING, Any, ClassVar, Literal, Optional, Union
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from pydantic import ConfigDict
|
||||||
|
from pydantic.dataclasses import dataclass
|
||||||
|
|
||||||
|
import vllm.envs as envs
|
||||||
|
from vllm.config.utils import config
|
||||||
|
from vllm.logger import init_logger
|
||||||
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vllm.config import ModelConfig
|
||||||
|
from vllm.config.cache import CacheConfig
|
||||||
|
else:
|
||||||
|
ModelConfig = Any
|
||||||
|
CacheConfig = Any
|
||||||
|
|
||||||
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
|
LoRADType = Literal["auto", "float16", "bfloat16"]
|
||||||
|
|
||||||
|
|
||||||
|
@config
|
||||||
|
@dataclass(config=ConfigDict(arbitrary_types_allowed=True))
|
||||||
|
class LoRAConfig:
|
||||||
|
"""Configuration for LoRA."""
|
||||||
|
|
||||||
|
max_lora_rank: int = 16
|
||||||
|
"""Max LoRA rank."""
|
||||||
|
max_loras: int = 1
|
||||||
|
"""Max number of LoRAs in a single batch."""
|
||||||
|
fully_sharded_loras: bool = False
|
||||||
|
"""By default, only half of the LoRA computation is sharded with tensor
|
||||||
|
parallelism. Enabling this will use the fully sharded layers. At high
|
||||||
|
sequence length, max rank or tensor parallel size, this is likely faster.
|
||||||
|
"""
|
||||||
|
max_cpu_loras: Optional[int] = None
|
||||||
|
"""Maximum number of LoRAs to store in CPU memory. Must be >= than
|
||||||
|
`max_loras`."""
|
||||||
|
lora_dtype: Union[torch.dtype, LoRADType] = "auto"
|
||||||
|
"""Data type for LoRA. If auto, will default to base model dtype."""
|
||||||
|
lora_extra_vocab_size: int = 256
|
||||||
|
"""(Deprecated) Maximum size of extra vocabulary that can be present in a
|
||||||
|
LoRA adapter. Will be removed in v0.12.0."""
|
||||||
|
lora_vocab_padding_size: ClassVar[int] = current_platform\
|
||||||
|
.get_lora_vocab_padding_size()
|
||||||
|
default_mm_loras: Optional[dict[str, str]] = None
|
||||||
|
"""Dictionary mapping specific modalities to LoRA model paths; this field
|
||||||
|
is only applicable to multimodal models and should be leveraged when a
|
||||||
|
model always expects a LoRA to be active when a given modality is present.
|
||||||
|
Note that currently, if a request provides multiple additional
|
||||||
|
modalities, each of which have their own LoRA, we do NOT apply
|
||||||
|
default_mm_loras because we currently only support one lora adapter
|
||||||
|
per prompt. When run in offline mode, the lora IDs for n modalities
|
||||||
|
will be automatically assigned to 1-n with the names of the modalities
|
||||||
|
in alphabetic order."""
|
||||||
|
bias_enabled: bool = False
|
||||||
|
"""[DEPRECATED] Enable bias for LoRA adapters. This option will be
|
||||||
|
removed in v0.12.0."""
|
||||||
|
|
||||||
|
def compute_hash(self) -> str:
|
||||||
|
"""
|
||||||
|
WARNING: Whenever a new field is added to this config,
|
||||||
|
ensure that it is included in the factors list if
|
||||||
|
it affects the computation graph.
|
||||||
|
|
||||||
|
Provide a hash that uniquely identifies all the configs
|
||||||
|
that affect the structure of the computation
|
||||||
|
graph from input ids/embeddings to the final hidden states,
|
||||||
|
excluding anything before input ids/embeddings and after
|
||||||
|
the final hidden states.
|
||||||
|
"""
|
||||||
|
factors: list[Any] = []
|
||||||
|
factors.append(self.max_lora_rank)
|
||||||
|
factors.append(self.max_loras)
|
||||||
|
factors.append(self.fully_sharded_loras)
|
||||||
|
factors.append(self.lora_dtype)
|
||||||
|
factors.append(self.lora_extra_vocab_size)
|
||||||
|
factors.append(self.lora_vocab_padding_size)
|
||||||
|
factors.append(self.bias_enabled)
|
||||||
|
hash_str = hashlib.md5(str(factors).encode(),
|
||||||
|
usedforsecurity=False).hexdigest()
|
||||||
|
return hash_str
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
# Deprecation warning for lora_extra_vocab_size
|
||||||
|
logger.warning(
|
||||||
|
"`lora_extra_vocab_size` is deprecated and will be removed "
|
||||||
|
"in v0.12.0. Additional vocabulary support for "
|
||||||
|
"LoRA adapters is being phased out.")
|
||||||
|
|
||||||
|
# Deprecation warning for enable_lora_bias
|
||||||
|
if self.bias_enabled:
|
||||||
|
logger.warning("`enable_lora_bias` is deprecated "
|
||||||
|
"and will be removed in v0.12.0.")
|
||||||
|
|
||||||
|
# Setting the maximum rank to 512 should be able to satisfy the vast
|
||||||
|
# majority of applications.
|
||||||
|
possible_max_ranks = (8, 16, 32, 64, 128, 256, 320, 512)
|
||||||
|
possible_lora_extra_vocab_size = (256, 512)
|
||||||
|
if self.max_lora_rank not in possible_max_ranks:
|
||||||
|
raise ValueError(
|
||||||
|
f"max_lora_rank ({self.max_lora_rank}) must be one of "
|
||||||
|
f"{possible_max_ranks}.")
|
||||||
|
if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
|
||||||
|
raise ValueError(
|
||||||
|
f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
|
||||||
|
f"must be one of {possible_lora_extra_vocab_size}.")
|
||||||
|
if self.max_loras < 1:
|
||||||
|
raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
|
||||||
|
if self.max_cpu_loras is None:
|
||||||
|
self.max_cpu_loras = self.max_loras
|
||||||
|
elif self.max_cpu_loras < self.max_loras:
|
||||||
|
raise ValueError(
|
||||||
|
f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
|
||||||
|
f"max_loras ({self.max_loras})")
|
||||||
|
|
||||||
|
def verify_with_cache_config(self, cache_config: CacheConfig):
|
||||||
|
if cache_config.cpu_offload_gb > 0 and not envs.VLLM_USE_V1:
|
||||||
|
raise ValueError(
|
||||||
|
"V0 LoRA does not support CPU offload, please use V1.")
|
||||||
|
|
||||||
|
def verify_with_model_config(self, model_config: ModelConfig):
|
||||||
|
if self.lora_dtype in (None, "auto"):
|
||||||
|
self.lora_dtype = model_config.dtype
|
||||||
|
elif isinstance(self.lora_dtype, str):
|
||||||
|
self.lora_dtype = getattr(torch, self.lora_dtype)
|
||||||
@ -11,7 +11,8 @@ from typing import Callable, Deque, Dict, Iterable, List, Optional
|
|||||||
from typing import Sequence as GenericSequence
|
from typing import Sequence as GenericSequence
|
||||||
from typing import Set, Tuple, Union
|
from typing import Set, Tuple, Union
|
||||||
|
|
||||||
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
from vllm.config import CacheConfig, SchedulerConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|||||||
@ -10,8 +10,9 @@ from typing import (Any, AsyncGenerator, Callable, Dict, Iterable, List,
|
|||||||
from weakref import ReferenceType
|
from weakref import ReferenceType
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
|
from vllm.config import (DecodingConfig, ModelConfig, ParallelConfig,
|
||||||
ParallelConfig, SchedulerConfig, VllmConfig)
|
SchedulerConfig, VllmConfig)
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.core.scheduler import SchedulerOutputs
|
from vllm.core.scheduler import SchedulerOutputs
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs
|
from vllm.engine.arg_utils import AsyncEngineArgs
|
||||||
from vllm.engine.async_timeout import asyncio_timeout
|
from vllm.engine.async_timeout import asyncio_timeout
|
||||||
|
|||||||
@ -16,9 +16,9 @@ import torch
|
|||||||
from typing_extensions import TypeVar
|
from typing_extensions import TypeVar
|
||||||
|
|
||||||
import vllm.envs as envs
|
import vllm.envs as envs
|
||||||
from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig,
|
from vllm.config import (DecodingConfig, ModelConfig, ObservabilityConfig,
|
||||||
ObservabilityConfig, ParallelConfig, SchedulerConfig,
|
ParallelConfig, SchedulerConfig, VllmConfig)
|
||||||
VllmConfig)
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
|
from vllm.core.scheduler import ScheduledSequenceGroup, SchedulerOutputs
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.engine.metrics_types import StatLoggerBase, Stats
|
from vllm.engine.metrics_types import StatLoggerBase, Stats
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.lora.punica_wrapper import PunicaWrapperBase
|
from vllm.lora.punica_wrapper import PunicaWrapperBase
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from typing import Optional, cast
|
|||||||
import torch
|
import torch
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed.utils import divide
|
from vllm.distributed.utils import divide
|
||||||
# yapf: disable
|
# yapf: disable
|
||||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
tensor_model_parallel_all_gather)
|
tensor_model_parallel_all_gather)
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size)
|
get_tensor_model_parallel_world_size)
|
||||||
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
from vllm.model_executor.layers.logits_processor import LogitsProcessor
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.model_executor.layers.linear import ReplicatedLinear
|
from vllm.model_executor.layers.linear import ReplicatedLinear
|
||||||
|
|
||||||
from .base_linear import BaseLinearLayerWithLoRA
|
from .base_linear import BaseLinearLayerWithLoRA
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed import (get_tensor_model_parallel_rank,
|
from vllm.distributed import (get_tensor_model_parallel_rank,
|
||||||
get_tensor_model_parallel_world_size,
|
get_tensor_model_parallel_world_size,
|
||||||
split_tensor_along_last_dim,
|
split_tensor_along_last_dim,
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import torch.nn as nn
|
|||||||
import torch.nn.functional as F
|
import torch.nn.functional as F
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
from vllm.model_executor.layers.vocab_parallel_embedding import (
|
||||||
VocabParallelEmbedding)
|
VocabParallelEmbedding)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|||||||
@ -16,7 +16,7 @@ from vllm.adapter_commons.models import (AdapterLRUCache, AdapterModel,
|
|||||||
from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
|
from vllm.adapter_commons.utils import (add_adapter, deactivate_adapter,
|
||||||
get_adapter, list_adapters,
|
get_adapter, list_adapters,
|
||||||
remove_adapter, set_adapter_mapping)
|
remove_adapter, set_adapter_mapping)
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
|
from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
|
||||||
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
from vllm.lora.lora import LoRALayerWeights, PackedLoRALayerWeights
|
||||||
|
|||||||
@ -9,7 +9,7 @@ import os
|
|||||||
from dataclasses import MISSING, dataclass, field, fields
|
from dataclasses import MISSING, dataclass, field, fields
|
||||||
from typing import Literal, Optional, Union
|
from typing import Literal, Optional, Union
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
|
||||||
|
|
||||||
|
|||||||
@ -11,7 +11,7 @@ from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import PretrainedConfig
|
from transformers import PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
# being imported for _all_lora_classes below
|
# being imported for _all_lora_classes below
|
||||||
# yapf conflicts with isort for this block
|
# yapf conflicts with isort for this block
|
||||||
|
|||||||
@ -11,7 +11,7 @@ from vllm.adapter_commons.utils import (add_adapter_worker,
|
|||||||
list_adapters_worker,
|
list_adapters_worker,
|
||||||
set_active_adapters_worker)
|
set_active_adapters_worker)
|
||||||
from vllm.adapter_commons.worker_manager import AbstractWorkerManager
|
from vllm.adapter_commons.worker_manager import AbstractWorkerManager
|
||||||
from vllm.config import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.models import (LoRAModel, LoRAModelManager,
|
from vllm.lora.models import (LoRAModel, LoRAModelManager,
|
||||||
LRUCacheLoRAModelManager, create_lora_manager)
|
LRUCacheLoRAModelManager, create_lora_manager)
|
||||||
|
|||||||
@ -29,7 +29,8 @@ from transformers import BartConfig
|
|||||||
from transformers.utils import logging
|
from transformers.utils import logging
|
||||||
|
|
||||||
from vllm.attention import Attention, AttentionType
|
from vllm.attention import Attention, AttentionType
|
||||||
from vllm.config import CacheConfig, LoRAConfig, VllmConfig
|
from vllm.config import CacheConfig, VllmConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.model_executor.layers.activation import get_act_fn
|
from vllm.model_executor.layers.activation import get_act_fn
|
||||||
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
from vllm.model_executor.layers.linear import (ColumnParallelLinear,
|
||||||
|
|||||||
@ -5,7 +5,8 @@ from typing import Optional
|
|||||||
|
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
|
from vllm.config import ModelConfig, SchedulerConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
|
from vllm.transformers_utils.tokenizer import (AnyTokenizer, encode_tokens,
|
||||||
get_lora_tokenizer,
|
get_lora_tokenizer,
|
||||||
|
|||||||
@ -11,7 +11,8 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
from vllm.config import LoRAConfig, ModelConfig, SchedulerConfig
|
from vllm.config import ModelConfig, SchedulerConfig
|
||||||
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.lora.layers import LoRAMapping
|
from vllm.lora.layers import LoRAMapping
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user