[VLM] Refactor MultiModalConfig initialization and profiling (#7530)
This commit is contained in:
parent 1ef13cf92f
commit bbf55c4805
@@ -86,8 +86,12 @@ def server_function(port):

ModelRegistry.register_model("OPTForCausalLM", FakeAudioModel)

with patch("vllm.entrypoints.chat_utils._mm_token_str",
lambda *_, **__: "_"):
with patch(
"vllm.entrypoints.chat_utils._mm_token_str",
lambda *_, **__: "_"), patch(
"vllm.model_executor.models.ModelRegistry.is_multimodal_model"
) as mock:
mock.return_value = True
sys.argv = ["placeholder.py"] + \
(f"--model {MODEL_NAME} --gpu-memory-utilization 0.10 "
"--dtype bfloat16 --enforce-eager --api-key token-abc123 "
@@ -4,7 +4,7 @@ import numpy as np
import pytest
from transformers import CLIPImageProcessor, LlavaNextImageProcessor

from vllm.config import ModelConfig, MultiModalConfig
from vllm.config import ModelConfig
from vllm.multimodal import MultiModalRegistry
from vllm.multimodal.utils import rescale_image_size
@@ -30,10 +30,10 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor):
seed=0,
dtype=dtype,
revision=None,
limit_mm_per_prompt={"image": 1},
)
mm_config = MultiModalConfig(limit_per_prompt={"image": 1})

mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
mm_registry.init_mm_limits_per_prompt(model_config)

for asset in image_assets:
image = rescale_image_size(asset.pil_image, size_factor)
@@ -73,10 +73,10 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype,
seed=0,
dtype=dtype,
revision=None,
limit_mm_per_prompt={"image": 1},
)
mm_config = MultiModalConfig(limit_per_prompt={"image": 1})

mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
mm_registry.init_mm_limits_per_prompt(model_config)

for asset in image_assets:
image = rescale_image_size(asset.pil_image, size_factor)
@@ -115,10 +115,10 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid):
seed=0,
dtype="half",
revision=None,
limit_mm_per_prompt={"image": limit},
)
mm_config = MultiModalConfig(limit_per_prompt={"image": limit})

mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
mm_registry.init_mm_limits_per_prompt(model_config)

image = image_assets[0].pil_image
if num_images == 0:
@@ -145,10 +145,10 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images):
seed=0,
dtype="half",
revision=None,
limit_mm_per_prompt={"image": num_images},
)
mm_config = MultiModalConfig(limit_per_prompt={"image": num_images})

mm_registry.init_mm_limits_per_prompt(model_config, mm_config)
mm_registry.init_mm_limits_per_prompt(model_config)

image = image_assets[0].pil_image
mm_inputs = {"image": [image] * num_images}
@@ -109,6 +109,8 @@ class ModelConfig:
matches the model name exposed via the APIs. If multiple model
names provided, the first name will be used. If not specified,
the model name will be the same as `model`.
limit_mm_per_prompt: Maximum number of data instances per modality
per prompt. Only applicable for multimodal models.
"""

def __init__(
@@ -134,7 +136,7 @@ class ModelConfig:
disable_sliding_window: bool = False,
skip_tokenizer_init: bool = False,
served_model_name: Optional[Union[str, List[str]]] = None,
multimodal_config: Optional["MultiModalConfig"] = None,
limit_mm_per_prompt: Optional[Mapping[str, int]] = None,
) -> None:
self.model = model
self.tokenizer = tokenizer
@@ -211,14 +213,29 @@ class ModelConfig:
sliding_window_len=self.get_hf_config_sliding_window())
self.served_model_name = get_served_model_name(model,
served_model_name)
self.multimodal_config = multimodal_config

self.multimodal_config = self._init_multimodal_config(
limit_mm_per_prompt)
if not self.skip_tokenizer_init:
self._verify_tokenizer_mode()
self._verify_embedding_mode()
self._verify_quantization()
self._verify_cuda_graph()

def _init_multimodal_config(
self, limit_mm_per_prompt: Optional[Mapping[str, int]]
) -> Optional["MultiModalConfig"]:
architectures = getattr(self.hf_config, "architectures", [])
if any(
ModelRegistry.is_multimodal_model(arch)
for arch in architectures):
return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {})
else:
if limit_mm_per_prompt:
raise ValueError(
"limit_mm_per_prompt is only supported for multimodal "
"models.")
return None

def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
if tokenizer_mode not in ["auto", "slow"]:
@@ -467,6 +484,18 @@ class ModelConfig:
if t != "attention"
])

def get_multimodal_config(self) -> "MultiModalConfig":
"""
Get the multimodal configuration of the model.

Raises:
ValueError: If the model is not multimodal.
"""
if self.multimodal_config is None:
raise ValueError("The model is not multimodal.")

return self.multimodal_config

@property
def is_encoder_decoder_model(self) -> bool:
"""Extract the HF encoder/decoder model flag."""
@@ -1450,7 +1479,7 @@ class PromptAdapterConfig:
class MultiModalConfig:
"""Controls the behavior of multimodal models."""

limit_per_prompt: Mapping[str, int]
limit_per_prompt: Mapping[str, int] = field(default_factory=dict)
"""
The maximum number of multi-modal input instances allowed per prompt
for each :class:`~vllm.multimodal.MultiModalPlugin`.
@@ -1710,7 +1739,6 @@ class EngineConfig:
device_config: DeviceConfig
load_config: LoadConfig
lora_config: Optional[LoRAConfig]
multimodal_config: Optional[MultiModalConfig]
speculative_config: Optional[SpeculativeConfig]
decoding_config: Optional[DecodingConfig]
observability_config: Optional[ObservabilityConfig]
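Taken together, the config.py hunks above mean callers no longer build a MultiModalConfig by hand: ModelConfig derives it from limit_mm_per_prompt via _init_multimodal_config(), and get_multimodal_config() raises for text-only models. A minimal sketch of the new flow, assuming a hypothetical LLaVA checkpoint and an abbreviated constructor argument list (the real signature takes more parameters):

    from vllm.config import ModelConfig

    # Hypothetical multimodal checkpoint; required arguments abbreviated.
    model_config = ModelConfig(
        model="llava-hf/llava-1.5-7b-hf",
        tokenizer="llava-hf/llava-1.5-7b-hf",
        tokenizer_mode="auto",
        trust_remote_code=False,
        dtype="half",
        seed=0,
        limit_mm_per_prompt={"image": 2},  # consumed by _init_multimodal_config()
    )

    # Populated automatically for multimodal architectures; for text-only models
    # multimodal_config stays None and a non-empty limit raises ValueError.
    mm_config = model_config.get_multimodal_config()
    assert mm_config.limit_per_prompt == {"image": 2}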
@@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type,

from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig, TokenizerPoolConfig)
from vllm.executor.executor_base import ExecutorBase
@@ -765,9 +765,6 @@ class EngineArgs:
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")

multimodal_config = MultiModalConfig(
limit_per_prompt=self.limit_mm_per_prompt or {})

device_config = DeviceConfig(device=self.device)
model_config = ModelConfig(
model=self.model,
@@ -791,7 +788,8 @@ class EngineArgs:
disable_sliding_window=self.disable_sliding_window,
skip_tokenizer_init=self.skip_tokenizer_init,
served_model_name=self.served_model_name,
multimodal_config=multimodal_config)
limit_mm_per_prompt=self.limit_mm_per_prompt,
)
cache_config = CacheConfig(
block_size=self.block_size,
gpu_memory_utilization=self.gpu_memory_utilization,
@@ -970,7 +968,6 @@ class EngineArgs:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
decoding_config=decoding_config,
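With the arg_utils.py hunks above, EngineArgs no longer constructs a MultiModalConfig itself; it forwards limit_mm_per_prompt straight into ModelConfig, and EngineConfig drops its multimodal_config field. A hedged sketch of the resulting flow (the model name is a placeholder and create_engine_config() is assumed to be the entry point):

    from vllm.engine.arg_utils import EngineArgs

    engine_args = EngineArgs(
        model="llava-hf/llava-1.5-7b-hf",  # placeholder multimodal model
        limit_mm_per_prompt={"image": 2},
    )
    engine_config = engine_args.create_engine_config()

    # The limit now lives on the model config rather than on EngineConfig.
    print(engine_config.model_config.multimodal_config)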
@@ -10,7 +10,7 @@ from typing_extensions import assert_never
import vllm.envs as envs
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
MultiModalConfig, ObservabilityConfig, ParallelConfig,
ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler,
@@ -100,8 +100,6 @@ class LLMEngine:
scheduler_config: The configuration related to the request scheduler.
device_config: The configuration related to the device.
lora_config (Optional): The configuration related to serving multi-LoRA.
multimodal_config (Optional): The configuration related to multimodal
models.
speculative_config (Optional): The configuration related to speculative
decoding.
executor_class: The model executor class for managing distributed
@@ -172,7 +170,6 @@ class LLMEngine:
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig],
decoding_config: Optional[DecodingConfig],
observability_config: Optional[ObservabilityConfig],
@@ -235,7 +232,6 @@ class LLMEngine:
self.model_config = model_config
self.cache_config = cache_config
self.lora_config = lora_config
self.multimodal_config = multimodal_config
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
@@ -278,7 +274,6 @@ class LLMEngine:
scheduler_config=scheduler_config,
device_config=device_config,
lora_config=lora_config,
multimodal_config=multimodal_config,
speculative_config=speculative_config,
load_config=load_config,
prompt_adapter_config=prompt_adapter_config,
@@ -141,7 +141,6 @@ class CPUExecutor(ExecutorBase):
rank=rank,
distributed_init_method=self.distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=rank == 0,
@@ -2,8 +2,8 @@ from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig,
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.lora.request import LoRARequest
from vllm.prompt_adapter.request import PromptAdapterRequest
@@ -29,7 +29,6 @@ class ExecutorBase(ABC):
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
speculative_config: Optional[SpeculativeConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
observability_config: Optional[ObservabilityConfig],
@@ -41,7 +40,6 @@ class ExecutorBase(ABC):
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.multimodal_config = multimodal_config
self.speculative_config = speculative_config
self.prompt_adapter_config = prompt_adapter_config
self.observability_config = observability_config
@@ -55,7 +55,6 @@ class GPUExecutor(ExecutorBase):
rank=rank,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
speculative_config=self.speculative_config,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=(not self.parallel_config)
@@ -49,7 +49,6 @@ class OpenVINOExecutor(ExecutorBase):
rank=0,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
@@ -7,9 +7,8 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set,

import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
ModelConfig, ParallelConfig, PromptAdapterConfig,
SchedulerConfig, SpeculativeConfig)
from vllm.executor.distributed_gpu_executor import ( # yapf: disable
DistributedGPUExecutor, DistributedGPUExecutorAsync)
from vllm.executor.ray_utils import RayWorkerWrapper, ray
@@ -46,7 +45,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
@@ -61,7 +59,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config

placement_group = self.parallel_config.placement_group
@@ -203,7 +200,6 @@ class RayXPUExecutor(DistributedGPUExecutor):
rank=rank,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
))
self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
@@ -52,7 +52,6 @@ class TPUExecutor(ExecutorBase):
local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
multimodal_config=self.multimodal_config,
is_driver_worker=rank == 0,
)
@@ -3,9 +3,8 @@ from typing import List, Optional
import torch

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
ModelConfig, ParallelConfig, PromptAdapterConfig,
SchedulerConfig, SpeculativeConfig)
from vllm.executor.executor_base import ExecutorAsyncBase
from vllm.executor.gpu_executor import GPUExecutor
from vllm.logger import init_logger
@@ -29,7 +28,6 @@ class XPUExecutor(GPUExecutor):
device_config: DeviceConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
prompt_adapter_config: Optional[PromptAdapterConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
@@ -46,7 +44,6 @@ class XPUExecutor(GPUExecutor):
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.device_config = device_config
self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config
self.speculative_config = None
@@ -13,7 +13,7 @@ from vllm.logger import init_logger
from .data import LLMInputs

if TYPE_CHECKING:
from vllm.config import ModelConfig, MultiModalConfig
from vllm.config import ModelConfig
from vllm.multimodal import MultiModalDataDict, MultiModalRegistry
from vllm.sequence import SequenceData
@@ -32,20 +32,6 @@ class InputContext:
model_config: "ModelConfig"
"""The configuration of the model."""

def get_multimodal_config(self) -> "MultiModalConfig":
"""
Get the multimodal configuration of the model.

Raises:
ValueError: If the model is not multimodal.
"""

multimodal_config = self.model_config.multimodal_config
if multimodal_config is None:
raise ValueError("No multimodal config found")

return multimodal_config

def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C:
"""
Get the HuggingFace configuration
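Because InputContext loses its own get_multimodal_config() in the hunk above, callers are expected to go through the accessor added to ModelConfig earlier in this commit. A small hedged sketch (the helper name is illustrative only):

    from vllm.inputs.registry import InputContext


    def get_mm_config(ctx: InputContext):
        # Replacement pattern for the removed InputContext.get_multimodal_config():
        # reach the config through ModelConfig instead.
        return ctx.model_config.get_multimodal_config()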
@@ -3,8 +3,7 @@ from typing import Optional
from torch import nn

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
SchedulerConfig)
ModelConfig, ParallelConfig, SchedulerConfig)
from vllm.model_executor.model_loader.loader import (BaseModelLoader,
get_model_loader)
from vllm.model_executor.model_loader.utils import (
@@ -15,13 +14,11 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig,
device_config: DeviceConfig, parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig) -> nn.Module:
loader = get_model_loader(load_config)
return loader.load_model(model_config=model_config,
device_config=device_config,
lora_config=lora_config,
multimodal_config=multimodal_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,
cache_config=cache_config)
@@ -132,9 +132,7 @@ def _get_model_initialization_kwargs(
"please open an issue on github.")

if supports_multimodal(model_class):
if multimodal_config is None:
raise ValueError("Provide multi-modal related configurations "
"through LLM entrypoint or engine arguments.")
assert multimodal_config is not None

extra_kwargs["multimodal_config"] = multimodal_config
@@ -164,7 +162,6 @@ def _initialize_model(
model_config: ModelConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig,
scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module:
"""Initialize a model with the given configurations."""
@@ -173,10 +170,10 @@ def _initialize_model(
return build_model(
model_class,
model_config.hf_config,
cache_config=cache_config,
quant_config=_get_quantization_config(model_config, load_config),
lora_config=lora_config,
multimodal_config=multimodal_config,
cache_config=cache_config,
multimodal_config=model_config.multimodal_config,
scheduler_config=scheduler_config,
)
@@ -191,7 +188,6 @@ class BaseModelLoader(ABC):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
@@ -336,7 +332,6 @@ class DefaultModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
@@ -344,8 +339,8 @@ class DefaultModelLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with target_device:
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config, scheduler_config)
lora_config, cache_config,
scheduler_config)
model.load_weights(
self._get_weights_iterator(model_config.model,
model_config.revision,
@@ -379,15 +374,14 @@ class DummyModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config, scheduler_config)
lora_config, cache_config,
scheduler_config)
# NOTE(woosuk): For accurate performance evaluation, we assign
# random values to the weights.
initialize_dummy_weights(model)
@@ -420,7 +414,6 @@ class TensorizerLoader(BaseModelLoader):
model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig,
) -> nn.Module:
"""Load a serialized model with tensorizer to the CPU.
@@ -433,8 +426,7 @@ class TensorizerLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)

model.load_weights(self._get_weights_iterator())
return model.eval()
@@ -444,7 +436,6 @@ class TensorizerLoader(BaseModelLoader):
model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
cache_config: CacheConfig,
) -> nn.Module:
"""Load a serialized model with tensorizer.
@@ -458,7 +449,7 @@ class TensorizerLoader(BaseModelLoader):
quant_config = _get_quantization_config(
model_config, self.load_config)
extra_kwargs = _get_model_initialization_kwargs(
model_class, lora_config, multimodal_config)
model_class, lora_config, model_config.multimodal_config)
extra_kwargs["quant_config"] = quant_config
extra_kwargs["cache_config"] = cache_config
@@ -473,7 +464,6 @@ class TensorizerLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
@@ -487,11 +477,9 @@ class TensorizerLoader(BaseModelLoader):

if is_vllm_tensorized(self.tensorizer_config):
return self._load_model_serialized(model_config, device_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)
return self._load_model_serialized_cpu(model_config, device_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)

@staticmethod
def save_model(
@@ -577,7 +565,6 @@ class ShardedStateLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
@@ -591,8 +578,7 @@ class ShardedStateLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)
rank = get_tensor_model_parallel_rank()
pattern = os.path.join(
local_model_path,
@@ -955,15 +941,13 @@ class BitsAndBytesModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)

self._load_weights(model_config, model)
@@ -1032,7 +1016,6 @@ class GGUFModelLoader(BaseModelLoader):
def load_model(self, *, model_config: ModelConfig,
device_config: DeviceConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
cache_config: CacheConfig) -> nn.Module:
@@ -1047,8 +1030,7 @@ class GGUFModelLoader(BaseModelLoader):
with set_default_torch_dtype(model_config.dtype):
with torch.device(device_config.device):
model = _initialize_model(model_config, self.load_config,
lora_config, multimodal_config,
cache_config)
lora_config, cache_config)
model.load_weights(
self._get_weights_iterator(local_model_path, gguf_weights_map))
return model
@@ -9,17 +9,12 @@ from vllm.utils import is_hip

logger = init_logger(__name__)

# Architecture -> (module, class).
_GENERATION_MODELS = {
"AquilaModel": ("llama", "LlamaForCausalLM"),
"AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2
"BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b
"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b
"BloomForCausalLM": ("bloom", "BloomForCausalLM"),
"Blip2ForConditionalGeneration":
("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration":
("chameleon", "ChameleonForConditionalGeneration"),
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
"ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
"CohereForCausalLM": ("commandr", "CohereForCausalLM"),
@@ -28,7 +23,6 @@ _GENERATION_MODELS = {
"DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"),
"DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"),
"FalconForCausalLM": ("falcon", "FalconForCausalLM"),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"GemmaForCausalLM": ("gemma", "GemmaForCausalLM"),
"Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"),
"GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"),
@@ -37,13 +31,8 @@ _GENERATION_MODELS = {
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
"LlavaForConditionalGeneration":
("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration":
("llava_next", "LlavaNextForConditionalGeneration"),
# For decapoda-research/llama-*
"LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
"MistralForCausalLM": ("llama", "LlamaForCausalLM"),
@@ -53,17 +42,13 @@ _GENERATION_MODELS = {
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"),
"OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),
"PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
"PaliGemmaForConditionalGeneration": ("paligemma",
"PaliGemmaForConditionalGeneration"),
"PhiForCausalLM": ("phi", "PhiForCausalLM"),
"Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
@@ -83,6 +68,22 @@ _EMBEDDING_MODELS = {
"MistralModel": ("llama_embedding", "LlamaEmbeddingModel"),
}

_MULTIMODAL_MODELS = {
"Blip2ForConditionalGeneration":
("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration":
("chameleon", "ChameleonForConditionalGeneration"),
"FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"LlavaForConditionalGeneration":
("llava", "LlavaForConditionalGeneration"),
"LlavaNextForConditionalGeneration":
("llava_next", "LlavaNextForConditionalGeneration"),
"MiniCPMV": ("minicpmv", "MiniCPMV"),
"PaliGemmaForConditionalGeneration": ("paligemma",
"PaliGemmaForConditionalGeneration"),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
}
_CONDITIONAL_GENERATION_MODELS = {
"BartModel": ("bart", "BartForConditionalGeneration"),
"BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"),
@@ -91,7 +92,8 @@ _CONDITIONAL_GENERATION_MODELS = {
_MODELS = {
**_GENERATION_MODELS,
**_EMBEDDING_MODELS,
**_CONDITIONAL_GENERATION_MODELS
**_MULTIMODAL_MODELS,
**_CONDITIONAL_GENERATION_MODELS,
}

# Architecture -> type.
@@ -182,6 +184,15 @@ class ModelRegistry:
def is_embedding_model(model_arch: str) -> bool:
return model_arch in _EMBEDDING_MODELS

@staticmethod
def is_multimodal_model(model_arch: str) -> bool:

# TODO: find a way to avoid initializing CUDA prematurely to
# use `supports_multimodal` to determine if a model is multimodal
# model_cls = ModelRegistry._try_load_model_cls(model_arch)
# from vllm.model_executor.models.interfaces import supports_multimodal
return model_arch in _MULTIMODAL_MODELS


__all__ = [
"ModelRegistry",
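A minimal usage sketch of the registry helper introduced above, with architecture names taken from the _MULTIMODAL_MODELS table in this hunk:

    from vllm.model_executor.models import ModelRegistry

    assert ModelRegistry.is_multimodal_model("LlavaForConditionalGeneration")
    assert ModelRegistry.is_multimodal_model("Phi3VForCausalLM")
    assert not ModelRegistry.is_multimodal_model("OPTForCausalLM")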
@@ -2,7 +2,7 @@ import functools
from collections import UserDict
from typing import Dict, Mapping, Optional, Sequence

from vllm.config import ModelConfig, MultiModalConfig
from vllm.config import ModelConfig
from vllm.logger import init_logger

from .audio import AudioPlugin
@@ -181,7 +181,6 @@ class MultiModalRegistry:
def init_mm_limits_per_prompt(
self,
model_config: ModelConfig,
multimodal_config: Optional[MultiModalConfig],
) -> None:
"""
Initialize the maximum number of multi-modal input instances for each
@@ -192,6 +191,7 @@ class MultiModalRegistry:
"`mm_limits` has already been set for model=%s, and will "
"be overwritten by the new values.", model_config.model)

multimodal_config = model_config.multimodal_config
if multimodal_config is None:
limits_per_plugin = self._disabled_limits_per_plugin
else:
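After this change the registry reads the limits from model_config.multimodal_config itself, so the call site shrinks to a single argument, matching the test updates earlier in this diff. A hedged sketch, assuming model_config was built with limit_mm_per_prompt as in the ModelConfig example above:

    from vllm.multimodal import MULTIMODAL_REGISTRY

    # model_config: a ModelConfig whose limit_mm_per_prompt was set at construction.
    MULTIMODAL_REGISTRY.init_mm_limits_per_prompt(model_config)
    mapper = MULTIMODAL_REGISTRY.create_input_mapper(model_config)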
@@ -23,8 +23,8 @@ except ImportError:
FLASHINFER_WORKSPACE_BUFFER_SIZE = 0

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig)
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.multimodal import MultiModalInputs
from vllm.sequence import (ExecuteModelRequest, IntermediateTensors,
@@ -66,7 +66,6 @@ class TP1DraftModelRunner(ModelRunner):
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
multimodal_config: Optional[MultiModalConfig] = None,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
return_hidden_states: bool = False,
observability_config: Optional[ObservabilityConfig] = None,
@@ -86,7 +85,6 @@ class TP1DraftModelRunner(ModelRunner):
lora_config=lora_config,
kv_cache_dtype=kv_cache_dtype,
is_driver_worker=is_driver_worker,
multimodal_config=multimodal_config,
prompt_adapter_config=prompt_adapter_config,
return_hidden_states=return_hidden_states,
observability_config=observability_config,
@@ -1,8 +1,8 @@
from typing import List, Optional

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig)
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
from vllm.sequence import SequenceGroupMetadata
from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata,
ModelRunner)
@@ -31,7 +31,6 @@ class TargetModelRunner(ModelRunner):
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
return_hidden_states: bool = False,
observability_config: Optional[ObservabilityConfig] = None):
# An internal boolean member variable to indicate if token log
@@ -47,7 +46,6 @@ class TargetModelRunner(ModelRunner):
lora_config=lora_config,
kv_cache_dtype=kv_cache_dtype,
is_driver_worker=is_driver_worker,
multimodal_config=multimodal_config,
prompt_adapter_config=prompt_adapter_config,
return_hidden_states=return_hidden_states,
observability_config=observability_config,
@@ -6,8 +6,8 @@ from torch import nn

from vllm.attention import AttentionMetadata, get_attn_backend
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
ModelConfig, ParallelConfig, PromptAdapterConfig,
SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.model_loader import get_model
@@ -79,7 +79,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
cache_config: CacheConfig,
load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
multimodal_config: Optional[MultiModalConfig],
kv_cache_dtype: Optional[str] = "auto",
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
is_driver_worker: bool = False,
@@ -94,7 +93,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
self.device_config = device_config
self.cache_config = cache_config
self.lora_config = lora_config
self.multimodal_config = multimodal_config
self.prompt_adapter_config = prompt_adapter_config
self.load_config = load_config
self.is_driver_worker = is_driver_worker
@@ -125,7 +123,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]):
self.model = get_model(model_config=self.model_config,
load_config=self.load_config,
device_config=self.device_config,
multimodal_config=self.multimodal_config,
lora_config=self.lora_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
@@ -7,8 +7,8 @@ import torch.distributed
import vllm.envs as envs
from vllm.attention import get_attn_backend
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
ModelConfig, ParallelConfig, PromptAdapterConfig,
SchedulerConfig)
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.logger import init_logger
@@ -132,7 +132,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
rank: int,
distributed_init_method: str,
lora_config: Optional[LoRAConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
kv_cache_dtype: Optional[str] = "auto",
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
is_driver_worker: bool = False,
@@ -148,7 +147,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self.distributed_init_method = distributed_init_method
self.lora_config = lora_config
self.prompt_adapter_config = prompt_adapter_config
self.multimodal_config = multimodal_config
self.is_driver_worker = is_driver_worker
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."
@@ -173,7 +171,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
cache_config,
load_config=self.load_config,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
kv_cache_dtype=kv_cache_dtype,
prompt_adapter_config=self.prompt_adapter_config,
is_driver_worker=is_driver_worker)
@@ -4,8 +4,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type
import torch

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig)
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.multimodal import MultiModalInputs
@@ -44,7 +44,6 @@ class EmbeddingModelRunner(
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
observability_config: Optional[ObservabilityConfig] = None,
):
super().__init__(model_config,
@@ -57,7 +56,6 @@ class EmbeddingModelRunner(
kv_cache_dtype=kv_cache_dtype,
is_driver_worker=is_driver_worker,
prompt_adapter_config=prompt_adapter_config,
multimodal_config=multimodal_config,
observability_config=observability_config)

@torch.inference_mode()
@@ -10,8 +10,8 @@ from vllm.attention.selector import (_Backend, get_env_variable_attn_backend,
get_global_forced_attn_backend,
global_force_attn_backend)
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig)
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
from vllm.inputs import INPUT_REGISTRY, InputRegistry
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
@@ -82,7 +82,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
observability_config: Optional[ObservabilityConfig] = None,
input_registry: InputRegistry = INPUT_REGISTRY,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
@@ -90,7 +89,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
'''
EncoderDecoderModelRunner constructor.

`lora_config`, `multimodal_config`, and prompt_adapter_config are
`lora_config` and `prompt_adapter_config` are
unused (since these features are not yet supported for encoder/decoder
models) but these arguments are present here for compatibility with
the base-class constructor.
@@ -273,14 +272,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
# number of tokens equal to max_num_batched_tokens.
seqs: List[SequenceGroupMetadata] = []

model_config = self.model_config
mm_config = self.multimodal_config

input_registry = self.input_registry
mm_registry = self.mm_registry
mm_registry.init_mm_limits_per_prompt(model_config, mm_config)

max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config)
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config)
if max_mm_tokens > 0:
raise NotImplementedError(
"Multi-modal encoder-decoder models are not supported yet")
@@ -291,8 +284,10 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]):
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len

seq_data, _ = input_registry \
.dummy_data_for_profiling(model_config, seq_len, mm_registry)
seq_data, _ = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry)

# Having more tokens is over-conservative but otherwise fine
assert len(seq_data.prompt_token_ids) >= seq_len, (
@@ -27,8 +27,8 @@ except ImportError:
import vllm.envs as envs
from vllm.attention import AttentionMetadata, get_attn_backend
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig)
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig)
from vllm.distributed import get_pp_group
from vllm.distributed.parallel_state import graph_capture
from vllm.inputs import INPUT_REGISTRY, InputRegistry
@@ -804,7 +804,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
return_hidden_states: bool = False,
observability_config: Optional[ObservabilityConfig] = None,
input_registry: InputRegistry = INPUT_REGISTRY,
@@ -819,7 +818,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.load_config = load_config
self.is_driver_worker = is_driver_worker
self.prompt_adapter_config = prompt_adapter_config
self.multimodal_config = multimodal_config
self.return_hidden_states = return_hidden_states
self.observability_config = observability_config
@@ -866,6 +864,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
self.mm_registry = mm_registry
self.multi_modal_input_mapper = mm_registry \
.create_input_mapper(model_config)
self.mm_registry.init_mm_limits_per_prompt(self.model_config)

# Lazy initialization
self.model: nn.Module # Set after load_model
@@ -893,7 +892,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
device_config=self.device_config,
load_config=self.load_config,
lora_config=self.lora_config,
multimodal_config=self.multimodal_config,
parallel_config=self.parallel_config,
scheduler_config=self.scheduler_config,
cache_config=self.cache_config)
@@ -1056,14 +1054,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
# To exercise the worst scenario for GPU memory consumption,
# the number of seqs (batch_size) is chosen to maximize the number
# of images processed.
model_config = self.model_config
mm_config = self.multimodal_config

input_registry = self.input_registry
mm_registry = self.mm_registry
mm_registry.init_mm_limits_per_prompt(model_config, mm_config)

max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config)
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config)
if max_mm_tokens > 0:
max_num_seqs_orig = max_num_seqs
max_num_seqs = min(max_num_seqs,
@@ -1082,8 +1075,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]):
(group_id < max_num_batched_tokens % max_num_seqs))
batch_size += seq_len

seq_data, dummy_multi_modal_data = input_registry \
.dummy_data_for_profiling(model_config, seq_len, mm_registry)
seq_data, dummy_multi_modal_data = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry)

seq = SequenceGroupMetadata(
request_id=str(group_id),
@@ -11,7 +11,7 @@ import torch_xla.runtime as xr

from vllm.attention import AttentionMetadata, get_attn_backend
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
MultiModalConfig, ParallelConfig, SchedulerConfig)
ParallelConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor.model_loader import get_model
from vllm.model_executor.sampling_metadata import SamplingMetadata
@@ -89,7 +89,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
device_config: DeviceConfig,
cache_config: CacheConfig,
load_config: LoadConfig,
multimodal_config: Optional[MultiModalConfig] = None,
is_driver_worker: bool = False,
):
self.model_config = model_config
@@ -98,7 +97,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
self.device_config = device_config
self.cache_config = cache_config
self.load_config = load_config
self.multimodal_config = multimodal_config
self.is_driver_worker = is_driver_worker

self.block_size = self.cache_config.block_size
@@ -142,7 +140,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]):
parallel_config=self.parallel_config,
cache_config=self.cache_config,
scheduler_config=self.scheduler_config,
multimodal_config=self.multimodal_config,
lora_config=None,
)
model = model.eval()
@@ -7,7 +7,7 @@ import torch_xla.runtime as xr

import vllm.envs as envs
from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
MultiModalConfig, ParallelConfig, SchedulerConfig)
ParallelConfig, SchedulerConfig)
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment)
from vllm.logger import init_logger
@@ -31,7 +31,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
device_config: DeviceConfig,
cache_config: CacheConfig,
load_config: LoadConfig,
multimodal_config: Optional[MultiModalConfig],
local_rank: int,
rank: int,
distributed_init_method: str,
@@ -44,7 +43,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
self.device_config = device_config
self.cache_config = cache_config
self.load_config = load_config
self.multimodal_config = multimodal_config
self.local_rank = local_rank
self.rank = rank
self.distributed_init_method = distributed_init_method
@@ -64,7 +62,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase):
device_config,
cache_config,
load_config,
multimodal_config,
is_driver_worker=is_driver_worker)

def init_device(self) -> None:
@@ -39,7 +39,7 @@ def assert_enc_dec_mr_supported_scenario(
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP'])

if enc_dec_mr.multimodal_config is not None:
if enc_dec_mr.model_config.multimodal_config is not None:
raise NotImplementedError(
STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_MM'])
@@ -7,8 +7,8 @@ import torch
import torch.distributed

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
ModelConfig, MultiModalConfig, ObservabilityConfig,
ParallelConfig, PromptAdapterConfig, SchedulerConfig,
ModelConfig, ObservabilityConfig, ParallelConfig,
PromptAdapterConfig, SchedulerConfig,
SpeculativeConfig)
from vllm.distributed import (ensure_model_parallel_initialized,
init_distributed_environment,
@@ -46,7 +46,6 @@ class Worker(LocalOrDistributedWorkerBase):
rank: int,
distributed_init_method: str,
lora_config: Optional[LoRAConfig] = None,
multimodal_config: Optional[MultiModalConfig] = None,
speculative_config: Optional[SpeculativeConfig] = None,
prompt_adapter_config: Optional[PromptAdapterConfig] = None,
is_driver_worker: bool = False,
@@ -73,7 +72,6 @@ class Worker(LocalOrDistributedWorkerBase):
# note: lazy import to avoid importing torch before initializing
from vllm.utils import init_cached_hf_modules
init_cached_hf_modules()
self.multimodal_config = multimodal_config
self.observability_config = observability_config

# Return hidden states from target model if the draft model is an
@@ -103,7 +101,6 @@ class Worker(LocalOrDistributedWorkerBase):
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=is_driver_worker,
prompt_adapter_config=prompt_adapter_config,
multimodal_config=multimodal_config,
observability_config=observability_config,
**speculative_args,
)
@@ -125,6 +125,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
self.mm_registry = mm_registry
self.multi_modal_input_mapper = mm_registry \
.create_input_mapper(model_config)
self.mm_registry.init_mm_limits_per_prompt(self.model_config)

# Lazy initialization.
self.model: nn.Module # Set after init_Model
@@ -166,14 +167,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
# To exercise the worst scenario for GPU memory consumption,
# the number of seqs (batch_size) is chosen to maximize the number
# of images processed.
model_config = self.model_config
mm_config = self.multimodal_config

input_registry = self.input_registry
mm_registry = self.mm_registry
mm_registry.init_mm_limits_per_prompt(model_config, mm_config)

max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config)
max_mm_tokens = self.mm_registry.get_max_multimodal_tokens(
self.model_config)
if max_mm_tokens > 0:
max_num_seqs_orig = max_num_seqs
max_num_seqs = min(max_num_seqs,
@@ -190,8 +185,10 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]):
seq_len = (max_num_batched_tokens // max_num_seqs +
(group_id < max_num_batched_tokens % max_num_seqs))

seq_data, dummy_multi_modal_data = input_registry \
.dummy_data_for_profiling(model_config, seq_len, mm_registry)
seq_data, dummy_multi_modal_data = self.input_registry \
.dummy_data_for_profiling(self.model_config,
seq_len,
self.mm_registry)

seq = SequenceGroupMetadata(
request_id=str(group_id),