From bbf55c4805efba5f1d7094f5e2888b3ef26c0fd7 Mon Sep 17 00:00:00 2001 From: Roger Wang <136131678+ywang96@users.noreply.github.com> Date: Sat, 17 Aug 2024 13:30:55 -0700 Subject: [PATCH] [VLM] Refactor `MultiModalConfig` initialization and profiling (#7530) --- tests/entrypoints/openai/test_audio.py | 8 +++- tests/multimodal/test_mapper.py | 18 ++++---- vllm/config.py | 38 +++++++++++++--- vllm/engine/arg_utils.py | 9 ++-- vllm/engine/llm_engine.py | 7 +-- vllm/executor/cpu_executor.py | 1 - vllm/executor/executor_base.py | 6 +-- vllm/executor/gpu_executor.py | 1 - vllm/executor/openvino_executor.py | 1 - vllm/executor/ray_xpu_executor.py | 8 +--- vllm/executor/tpu_executor.py | 1 - vllm/executor/xpu_executor.py | 7 +-- vllm/inputs/registry.py | 16 +------ vllm/model_executor/model_loader/__init__.py | 5 +-- vllm/model_executor/model_loader/loader.py | 46 ++++++-------------- vllm/model_executor/models/__init__.py | 43 +++++++++++------- vllm/multimodal/registry.py | 4 +- vllm/spec_decode/draft_model_runner.py | 6 +-- vllm/spec_decode/target_model_runner.py | 6 +-- vllm/worker/cpu_model_runner.py | 7 +-- vllm/worker/cpu_worker.py | 7 +-- vllm/worker/embedding_model_runner.py | 6 +-- vllm/worker/enc_dec_model_runner.py | 23 ++++------ vllm/worker/model_runner.py | 23 ++++------ vllm/worker/tpu_model_runner.py | 5 +-- vllm/worker/tpu_worker.py | 5 +-- vllm/worker/utils.py | 2 +- vllm/worker/worker.py | 7 +-- vllm/worker/xpu_model_runner.py | 17 +++----- 29 files changed, 143 insertions(+), 190 deletions(-) diff --git a/tests/entrypoints/openai/test_audio.py b/tests/entrypoints/openai/test_audio.py index 3c2c652fd317d..39b47f3033715 100644 --- a/tests/entrypoints/openai/test_audio.py +++ b/tests/entrypoints/openai/test_audio.py @@ -86,8 +86,12 @@ def server_function(port): ModelRegistry.register_model("OPTForCausalLM", FakeAudioModel) - with patch("vllm.entrypoints.chat_utils._mm_token_str", - lambda *_, **__: "_"): + with patch( + "vllm.entrypoints.chat_utils._mm_token_str", + lambda *_, **__: "_"), patch( + "vllm.model_executor.models.ModelRegistry.is_multimodal_model" + ) as mock: + mock.return_value = True sys.argv = ["placeholder.py"] + \ (f"--model {MODEL_NAME} --gpu-memory-utilization 0.10 " "--dtype bfloat16 --enforce-eager --api-key token-abc123 " diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 6b0c02c799c4a..7d09b81060efd 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -4,7 +4,7 @@ import numpy as np import pytest from transformers import CLIPImageProcessor, LlavaNextImageProcessor -from vllm.config import ModelConfig, MultiModalConfig +from vllm.config import ModelConfig from vllm.multimodal import MultiModalRegistry from vllm.multimodal.utils import rescale_image_size @@ -30,10 +30,10 @@ def test_clip_image_processor(image_assets, mm_registry, dtype, size_factor): seed=0, dtype=dtype, revision=None, + limit_mm_per_prompt={"image": 1}, ) - mm_config = MultiModalConfig(limit_per_prompt={"image": 1}) - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) + mm_registry.init_mm_limits_per_prompt(model_config) for asset in image_assets: image = rescale_image_size(asset.pil_image, size_factor) @@ -73,10 +73,10 @@ def test_llava_next_image_processor(image_assets, mm_registry, dtype, seed=0, dtype=dtype, revision=None, + limit_mm_per_prompt={"image": 1}, ) - mm_config = MultiModalConfig(limit_per_prompt={"image": 1}) - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) + mm_registry.init_mm_limits_per_prompt(model_config) for asset in image_assets: image = rescale_image_size(asset.pil_image, size_factor) @@ -115,10 +115,10 @@ def test_mm_limits(image_assets, mm_registry, num_images, limit, is_valid): seed=0, dtype="half", revision=None, + limit_mm_per_prompt={"image": limit}, ) - mm_config = MultiModalConfig(limit_per_prompt={"image": limit}) - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) + mm_registry.init_mm_limits_per_prompt(model_config) image = image_assets[0].pil_image if num_images == 0: @@ -145,10 +145,10 @@ def test_image_mapper_multi(image_assets, mm_registry, num_images): seed=0, dtype="half", revision=None, + limit_mm_per_prompt={"image": num_images}, ) - mm_config = MultiModalConfig(limit_per_prompt={"image": num_images}) - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) + mm_registry.init_mm_limits_per_prompt(model_config) image = image_assets[0].pil_image mm_inputs = {"image": [image] * num_images} diff --git a/vllm/config.py b/vllm/config.py index e03adb5f5c963..beb77f2bd905f 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -109,6 +109,8 @@ class ModelConfig: matches the model name exposed via the APIs. If multiple model names provided, the first name will be used. If not specified, the model name will be the same as `model`. + limit_mm_per_prompt: Maximum number of data instances per modality + per prompt. Only applicable for multimodal models. """ def __init__( @@ -134,7 +136,7 @@ class ModelConfig: disable_sliding_window: bool = False, skip_tokenizer_init: bool = False, served_model_name: Optional[Union[str, List[str]]] = None, - multimodal_config: Optional["MultiModalConfig"] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, ) -> None: self.model = model self.tokenizer = tokenizer @@ -211,14 +213,29 @@ class ModelConfig: sliding_window_len=self.get_hf_config_sliding_window()) self.served_model_name = get_served_model_name(model, served_model_name) - self.multimodal_config = multimodal_config - + self.multimodal_config = self._init_multimodal_config( + limit_mm_per_prompt) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() + def _init_multimodal_config( + self, limit_mm_per_prompt: Optional[Mapping[str, int]] + ) -> Optional["MultiModalConfig"]: + architectures = getattr(self.hf_config, "architectures", []) + if any( + ModelRegistry.is_multimodal_model(arch) + for arch in architectures): + return MultiModalConfig(limit_per_prompt=limit_mm_per_prompt or {}) + else: + if limit_mm_per_prompt: + raise ValueError( + "limit_mm_per_prompt is only supported for multimodal " + "models.") + return None + def _verify_tokenizer_mode(self) -> None: tokenizer_mode = self.tokenizer_mode.lower() if tokenizer_mode not in ["auto", "slow"]: @@ -467,6 +484,18 @@ class ModelConfig: if t != "attention" ]) + def get_multimodal_config(self) -> "MultiModalConfig": + """ + Get the multimodal configuration of the model. + + Raises: + ValueError: If the model is not multimodal. + """ + if self.multimodal_config is None: + raise ValueError("The model is not multimodal.") + + return self.multimodal_config + @property def is_encoder_decoder_model(self) -> bool: """Extract the HF encoder/decoder model flag.""" @@ -1450,7 +1479,7 @@ class PromptAdapterConfig: class MultiModalConfig: """Controls the behavior of multimodal models.""" - limit_per_prompt: Mapping[str, int] + limit_per_prompt: Mapping[str, int] = field(default_factory=dict) """ The maximum number of multi-modal input instances allowed per prompt for each :class:`~vllm.multimodal.MultiModalPlugin`. @@ -1710,7 +1739,6 @@ class EngineConfig: device_config: DeviceConfig load_config: LoadConfig lora_config: Optional[LoRAConfig] - multimodal_config: Optional[MultiModalConfig] speculative_config: Optional[SpeculativeConfig] decoding_config: Optional[DecodingConfig] observability_config: Optional[ObservabilityConfig] diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 6c7259129a109..cd1aeb904ff38 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -7,7 +7,7 @@ from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type, from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - MultiModalConfig, ObservabilityConfig, ParallelConfig, + ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig, TokenizerPoolConfig) from vllm.executor.executor_base import ExecutorBase @@ -765,9 +765,6 @@ class EngineArgs: "CPU offload space must be non-negative" f", but got {self.cpu_offload_gb}") - multimodal_config = MultiModalConfig( - limit_per_prompt=self.limit_mm_per_prompt or {}) - device_config = DeviceConfig(device=self.device) model_config = ModelConfig( model=self.model, @@ -791,7 +788,8 @@ class EngineArgs: disable_sliding_window=self.disable_sliding_window, skip_tokenizer_init=self.skip_tokenizer_init, served_model_name=self.served_model_name, - multimodal_config=multimodal_config) + limit_mm_per_prompt=self.limit_mm_per_prompt, + ) cache_config = CacheConfig( block_size=self.block_size, gpu_memory_utilization=self.gpu_memory_utilization, @@ -970,7 +968,6 @@ class EngineArgs: scheduler_config=scheduler_config, device_config=device_config, lora_config=lora_config, - multimodal_config=multimodal_config, speculative_config=speculative_config, load_config=load_config, decoding_config=decoding_config, diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 979555eb6a05d..4ddb80ff7de1a 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -10,7 +10,7 @@ from typing_extensions import assert_never import vllm.envs as envs from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoadConfig, LoRAConfig, ModelConfig, - MultiModalConfig, ObservabilityConfig, ParallelConfig, + ObservabilityConfig, ParallelConfig, PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.core.scheduler import (ScheduledSequenceGroup, Scheduler, @@ -100,8 +100,6 @@ class LLMEngine: scheduler_config: The configuration related to the request scheduler. device_config: The configuration related to the device. lora_config (Optional): The configuration related to serving multi-LoRA. - multimodal_config (Optional): The configuration related to multimodal - models. speculative_config (Optional): The configuration related to speculative decoding. executor_class: The model executor class for managing distributed @@ -172,7 +170,6 @@ class LLMEngine: device_config: DeviceConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], speculative_config: Optional[SpeculativeConfig], decoding_config: Optional[DecodingConfig], observability_config: Optional[ObservabilityConfig], @@ -235,7 +232,6 @@ class LLMEngine: self.model_config = model_config self.cache_config = cache_config self.lora_config = lora_config - self.multimodal_config = multimodal_config self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config @@ -278,7 +274,6 @@ class LLMEngine: scheduler_config=scheduler_config, device_config=device_config, lora_config=lora_config, - multimodal_config=multimodal_config, speculative_config=speculative_config, load_config=load_config, prompt_adapter_config=prompt_adapter_config, diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py index f58aaf8a55b98..37d12725bd1e4 100644 --- a/vllm/executor/cpu_executor.py +++ b/vllm/executor/cpu_executor.py @@ -141,7 +141,6 @@ class CPUExecutor(ExecutorBase): rank=rank, distributed_init_method=self.distributed_init_method, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=rank == 0, diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py index bc4f544554ae4..422bef107f352 100644 --- a/vllm/executor/executor_base.py +++ b/vllm/executor/executor_base.py @@ -2,8 +2,8 @@ from abc import ABC, abstractmethod from typing import List, Optional, Set, Tuple from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.lora.request import LoRARequest from vllm.prompt_adapter.request import PromptAdapterRequest @@ -29,7 +29,6 @@ class ExecutorBase(ABC): device_config: DeviceConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], speculative_config: Optional[SpeculativeConfig], prompt_adapter_config: Optional[PromptAdapterConfig], observability_config: Optional[ObservabilityConfig], @@ -41,7 +40,6 @@ class ExecutorBase(ABC): self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config - self.multimodal_config = multimodal_config self.speculative_config = speculative_config self.prompt_adapter_config = prompt_adapter_config self.observability_config = observability_config diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py index 57b9e2b33b982..55976f430254c 100644 --- a/vllm/executor/gpu_executor.py +++ b/vllm/executor/gpu_executor.py @@ -55,7 +55,6 @@ class GPUExecutor(ExecutorBase): rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, speculative_config=self.speculative_config, prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=(not self.parallel_config) diff --git a/vllm/executor/openvino_executor.py b/vllm/executor/openvino_executor.py index 7df515a2a5ce7..867859d8d3d79 100644 --- a/vllm/executor/openvino_executor.py +++ b/vllm/executor/openvino_executor.py @@ -49,7 +49,6 @@ class OpenVINOExecutor(ExecutorBase): rank=0, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=True, ) diff --git a/vllm/executor/ray_xpu_executor.py b/vllm/executor/ray_xpu_executor.py index bdd8ba9032766..938f83bc1338b 100644 --- a/vllm/executor/ray_xpu_executor.py +++ b/vllm/executor/ray_xpu_executor.py @@ -7,9 +7,8 @@ from typing import (TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, Set, import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) + ModelConfig, ParallelConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig) from vllm.executor.distributed_gpu_executor import ( # yapf: disable DistributedGPUExecutor, DistributedGPUExecutorAsync) from vllm.executor.ray_utils import RayWorkerWrapper, ray @@ -46,7 +45,6 @@ class RayXPUExecutor(DistributedGPUExecutor): device_config: DeviceConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], prompt_adapter_config: Optional[PromptAdapterConfig], speculative_config: Optional[SpeculativeConfig], ) -> None: @@ -61,7 +59,6 @@ class RayXPUExecutor(DistributedGPUExecutor): self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config - self.multimodal_config = multimodal_config self.prompt_adapter_config = prompt_adapter_config placement_group = self.parallel_config.placement_group @@ -203,7 +200,6 @@ class RayXPUExecutor(DistributedGPUExecutor): rank=rank, distributed_init_method=distributed_init_method, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, is_driver_worker=rank == 0, )) self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) diff --git a/vllm/executor/tpu_executor.py b/vllm/executor/tpu_executor.py index 1b5bb5c755ef2..253c8abdc1ada 100644 --- a/vllm/executor/tpu_executor.py +++ b/vllm/executor/tpu_executor.py @@ -52,7 +52,6 @@ class TPUExecutor(ExecutorBase): local_rank=local_rank, rank=rank, distributed_init_method=distributed_init_method, - multimodal_config=self.multimodal_config, is_driver_worker=rank == 0, ) diff --git a/vllm/executor/xpu_executor.py b/vllm/executor/xpu_executor.py index 9feae6a05ba9b..687f938cfb937 100644 --- a/vllm/executor/xpu_executor.py +++ b/vllm/executor/xpu_executor.py @@ -3,9 +3,8 @@ from typing import List, Optional import torch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig, - SpeculativeConfig) + ModelConfig, ParallelConfig, PromptAdapterConfig, + SchedulerConfig, SpeculativeConfig) from vllm.executor.executor_base import ExecutorAsyncBase from vllm.executor.gpu_executor import GPUExecutor from vllm.logger import init_logger @@ -29,7 +28,6 @@ class XPUExecutor(GPUExecutor): device_config: DeviceConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], prompt_adapter_config: Optional[PromptAdapterConfig], speculative_config: Optional[SpeculativeConfig], ) -> None: @@ -46,7 +44,6 @@ class XPUExecutor(GPUExecutor): self.parallel_config = parallel_config self.scheduler_config = scheduler_config self.device_config = device_config - self.multimodal_config = multimodal_config self.prompt_adapter_config = prompt_adapter_config self.speculative_config = None diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2ca8b10f71593..28ce0ef86e798 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -13,7 +13,7 @@ from vllm.logger import init_logger from .data import LLMInputs if TYPE_CHECKING: - from vllm.config import ModelConfig, MultiModalConfig + from vllm.config import ModelConfig from vllm.multimodal import MultiModalDataDict, MultiModalRegistry from vllm.sequence import SequenceData @@ -32,20 +32,6 @@ class InputContext: model_config: "ModelConfig" """The configuration of the model.""" - def get_multimodal_config(self) -> "MultiModalConfig": - """ - Get the multimodal configuration of the model. - - Raises: - ValueError: If the model is not multimodal. - """ - - multimodal_config = self.model_config.multimodal_config - if multimodal_config is None: - raise ValueError("No multimodal config found") - - return multimodal_config - def get_hf_config(self, hf_config_type: Type[C] = PretrainedConfig) -> C: """ Get the HuggingFace configuration diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py index d10107a7f024e..d1ec171c9ec2a 100644 --- a/vllm/model_executor/model_loader/__init__.py +++ b/vllm/model_executor/model_loader/__init__.py @@ -3,8 +3,7 @@ from typing import Optional from torch import nn from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - SchedulerConfig) + ModelConfig, ParallelConfig, SchedulerConfig) from vllm.model_executor.model_loader.loader import (BaseModelLoader, get_model_loader) from vllm.model_executor.model_loader.utils import ( @@ -15,13 +14,11 @@ def get_model(*, model_config: ModelConfig, load_config: LoadConfig, device_config: DeviceConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], cache_config: CacheConfig) -> nn.Module: loader = get_model_loader(load_config) return loader.load_model(model_config=model_config, device_config=device_config, lora_config=lora_config, - multimodal_config=multimodal_config, parallel_config=parallel_config, scheduler_config=scheduler_config, cache_config=cache_config) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 979d0bb13ba51..d0427fb9b16af 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -132,9 +132,7 @@ def _get_model_initialization_kwargs( "please open an issue on github.") if supports_multimodal(model_class): - if multimodal_config is None: - raise ValueError("Provide multi-modal related configurations " - "through LLM entrypoint or engine arguments.") + assert multimodal_config is not None extra_kwargs["multimodal_config"] = multimodal_config @@ -164,7 +162,6 @@ def _initialize_model( model_config: ModelConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], cache_config: CacheConfig, scheduler_config: Optional[SchedulerConfig] = None) -> nn.Module: """Initialize a model with the given configurations.""" @@ -173,10 +170,10 @@ def _initialize_model( return build_model( model_class, model_config.hf_config, + cache_config=cache_config, quant_config=_get_quantization_config(model_config, load_config), lora_config=lora_config, - multimodal_config=multimodal_config, - cache_config=cache_config, + multimodal_config=model_config.multimodal_config, scheduler_config=scheduler_config, ) @@ -191,7 +188,6 @@ class BaseModelLoader(ABC): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: @@ -336,7 +332,6 @@ class DefaultModelLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: @@ -344,8 +339,8 @@ class DefaultModelLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with target_device: model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config, scheduler_config) + lora_config, cache_config, + scheduler_config) model.load_weights( self._get_weights_iterator(model_config.model, model_config.revision, @@ -379,15 +374,14 @@ class DummyModelLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config, scheduler_config) + lora_config, cache_config, + scheduler_config) # NOTE(woosuk): For accurate performance evaluation, we assign # random values to the weights. initialize_dummy_weights(model) @@ -420,7 +414,6 @@ class TensorizerLoader(BaseModelLoader): model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], cache_config: CacheConfig, ) -> nn.Module: """Load a serialized model with tensorizer to the CPU. @@ -433,8 +426,7 @@ class TensorizerLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) model.load_weights(self._get_weights_iterator()) return model.eval() @@ -444,7 +436,6 @@ class TensorizerLoader(BaseModelLoader): model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], cache_config: CacheConfig, ) -> nn.Module: """Load a serialized model with tensorizer. @@ -458,7 +449,7 @@ class TensorizerLoader(BaseModelLoader): quant_config = _get_quantization_config( model_config, self.load_config) extra_kwargs = _get_model_initialization_kwargs( - model_class, lora_config, multimodal_config) + model_class, lora_config, model_config.multimodal_config) extra_kwargs["quant_config"] = quant_config extra_kwargs["cache_config"] = cache_config @@ -473,7 +464,6 @@ class TensorizerLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: @@ -487,11 +477,9 @@ class TensorizerLoader(BaseModelLoader): if is_vllm_tensorized(self.tensorizer_config): return self._load_model_serialized(model_config, device_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) return self._load_model_serialized_cpu(model_config, device_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) @staticmethod def save_model( @@ -577,7 +565,6 @@ class ShardedStateLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: @@ -591,8 +578,7 @@ class ShardedStateLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) rank = get_tensor_model_parallel_rank() pattern = os.path.join( local_model_path, @@ -955,15 +941,13 @@ class BitsAndBytesModelLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) self._load_weights(model_config, model) @@ -1032,7 +1016,6 @@ class GGUFModelLoader(BaseModelLoader): def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], parallel_config: ParallelConfig, scheduler_config: SchedulerConfig, cache_config: CacheConfig) -> nn.Module: @@ -1047,8 +1030,7 @@ class GGUFModelLoader(BaseModelLoader): with set_default_torch_dtype(model_config.dtype): with torch.device(device_config.device): model = _initialize_model(model_config, self.load_config, - lora_config, multimodal_config, - cache_config) + lora_config, cache_config) model.load_weights( self._get_weights_iterator(local_model_path, gguf_weights_map)) return model diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 46aa62e24e8af..32cafa845a6e3 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -9,17 +9,12 @@ from vllm.utils import is_hip logger = init_logger(__name__) -# Architecture -> (module, class). _GENERATION_MODELS = { "AquilaModel": ("llama", "LlamaForCausalLM"), "AquilaForCausalLM": ("llama", "LlamaForCausalLM"), # AquilaChat2 "BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b "BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b "BloomForCausalLM": ("bloom", "BloomForCausalLM"), - "Blip2ForConditionalGeneration": - ("blip2", "Blip2ForConditionalGeneration"), - "ChameleonForConditionalGeneration": - ("chameleon", "ChameleonForConditionalGeneration"), "ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"), "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"), "CohereForCausalLM": ("commandr", "CohereForCausalLM"), @@ -28,7 +23,6 @@ _GENERATION_MODELS = { "DeepseekForCausalLM": ("deepseek", "DeepseekForCausalLM"), "DeepseekV2ForCausalLM": ("deepseek_v2", "DeepseekV2ForCausalLM"), "FalconForCausalLM": ("falcon", "FalconForCausalLM"), - "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "GemmaForCausalLM": ("gemma", "GemmaForCausalLM"), "Gemma2ForCausalLM": ("gemma2", "Gemma2ForCausalLM"), "GPT2LMHeadModel": ("gpt2", "GPT2LMHeadModel"), @@ -37,13 +31,8 @@ _GENERATION_MODELS = { "GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"), "InternLMForCausalLM": ("llama", "LlamaForCausalLM"), "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"), - "InternVLChatModel": ("internvl", "InternVLChatModel"), "JAISLMHeadModel": ("jais", "JAISLMHeadModel"), "LlamaForCausalLM": ("llama", "LlamaForCausalLM"), - "LlavaForConditionalGeneration": - ("llava", "LlavaForConditionalGeneration"), - "LlavaNextForConditionalGeneration": - ("llava_next", "LlavaNextForConditionalGeneration"), # For decapoda-research/llama-* "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"), "MistralForCausalLM": ("llama", "LlamaForCausalLM"), @@ -53,17 +42,13 @@ _GENERATION_MODELS = { "MptForCausalLM": ("mpt", "MPTForCausalLM"), "MPTForCausalLM": ("mpt", "MPTForCausalLM"), "MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"), - "MiniCPMV": ("minicpmv", "MiniCPMV"), "NemotronForCausalLM": ("nemotron", "NemotronForCausalLM"), "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"), "OPTForCausalLM": ("opt", "OPTForCausalLM"), "OrionForCausalLM": ("orion", "OrionForCausalLM"), "PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"), - "PaliGemmaForConditionalGeneration": ("paligemma", - "PaliGemmaForConditionalGeneration"), "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), - "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), @@ -83,6 +68,22 @@ _EMBEDDING_MODELS = { "MistralModel": ("llama_embedding", "LlamaEmbeddingModel"), } +_MULTIMODAL_MODELS = { + "Blip2ForConditionalGeneration": + ("blip2", "Blip2ForConditionalGeneration"), + "ChameleonForConditionalGeneration": + ("chameleon", "ChameleonForConditionalGeneration"), + "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), + "InternVLChatModel": ("internvl", "InternVLChatModel"), + "LlavaForConditionalGeneration": + ("llava", "LlavaForConditionalGeneration"), + "LlavaNextForConditionalGeneration": + ("llava_next", "LlavaNextForConditionalGeneration"), + "MiniCPMV": ("minicpmv", "MiniCPMV"), + "PaliGemmaForConditionalGeneration": ("paligemma", + "PaliGemmaForConditionalGeneration"), + "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), +} _CONDITIONAL_GENERATION_MODELS = { "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), @@ -91,7 +92,8 @@ _CONDITIONAL_GENERATION_MODELS = { _MODELS = { **_GENERATION_MODELS, **_EMBEDDING_MODELS, - **_CONDITIONAL_GENERATION_MODELS + **_MULTIMODAL_MODELS, + **_CONDITIONAL_GENERATION_MODELS, } # Architecture -> type. @@ -182,6 +184,15 @@ class ModelRegistry: def is_embedding_model(model_arch: str) -> bool: return model_arch in _EMBEDDING_MODELS + @staticmethod + def is_multimodal_model(model_arch: str) -> bool: + + # TODO: find a way to avoid initializing CUDA prematurely to + # use `supports_multimodal` to determine if a model is multimodal + # model_cls = ModelRegistry._try_load_model_cls(model_arch) + # from vllm.model_executor.models.interfaces import supports_multimodal + return model_arch in _MULTIMODAL_MODELS + __all__ = [ "ModelRegistry", diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index d487d20011b45..cd16cdcbd890c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,7 +2,7 @@ import functools from collections import UserDict from typing import Dict, Mapping, Optional, Sequence -from vllm.config import ModelConfig, MultiModalConfig +from vllm.config import ModelConfig from vllm.logger import init_logger from .audio import AudioPlugin @@ -181,7 +181,6 @@ class MultiModalRegistry: def init_mm_limits_per_prompt( self, model_config: ModelConfig, - multimodal_config: Optional[MultiModalConfig], ) -> None: """ Initialize the maximum number of multi-modal input instances for each @@ -192,6 +191,7 @@ class MultiModalRegistry: "`mm_limits` has already been set for model=%s, and will " "be overwritten by the new values.", model_config.model) + multimodal_config = model_config.multimodal_config if multimodal_config is None: limits_per_plugin = self._disabled_limits_per_plugin else: diff --git a/vllm/spec_decode/draft_model_runner.py b/vllm/spec_decode/draft_model_runner.py index 324044c96d994..1bb3b83744fec 100644 --- a/vllm/spec_decode/draft_model_runner.py +++ b/vllm/spec_decode/draft_model_runner.py @@ -23,8 +23,8 @@ except ImportError: FLASHINFER_WORKSPACE_BUFFER_SIZE = 0 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.logger import init_logger from vllm.multimodal import MultiModalInputs from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, @@ -66,7 +66,6 @@ class TP1DraftModelRunner(ModelRunner): lora_config: Optional[LoRAConfig], kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, - multimodal_config: Optional[MultiModalConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, return_hidden_states: bool = False, observability_config: Optional[ObservabilityConfig] = None, @@ -86,7 +85,6 @@ class TP1DraftModelRunner(ModelRunner): lora_config=lora_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, - multimodal_config=multimodal_config, prompt_adapter_config=prompt_adapter_config, return_hidden_states=return_hidden_states, observability_config=observability_config, diff --git a/vllm/spec_decode/target_model_runner.py b/vllm/spec_decode/target_model_runner.py index e5b6933a5ce1c..2bb7af7d7c600 100644 --- a/vllm/spec_decode/target_model_runner.py +++ b/vllm/spec_decode/target_model_runner.py @@ -1,8 +1,8 @@ from typing import List, Optional from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.sequence import SequenceGroupMetadata from vllm.worker.model_runner import (ModelInputForGPUWithSamplingMetadata, ModelRunner) @@ -31,7 +31,6 @@ class TargetModelRunner(ModelRunner): kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, prompt_adapter_config: Optional[PromptAdapterConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, return_hidden_states: bool = False, observability_config: Optional[ObservabilityConfig] = None): # An internal boolean member variable to indicate if token log @@ -47,7 +46,6 @@ class TargetModelRunner(ModelRunner): lora_config=lora_config, kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, - multimodal_config=multimodal_config, prompt_adapter_config=prompt_adapter_config, return_hidden_states=return_hidden_states, observability_config=observability_config, diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index e22e152a8a8ad..82e69b569d90d 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -6,8 +6,8 @@ from torch import nn from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) + ModelConfig, ParallelConfig, PromptAdapterConfig, + SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata from vllm.model_executor.model_loader import get_model @@ -79,7 +79,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]): cache_config: CacheConfig, load_config: LoadConfig, lora_config: Optional[LoRAConfig], - multimodal_config: Optional[MultiModalConfig], kv_cache_dtype: Optional[str] = "auto", prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, @@ -94,7 +93,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]): self.device_config = device_config self.cache_config = cache_config self.lora_config = lora_config - self.multimodal_config = multimodal_config self.prompt_adapter_config = prompt_adapter_config self.load_config = load_config self.is_driver_worker = is_driver_worker @@ -125,7 +123,6 @@ class CPUModelRunner(ModelRunnerBase[CPUModelInput]): self.model = get_model(model_config=self.model_config, load_config=self.load_config, device_config=self.device_config, - multimodal_config=self.multimodal_config, lora_config=self.lora_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py index 735d48c908d61..d9b1d18da156c 100644 --- a/vllm/worker/cpu_worker.py +++ b/vllm/worker/cpu_worker.py @@ -7,8 +7,8 @@ import torch.distributed import vllm.envs as envs from vllm.attention import get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ParallelConfig, - PromptAdapterConfig, SchedulerConfig) + ModelConfig, ParallelConfig, PromptAdapterConfig, + SchedulerConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -132,7 +132,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, kv_cache_dtype: Optional[str] = "auto", prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, @@ -148,7 +147,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.distributed_init_method = distributed_init_method self.lora_config = lora_config self.prompt_adapter_config = prompt_adapter_config - self.multimodal_config = multimodal_config self.is_driver_worker = is_driver_worker if self.is_driver_worker: assert self.rank == 0, "The driver worker must have rank 0." @@ -173,7 +171,6 @@ class CPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): cache_config, load_config=self.load_config, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, kv_cache_dtype=kv_cache_dtype, prompt_adapter_config=self.prompt_adapter_config, is_driver_worker=is_driver_worker) diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py index 197c4c730e5a7..0121f5da79f1d 100644 --- a/vllm/worker/embedding_model_runner.py +++ b/vllm/worker/embedding_model_runner.py @@ -4,8 +4,8 @@ from typing import Any, Dict, List, Optional, Tuple, Type import torch from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor.pooling_metadata import PoolingMetadata from vllm.multimodal import MultiModalInputs @@ -44,7 +44,6 @@ class EmbeddingModelRunner( kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, prompt_adapter_config: Optional[PromptAdapterConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, observability_config: Optional[ObservabilityConfig] = None, ): super().__init__(model_config, @@ -57,7 +56,6 @@ class EmbeddingModelRunner( kv_cache_dtype=kv_cache_dtype, is_driver_worker=is_driver_worker, prompt_adapter_config=prompt_adapter_config, - multimodal_config=multimodal_config, observability_config=observability_config) @torch.inference_mode() diff --git a/vllm/worker/enc_dec_model_runner.py b/vllm/worker/enc_dec_model_runner.py index 4aec8d1d408d7..1afda0e45b702 100644 --- a/vllm/worker/enc_dec_model_runner.py +++ b/vllm/worker/enc_dec_model_runner.py @@ -10,8 +10,8 @@ from vllm.attention.selector import (_Backend, get_env_variable_attn_backend, get_global_forced_attn_backend, global_force_attn_backend) from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.inputs import INPUT_REGISTRY, InputRegistry from vllm.logger import init_logger from vllm.model_executor import SamplingMetadata @@ -82,7 +82,6 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, prompt_adapter_config: Optional[PromptAdapterConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, @@ -90,7 +89,7 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): ''' EncoderDecoderModelRunner constructor. - `lora_config`, `multimodal_config`, and prompt_adapter_config are + `lora_config` and `prompt_adapter_config` are unused (since these features are not yet supported for encoder/decoder models) but these arguments are present here for compatibility with the base-class constructor. @@ -273,14 +272,8 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): # number of tokens equal to max_num_batched_tokens. seqs: List[SequenceGroupMetadata] = [] - model_config = self.model_config - mm_config = self.multimodal_config - - input_registry = self.input_registry - mm_registry = self.mm_registry - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) - - max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config) + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) if max_mm_tokens > 0: raise NotImplementedError( "Multi-modal encoder-decoder models are not supported yet") @@ -291,8 +284,10 @@ class EncoderDecoderModelRunner(GPUModelRunnerBase[EncoderDecoderModelInput]): (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, _ = input_registry \ - .dummy_data_for_profiling(model_config, seq_len, mm_registry) + seq_data, _ = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) # Having more tokens is over-conservative but otherwise fine assert len(seq_data.prompt_token_ids) >= seq_len, ( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index d01b4e781b608..9f27c734efd1e 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -27,8 +27,8 @@ except ImportError: import vllm.envs as envs from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig) + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig) from vllm.distributed import get_pp_group from vllm.distributed.parallel_state import graph_capture from vllm.inputs import INPUT_REGISTRY, InputRegistry @@ -804,7 +804,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): kv_cache_dtype: Optional[str] = "auto", is_driver_worker: bool = False, prompt_adapter_config: Optional[PromptAdapterConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, return_hidden_states: bool = False, observability_config: Optional[ObservabilityConfig] = None, input_registry: InputRegistry = INPUT_REGISTRY, @@ -819,7 +818,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.load_config = load_config self.is_driver_worker = is_driver_worker self.prompt_adapter_config = prompt_adapter_config - self.multimodal_config = multimodal_config self.return_hidden_states = return_hidden_states self.observability_config = observability_config @@ -866,6 +864,7 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): self.mm_registry = mm_registry self.multi_modal_input_mapper = mm_registry \ .create_input_mapper(model_config) + self.mm_registry.init_mm_limits_per_prompt(self.model_config) # Lazy initialization self.model: nn.Module # Set after load_model @@ -893,7 +892,6 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): device_config=self.device_config, load_config=self.load_config, lora_config=self.lora_config, - multimodal_config=self.multimodal_config, parallel_config=self.parallel_config, scheduler_config=self.scheduler_config, cache_config=self.cache_config) @@ -1056,14 +1054,9 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. - model_config = self.model_config - mm_config = self.multimodal_config - input_registry = self.input_registry - mm_registry = self.mm_registry - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) - - max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config) + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) if max_mm_tokens > 0: max_num_seqs_orig = max_num_seqs max_num_seqs = min(max_num_seqs, @@ -1082,8 +1075,10 @@ class GPUModelRunnerBase(ModelRunnerBase[TModelInputForGPU]): (group_id < max_num_batched_tokens % max_num_seqs)) batch_size += seq_len - seq_data, dummy_multi_modal_data = input_registry \ - .dummy_data_for_profiling(model_config, seq_len, mm_registry) + seq_data, dummy_multi_modal_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) seq = SequenceGroupMetadata( request_id=str(group_id), diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py index 685ae0fd7cc85..86fc1d08c0812 100644 --- a/vllm/worker/tpu_model_runner.py +++ b/vllm/worker/tpu_model_runner.py @@ -11,7 +11,7 @@ import torch_xla.runtime as xr from vllm.attention import AttentionMetadata, get_attn_backend from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - MultiModalConfig, ParallelConfig, SchedulerConfig) + ParallelConfig, SchedulerConfig) from vllm.logger import init_logger from vllm.model_executor.model_loader import get_model from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -89,7 +89,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): device_config: DeviceConfig, cache_config: CacheConfig, load_config: LoadConfig, - multimodal_config: Optional[MultiModalConfig] = None, is_driver_worker: bool = False, ): self.model_config = model_config @@ -98,7 +97,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): self.device_config = device_config self.cache_config = cache_config self.load_config = load_config - self.multimodal_config = multimodal_config self.is_driver_worker = is_driver_worker self.block_size = self.cache_config.block_size @@ -142,7 +140,6 @@ class TPUModelRunner(ModelRunnerBase[ModelInputForTPU]): parallel_config=self.parallel_config, cache_config=self.cache_config, scheduler_config=self.scheduler_config, - multimodal_config=self.multimodal_config, lora_config=None, ) model = model.eval() diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 35f8ecdb81268..90f59b5d038ad 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -7,7 +7,7 @@ import torch_xla.runtime as xr import vllm.envs as envs from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig, - MultiModalConfig, ParallelConfig, SchedulerConfig) + ParallelConfig, SchedulerConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment) from vllm.logger import init_logger @@ -31,7 +31,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): device_config: DeviceConfig, cache_config: CacheConfig, load_config: LoadConfig, - multimodal_config: Optional[MultiModalConfig], local_rank: int, rank: int, distributed_init_method: str, @@ -44,7 +43,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): self.device_config = device_config self.cache_config = cache_config self.load_config = load_config - self.multimodal_config = multimodal_config self.local_rank = local_rank self.rank = rank self.distributed_init_method = distributed_init_method @@ -64,7 +62,6 @@ class TPUWorker(LoraNotSupportedWorkerBase, LocalOrDistributedWorkerBase): device_config, cache_config, load_config, - multimodal_config, is_driver_worker=is_driver_worker) def init_device(self) -> None: diff --git a/vllm/worker/utils.py b/vllm/worker/utils.py index 8df3c8bc5408b..79c48896469e8 100644 --- a/vllm/worker/utils.py +++ b/vllm/worker/utils.py @@ -39,7 +39,7 @@ def assert_enc_dec_mr_supported_scenario( raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_PP']) - if enc_dec_mr.multimodal_config is not None: + if enc_dec_mr.model_config.multimodal_config is not None: raise NotImplementedError( STR_NOT_IMPL_ENC_DEC_ERR_STRS['STR_NOT_IMPL_ENC_DEC_MM']) diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py index 8f4372e20d2e7..ffe6216d3ed62 100644 --- a/vllm/worker/worker.py +++ b/vllm/worker/worker.py @@ -7,8 +7,8 @@ import torch import torch.distributed from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig, - ModelConfig, MultiModalConfig, ObservabilityConfig, - ParallelConfig, PromptAdapterConfig, SchedulerConfig, + ModelConfig, ObservabilityConfig, ParallelConfig, + PromptAdapterConfig, SchedulerConfig, SpeculativeConfig) from vllm.distributed import (ensure_model_parallel_initialized, init_distributed_environment, @@ -46,7 +46,6 @@ class Worker(LocalOrDistributedWorkerBase): rank: int, distributed_init_method: str, lora_config: Optional[LoRAConfig] = None, - multimodal_config: Optional[MultiModalConfig] = None, speculative_config: Optional[SpeculativeConfig] = None, prompt_adapter_config: Optional[PromptAdapterConfig] = None, is_driver_worker: bool = False, @@ -73,7 +72,6 @@ class Worker(LocalOrDistributedWorkerBase): # note: lazy import to avoid importing torch before initializing from vllm.utils import init_cached_hf_modules init_cached_hf_modules() - self.multimodal_config = multimodal_config self.observability_config = observability_config # Return hidden states from target model if the draft model is an @@ -103,7 +101,6 @@ class Worker(LocalOrDistributedWorkerBase): kv_cache_dtype=self.cache_config.cache_dtype, is_driver_worker=is_driver_worker, prompt_adapter_config=prompt_adapter_config, - multimodal_config=multimodal_config, observability_config=observability_config, **speculative_args, ) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index d4b450199bb5d..8a2f93c15ed5e 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -125,6 +125,7 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): self.mm_registry = mm_registry self.multi_modal_input_mapper = mm_registry \ .create_input_mapper(model_config) + self.mm_registry.init_mm_limits_per_prompt(self.model_config) # Lazy initialization. self.model: nn.Module # Set after init_Model @@ -166,14 +167,8 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): # To exercise the worst scenario for GPU memory consumption, # the number of seqs (batch_size) is chosen to maximize the number # of images processed. - model_config = self.model_config - mm_config = self.multimodal_config - - input_registry = self.input_registry - mm_registry = self.mm_registry - mm_registry.init_mm_limits_per_prompt(model_config, mm_config) - - max_mm_tokens = mm_registry.get_max_multimodal_tokens(model_config) + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) if max_mm_tokens > 0: max_num_seqs_orig = max_num_seqs max_num_seqs = min(max_num_seqs, @@ -190,8 +185,10 @@ class XPUModelRunner(ModelRunnerBase[ModelInputForXPU]): seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs)) - seq_data, dummy_multi_modal_data = input_registry \ - .dummy_data_for_profiling(model_config, seq_len, mm_registry) + seq_data, dummy_multi_modal_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) seq = SequenceGroupMetadata( request_id=str(group_id),