From 27f4c2fd46b99778d7ea19dfe7751fbaab615177 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sun, 7 Dec 2025 15:15:42 +0800 Subject: [PATCH] [Renderer] Separate out `RendererConfig` from `ModelConfig` (#30145) Signed-off-by: DarkLight1337 --- docs/contributing/model/transcription.md | 12 +- .../distributed/test_sequence_parallelism.py | 2 + tests/compile/test_functionalization.py | 6 +- tests/compile/test_fusion.py | 6 +- tests/compile/test_fusion_attn.py | 2 + tests/compile/test_pass_manager.py | 8 +- tests/compile/test_qk_norm_rope_fusion.py | 5 +- tests/distributed/test_kvlayout.py | 3 + .../entrypoints/openai/test_chat_template.py | 22 +- .../entrypoints/openai/test_lora_resolvers.py | 21 +- tests/entrypoints/openai/test_serving_chat.py | 28 ++- .../entrypoints/openai/test_serving_engine.py | 8 +- .../entrypoints/openai/test_serving_models.py | 8 +- tests/entrypoints/test_chat_utils.py | 190 +++++++----------- tests/lora/test_lora_manager.py | 14 +- tests/lora/test_worker.py | 2 + .../test_model_load_with_params.py | 22 +- tests/models/language/pooling/test_gritlm.py | 5 +- .../multimodal/processing/test_common.py | 22 +- .../multimodal/processing/test_glm4_1v.py | 4 +- .../multimodal/processing/test_h2ovl.py | 2 +- .../multimodal/processing/test_idefics3.py | 2 +- .../multimodal/processing/test_internvl.py | 2 +- .../multimodal/processing/test_llama4.py | 2 +- .../multimodal/processing/test_llava_next.py | 6 +- .../processing/test_llava_onevision.py | 6 +- .../processing/test_minimax_vl_01.py | 4 +- .../multimodal/processing/test_mllama4.py | 2 +- .../multimodal/processing/test_nemotron_vl.py | 2 +- .../multimodal/processing/test_phi3v.py | 2 +- .../multimodal/processing/test_phi4mm.py | 2 +- .../multimodal/processing/test_qwen2_vl.py | 2 +- .../multimodal/processing/test_smolvlm.py | 2 +- .../processing/test_tensor_schema.py | 24 +-- .../processing/test_transformers.py | 5 +- tests/models/multimodal/test_mapping.py | 33 +-- tests/models/registry.py | 33 ++- tests/models/utils.py | 17 +- tests/multimodal/test_cache.py | 27 ++- tests/multimodal/test_processing.py | 24 ++- tests/multimodal/test_registry.py | 4 +- tests/test_config.py | 131 +++++++----- tests/test_inputs.py | 7 +- tests/v1/attention/utils.py | 2 + tests/v1/core/test_kv_cache_utils.py | 20 +- tests/v1/core/test_scheduler.py | 2 + tests/v1/core/utils.py | 2 + tests/v1/engine/test_engine_core.py | 2 + .../engine/test_process_multi_modal_uuids.py | 24 ++- tests/v1/kv_connector/unit/utils.py | 2 + tests/v1/spec_decode/test_eagle.py | 2 + tests/v1/spec_decode/test_mtp.py | 2 + tests/v1/spec_decode/test_ngram.py | 2 + .../test_backend_guidance.py | 12 +- .../test_reasoning_structured_output.py | 35 ++-- tests/v1/tpu/worker/test_tpu_model_runner.py | 2 + tests/v1/worker/test_gpu_model_runner.py | 3 + vllm/config/__init__.py | 3 + vllm/config/model.py | 141 +++---------- vllm/config/multimodal.py | 4 - vllm/config/renderer.py | 109 ++++++++++ vllm/config/speculative.py | 5 - vllm/config/vllm.py | 25 ++- vllm/engine/arg_utils.py | 99 +++++---- vllm/engine/protocol.py | 3 +- vllm/entrypoints/chat_utils.py | 79 ++++---- vllm/entrypoints/llm.py | 14 +- vllm/entrypoints/openai/api_server.py | 2 +- vllm/entrypoints/openai/serving_completion.py | 2 +- vllm/entrypoints/openai/serving_engine.py | 11 +- vllm/entrypoints/openai/serving_models.py | 1 + vllm/entrypoints/openai/speech_to_text.py | 10 +- vllm/entrypoints/pooling/pooling/serving.py | 2 +- vllm/entrypoints/pooling/score/serving.py | 4 +- vllm/entrypoints/score_utils.py | 13 +- 
vllm/entrypoints/utils.py | 8 +- vllm/inputs/preprocess.py | 9 +- vllm/model_executor/models/adapters.py | 20 +- vllm/model_executor/models/deepseek_ocr.py | 4 +- vllm/model_executor/models/deepseek_vl2.py | 4 +- vllm/model_executor/models/gemma3n_mm.py | 8 +- vllm/model_executor/models/granite_speech.py | 14 +- vllm/model_executor/models/gritlm.py | 14 +- vllm/model_executor/models/interfaces.py | 10 +- vllm/model_executor/models/interns1.py | 2 +- .../model_executor/models/nano_nemotron_vl.py | 13 +- vllm/model_executor/models/nemotron_vl.py | 2 +- vllm/model_executor/models/pixtral.py | 2 +- vllm/model_executor/models/voxtral.py | 22 +- vllm/model_executor/models/whisper.py | 14 +- vllm/multimodal/cache.py | 22 +- vllm/multimodal/processing.py | 28 ++- vllm/multimodal/registry.py | 64 +++--- vllm/tokenizers/registry.py | 24 +-- vllm/transformers_utils/processor.py | 28 ++- vllm/v1/core/encoder_cache_manager.py | 8 +- vllm/v1/core/sched/scheduler.py | 2 +- vllm/v1/engine/async_llm.py | 7 +- vllm/v1/engine/input_processor.py | 7 +- vllm/v1/engine/llm_engine.py | 7 +- vllm/v1/spec_decode/eagle.py | 2 +- vllm/v1/structured_output/__init__.py | 18 +- vllm/v1/worker/gpu_model_runner.py | 7 +- vllm/v1/worker/tpu_model_runner.py | 7 +- vllm/v1/worker/utils.py | 19 +- 105 files changed, 969 insertions(+), 797 deletions(-) create mode 100644 vllm/config/renderer.py diff --git a/docs/contributing/model/transcription.md b/docs/contributing/model/transcription.md index fca941acd5076..c5605789022d4 100644 --- a/docs/contributing/model/transcription.md +++ b/docs/contributing/model/transcription.md @@ -22,7 +22,7 @@ Declare supported languages and capabilities: import torch from torch import nn - from vllm.config import ModelConfig, SpeechToTextConfig + from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs.data import PromptType from vllm.model_executor.models.interfaces import SupportsTranscription @@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model: @classmethod def get_speech_to_text_config( cls, - model_config: ModelConfig, + renderer_config: RendererConfig, task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: return SpeechToTextConfig( @@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries: cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics: cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: # Return None if unknown; otherwise return an estimate. 
return int(audio_duration_s * stt_config.sample_rate // 320) # example @@ -216,7 +216,7 @@ Relevant server logic: prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index d9fdc3acc3d6f..77d3a24d42923 100644 --- a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -17,6 +17,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, get_current_vllm_config, set_current_vllm_config, @@ -276,6 +277,7 @@ def sequence_parallelism_pass_on_test_model( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), device_config=device_config, compilation_config=compilation_config, ) diff --git a/tests/compile/test_functionalization.py b/tests/compile/test_functionalization.py index 7585915892700..52d6fd1e5d75e 100644 --- a/tests/compile/test_functionalization.py +++ b/tests/compile/test_functionalization.py @@ -15,6 +15,7 @@ from vllm.config import ( CompilationConfig, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -219,8 +220,11 @@ def test_fix_functionalization( torch.set_default_device("cuda") torch.set_default_dtype(dtype) + model_config = ModelConfig(dtype=dtype) + vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( custom_ops=["all"], pass_config=PassConfig( diff --git a/tests/compile/test_fusion.py b/tests/compile/test_fusion.py index d0ba8385f4a01..bb4ee6b8e3eca 100644 --- a/tests/compile/test_fusion.py +++ b/tests/compile/test_fusion.py @@ -15,6 +15,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, VllmConfig, ) from vllm.model_executor.layers.layernorm import RMSNorm @@ -154,8 +155,11 @@ def test_fusion_rmsnorm_quant( custom_ops.append("+rms_norm") if enable_quant_fp8_custom_op: custom_ops.append("+quant_fp8") + + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/compile/test_fusion_attn.py b/tests/compile/test_fusion_attn.py index db95dff5e0fc7..f87825db29817 100644 --- a/tests/compile/test_fusion_attn.py +++ b/tests/compile/test_fusion_attn.py @@ -24,6 +24,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -325,6 +326,7 @@ def test_attention_quant_pattern( ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=SchedulerConfig( max_num_seqs=1024, max_model_len=model_config.max_model_len, diff --git a/tests/compile/test_pass_manager.py b/tests/compile/test_pass_manager.py index 6d0ba6b655031..c95e9e3ff8ae8 100644 --- a/tests/compile/test_pass_manager.py +++ b/tests/compile/test_pass_manager.py @@ -7,7 +7,7 @@ import torch from vllm.compilation.inductor_pass import 
CallableInductorPass, InductorPass from vllm.compilation.pass_manager import PostGradPassManager -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig # dummy custom pass that doesn't inherit @@ -43,7 +43,11 @@ class ProperPass(InductorPass): ) def test_pass_manager_uuid(callable): # Some passes need dtype to be set - config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16)) + model_config = ModelConfig(dtype=torch.bfloat16) + config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) pass_manager = PostGradPassManager() pass_manager.configure(config) diff --git a/tests/compile/test_qk_norm_rope_fusion.py b/tests/compile/test_qk_norm_rope_fusion.py index e0968ac799256..4d109015be48a 100644 --- a/tests/compile/test_qk_norm_rope_fusion.py +++ b/tests/compile/test_qk_norm_rope_fusion.py @@ -19,6 +19,7 @@ from vllm.config import ( CompilationMode, ModelConfig, PassConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -133,8 +134,10 @@ def test_qk_norm_rope_fusion( if enable_rope_custom_op: custom_ops.append("+rotary_embedding") + model_config = ModelConfig(dtype=dtype) vllm_config = VllmConfig( - model_config=ModelConfig(dtype=dtype), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), compilation_config=CompilationConfig( mode=CompilationMode.VLLM_COMPILE, custom_ops=custom_ops, diff --git a/tests/distributed/test_kvlayout.py b/tests/distributed/test_kvlayout.py index b190b2820451b..0d51a51a50804 100644 --- a/tests/distributed/test_kvlayout.py +++ b/tests/distributed/test_kvlayout.py @@ -5,6 +5,7 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, VllmConfig, set_current_vllm_config, ) @@ -47,6 +48,7 @@ def test_get_kv_connector_cache_layout_with_nixl_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): @@ -70,6 +72,7 @@ def test_get_kv_connector_cache_layout_with_multi_connector(): vllm_config = VllmConfig( device_config=DeviceConfig("cpu"), model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), kv_transfer_config=kv_transfer_config, ) with set_current_vllm_config(vllm_config): diff --git a/tests/entrypoints/openai/test_chat_template.py b/tests/entrypoints/openai/test_chat_template.py index 77087ac21ea8b..b050cfdb561cf 100644 --- a/tests/entrypoints/openai/test_chat_template.py +++ b/tests/entrypoints/openai/test_chat_template.py @@ -3,7 +3,6 @@ import pytest -from vllm.config import ModelConfig from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template from vllm.entrypoints.openai.protocol import ChatCompletionRequest from vllm.tokenizers import get_tokenizer @@ -107,24 +106,11 @@ def test_get_gen_prompt( model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - trust_remote_code=model_info.trust_remote_code, - revision=model_info.revision, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - 
enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Initialize the tokenizer tokenizer = get_tokenizer( - tokenizer_name=model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) template_content = load_chat_template(chat_template=template) @@ -143,7 +129,7 @@ def test_get_gen_prompt( tokenizer=tokenizer, conversation=mock_request.messages, chat_template=mock_request.chat_template or template_content, - model_config=model_config, + renderer_config=renderer_config, tools=None, add_generation_prompt=mock_request.add_generation_prompt, continue_final_message=mock_request.continue_final_message, diff --git a/tests/entrypoints/openai/test_lora_resolvers.py b/tests/entrypoints/openai/test_lora_resolvers.py index ea6b3d812d8fe..7310c2610ce3b 100644 --- a/tests/entrypoints/openai/test_lora_resolvers.py +++ b/tests/entrypoints/openai/test_lora_resolvers.py @@ -33,26 +33,34 @@ class MockModelConfig: """Minimal mock ModelConfig for testing.""" model: str = MODEL_NAME - tokenizer: str = MODEL_NAME trust_remote_code: bool = False - tokenizer_mode: str = "auto" max_model_len: int = 100 - tokenizer_revision: str | None = None multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig) hf_config: MockHFConfig = field(default_factory=MockHFConfig) logits_processors: list[str] | None = None logits_processor_pattern: str | None = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - skip_tokenizer_init: bool = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} +@dataclass +class MockRendererConfig: + """Minimal mock RendererConfig for testing.""" + + model_config: MockModelConfig + + tokenizer: str = MODEL_NAME + tokenizer_mode: str = "auto" + tokenizer_revision: str | None = None + skip_tokenizer_init: bool = False + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + class MockLoRAResolver(LoRAResolver): async def resolve_lora( self, base_model_name: str, lora_name: str @@ -114,6 +122,7 @@ def mock_serving_setup(): mock_engine.add_lora.reset_mock() mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git a/tests/entrypoints/openai/test_serving_chat.py b/tests/entrypoints/openai/test_serving_chat.py index 9ea65f9fa6e7a..9df8f886edd96 100644 --- a/tests/entrypoints/openai/test_serving_chat.py +++ b/tests/entrypoints/openai/test_serving_chat.py @@ -346,27 +346,33 @@ class MockHFConfig: class MockModelConfig: task = "generate" runner_type = "generate" - tokenizer = MODEL_NAME trust_remote_code = False - tokenizer_mode = "auto" max_model_len = 100 - tokenizer_revision = None multimodal_config = MultiModalConfig() hf_config = MockHFConfig() logits_processors: list[str] | None = None logits_processor_pattern = None diff_sampling_param: dict | None = None - allowed_local_media_path: str = "" - allowed_media_domains: list[str] | None = None encoder_config = None generation_config: str = "auto" - media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) - skip_tokenizer_init = False def get_diff_sampling_param(self): return self.diff_sampling_param or {} 
+@dataclass +class MockRendererConfig: + model_config: MockModelConfig = field(default_factory=MockModelConfig) + + tokenizer = MODEL_NAME + tokenizer_mode = "auto" + tokenizer_revision = None + skip_tokenizer_init = False + media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict) + allowed_local_media_path: str = "" + allowed_media_domains: list[str] | None = None + + def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: models = OpenAIServingModels( engine_client=engine, @@ -399,6 +405,7 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat: @dataclass class MockEngine: model_config: MockModelConfig = field(default_factory=MockModelConfig) + renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig) input_processor: MagicMock = field(default_factory=MagicMock) io_processor: MagicMock = field(default_factory=MagicMock) @@ -429,6 +436,7 @@ async def test_serving_chat_returns_correct_model_name(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -459,6 +467,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -492,6 +501,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -537,6 +547,7 @@ async def test_serving_chat_should_set_correct_max_tokens(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -583,6 +594,7 @@ async def test_serving_chat_could_load_correct_generation_config(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -629,6 +641,7 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = mock_model_config + mock_engine.renderer_config = MockRendererConfig(mock_model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() @@ -662,6 +675,7 @@ async def test_serving_chat_data_parallel_rank_extraction(): mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME) mock_engine.errored = False mock_engine.model_config = MockModelConfig() + mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config) mock_engine.input_processor = MagicMock() mock_engine.io_processor = MagicMock() diff --git 
a/tests/entrypoints/openai/test_serving_engine.py b/tests/entrypoints/openai/test_serving_engine.py index 956a06dc5487c..6ab0942b58da8 100644 --- a/tests/entrypoints/openai/test_serving_engine.py +++ b/tests/entrypoints/openai/test_serving_engine.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.openai.serving_engine import OpenAIServing from vllm.entrypoints.openai.serving_models import OpenAIServingModels from vllm.tokenizers import MistralTokenizer @@ -19,10 +19,16 @@ def serving() -> OpenAIServing: # Create minimal mocks engine_client = Mock() + model_config = Mock(spec=ModelConfig) model_config.max_model_len = 32768 + + renderer_config = Mock(spec=RendererConfig) + renderer_config.model_config = model_config + models = Mock(spec=OpenAIServingModels) models.model_config = model_config + models.renderer_config = renderer_config models.input_processor = Mock() models.io_processor = Mock() diff --git a/tests/entrypoints/openai/test_serving_models.py b/tests/entrypoints/openai/test_serving_models.py index b585835a0667a..376df6cfecb9f 100644 --- a/tests/entrypoints/openai/test_serving_models.py +++ b/tests/entrypoints/openai/test_serving_models.py @@ -6,7 +6,7 @@ from unittest.mock import MagicMock import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.engine.protocol import EngineClient from vllm.entrypoints.openai.protocol import ( ErrorResponse, @@ -27,9 +27,15 @@ LORA_UNLOADING_SUCCESS_MESSAGE = ( async def _async_serving_models_init() -> OpenAIServingModels: mock_engine_client = MagicMock(spec=EngineClient) # Set the max_model_len attribute to avoid missing attribute + mock_model_config = MagicMock(spec=ModelConfig) mock_model_config.max_model_len = 2048 + + mock_renderer_config = MagicMock(spec=RendererConfig) + mock_renderer_config.model_config = mock_model_config + mock_engine_client.model_config = mock_model_config + mock_engine_client.renderer_config = mock_renderer_config mock_engine_client.input_processor = MagicMock() mock_engine_client.io_processor = MagicMock() diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 527322c71ae4b..7b296eae7c5a2 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( _try_extract_ast, apply_mistral_chat_template, @@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -369,7 +369,7 @@ 
def test_parse_chat_messages_multiple_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="string", ) assert conversation == [ @@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system( "content": [{"type": "text", "text": "Who are you?"}], }, ], - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) assert conversation == [ @@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid( ], } ], - phi3v_model_config_image_embeds, + RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async( ], } ], - audio_embeds_model_config, + RendererConfig(model_config=audio_embeds_model_config), content_format="string", ) @@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async( ], } ], - phi3v_model_config_image_embeds, + 
RendererConfig(model_config=phi3v_model_config_image_embeds), content_format="string", ) @@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) assert conversation == [ @@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format( {"role": "assistant", "content": "Some stuff."}, {"role": "user", "content": "What about this one?"}, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="openai", ) @@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages( ], }, ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input( ], } ], - phi3v_model_config, + RendererConfig(model_config=phi3v_model_config), content_format="string", ) @@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl ], }, ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ], }, ], - qwen25omni_model_config_mm_interleaved, + 
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ], }, ], - qwen25omni_model_config_mm_interleaved, + RendererConfig(model_config=qwen25omni_model_config_mm_interleaved), content_format="string", ) @@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders( ], } ], - phi3v_model_config_mm_interleaved, + RendererConfig(model_config=phi3v_model_config_mm_interleaved), content_format="string", ) @@ -1945,24 +1945,11 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Build the tokenizer tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) tools = ( @@ -1985,7 +1972,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools): tokenizer, chat_template=None, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2047,24 +2034,11 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa "enable_thinking": True, } - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) - # Build the tokenizer tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2072,7 +2046,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa tokenizer, chat_template=None, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) with pytest.raises( ValueError, match="Found unexpected chat template kwargs from request" @@ -2143,23 +2117,11 @@ def test_resolve_content_format_hf_defined(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - 
tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( - model, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2167,7 +2129,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2181,7 +2143,7 @@ def test_resolve_content_format_hf_defined(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2203,23 +2165,11 @@ def test_resolve_content_format_fallbacks(model, expected_format): model_info = HF_EXAMPLE_MODELS.find_hf_info(model) model_info.check_available_online(on_fail="skip") - model_config = ModelConfig( - model, - tokenizer=model_info.tokenizer or model, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + renderer_config = model_info.build_renderer_config(model) tokenizer = get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) # Test detecting the tokenizer's chat_template @@ -2227,7 +2177,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) assert isinstance(chat_template, str) @@ -2241,7 +2191,7 @@ def test_resolve_content_format_fallbacks(model, expected_format): None, "auto", tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2272,15 +2222,13 @@ def test_resolve_content_format_fallbacks(model, expected_format): ], ) def test_resolve_content_format_examples(template_path, expected_format): - model_config = ModelConfig( - PHI3V_MODEL_ID, # Dummy - tokenizer=PHI3V_MODEL_ID, # Dummy - trust_remote_code=True, - ) + model = PHI3V_MODEL_ID # Dummy + model_config = ModelConfig(model, trust_remote_code=True) + renderer_config = RendererConfig(model_config=model_config, tokenizer=model) dummy_tokenizer = get_tokenizer( - PHI3V_MODEL_ID, # Dummy - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + trust_remote_code=renderer_config.trust_remote_code, ) dummy_tokenizer.chat_template = None @@ -2297,7 +2245,7 @@ def test_resolve_content_format_examples(template_path, expected_format): None, "auto", dummy_tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) assert resolved_format == expected_format @@ -2332,7 +2280,7 @@ def 
test_parse_chat_messages_include_thinking_chunk(mistral_model_config): conversation_with_thinking, _, _ = parse_chat_messages( messages, - mistral_model_config, + RendererConfig(model_config=mistral_model_config), content_format="openai", ) @@ -2432,7 +2380,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) @@ -2466,7 +2414,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ], } ], - qwen2_audio_model_config, + RendererConfig(model_config=qwen2_audio_model_config), content_format="string", ) diff --git a/tests/lora/test_lora_manager.py b/tests/lora/test_lora_manager.py index 081f14d6fabfb..7158120fc0217 100644 --- a/tests/lora/test_lora_manager.py +++ b/tests/lora/test_lora_manager.py @@ -8,7 +8,7 @@ import torch from safetensors.torch import load_file from torch import nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.config.lora import LoRAConfig from vllm.lora.layers import ( ColumnParallelLinearWithLoRA, @@ -422,7 +422,11 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 @@ -525,7 +529,11 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path ) model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + lora_config=lora_config, + ) vllm_config.scheduler_config.max_num_seqs = 4 vllm_config.scheduler_config.max_num_batched_tokens = 2 diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py index 54059ec561907..42d8c6202e79e 100644 --- a/tests/lora/test_worker.py +++ b/tests/lora/test_worker.py @@ -11,6 +11,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -43,6 +44,7 @@ def test_worker_apply_lora(qwen3_lora_files): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), load_config=LoadConfig( download_dir=None, load_format="dummy", diff --git a/tests/model_executor/test_model_load_with_params.py b/tests/model_executor/test_model_load_with_params.py index 489ac1e6475b9..e368671078fdd 100644 --- a/tests/model_executor/test_model_load_with_params.py +++ b/tests/model_executor/test_model_load_with_params.py @@ -42,8 +42,10 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -54,8 +56,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch): 
assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "BAAI/bge-base-en-v1.5" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, BertEmbeddingModel) @@ -86,8 +88,10 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - model_config = vllm_model.llm.llm_engine.model_config - model_tokenizer = vllm_model.llm.llm_engine.tokenizer + llm_engine = vllm_model.llm.llm_engine + model_config = llm_engine.model_config + renderer_config = llm_engine.renderer_config + tokenizer = llm_engine.tokenizer # asserts on the bert model config file assert model_config.encoder_config["max_seq_length"] == 512 @@ -98,8 +102,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch): assert model_config.pooler_config.normalize # asserts on the tokenizer loaded - assert model_config.tokenizer == "intfloat/multilingual-e5-base" - assert model_tokenizer.model_max_length == 512 + assert renderer_config.tokenizer == "intfloat/multilingual-e5-base" + assert tokenizer.model_max_length == 512 def check_model(model): assert isinstance(model, RobertaEmbeddingModel) @@ -128,7 +132,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch): "Write a short story about a robot that dreams for the first time.\n" ) - assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name + assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name def check_model(model): assert isinstance(model, RobertaEmbeddingModel) diff --git a/tests/models/language/pooling/test_gritlm.py b/tests/models/language/pooling/test_gritlm.py index 0adc9b5cf25f6..11ee003585487 100644 --- a/tests/models/language/pooling/test_gritlm.py +++ b/tests/models/language/pooling/test_gritlm.py @@ -6,7 +6,7 @@ import pytest from scipy.spatial.distance import cosine from vllm import LLM, SamplingParams -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from ....utils import RemoteOpenAIServer @@ -31,7 +31,8 @@ def test_find_array(): dtype="bfloat16", seed=0, ) - pooling = GritLMMeanPool(model_config=model_config) + renderer_config = RendererConfig(model_config=model_config) + pooling = GritLMMeanPool(renderer_config=renderer_config) arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 2e032ac4ca526..9b2b29b758765 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -25,7 +25,6 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC from vllm.tokenizers import ( MistralTokenizer, TokenizerLike, - cached_tokenizer_from_config, ) from ....multimodal.utils import random_audio, random_image, random_video @@ -212,31 +211,20 @@ def _test_processing_correctness( else: model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch) model_id = model_id_or_arch + model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - 
hf_overrides=model_info.hf_overrides, + renderer_config = model_info.build_renderer_config( + model=model_id, # Ensure that the cache can fit all of the data mm_processor_cache_gb=2048, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) cache = MultiModalProcessorOnlyCache(model_config) processing_info = factories.info(ctx) diff --git a/tests/models/multimodal/processing/test_glm4_1v.py b/tests/models/multimodal/processing/test_glm4_1v.py index 51071c93531de..fdc6352e2ec83 100644 --- a/tests/models/multimodal/processing/test_glm4_1v.py +++ b/tests/models/multimodal/processing/test_glm4_1v.py @@ -40,7 +40,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {"fps": fps} @@ -79,7 +79,7 @@ def test_video_loader_consistency( mm_processor_kwargs=None, limit_mm_per_prompt={"video": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {"fps": fps} # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_h2ovl.py b/tests/models/multimodal/processing/test_h2ovl.py index 1701d9dd8f011..1263d663e6af6 100644 --- a/tests/models/multimodal/processing/test_h2ovl.py +++ b/tests/models/multimodal/processing/test_h2ovl.py @@ -162,7 +162,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_idefics3.py b/tests/models/multimodal/processing/test_idefics3.py index 351b9d018eec2..bf12e79a718b7 100644 --- a/tests/models/multimodal/processing/test_idefics3.py +++ b/tests/models/multimodal/processing/test_idefics3.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_internvl.py b/tests/models/multimodal/processing/test_internvl.py index b4994295d3a80..51f0d2e891b3f 100644 --- a/tests/models/multimodal/processing/test_internvl.py +++ b/tests/models/multimodal/processing/test_internvl.py @@ -116,7 +116,7 @@ def test_processor_override( 
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_llama4.py b/tests/models/multimodal/processing/test_llama4.py index b73246b68b36a..04bc8d3f53818 100644 --- a/tests/models/multimodal/processing/test_llama4.py +++ b/tests/models/multimodal/processing/test_llama4.py @@ -30,7 +30,7 @@ def test_processor_override( limit_mm_per_prompt={"image": num_imgs}, mm_processor_cache_gb=mm_processor_cache_gb, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) config = processor.info.get_hf_config() tokenizer = processor.info.get_tokenizer() hf_processor = processor.info.get_hf_processor() diff --git a/tests/models/multimodal/processing/test_llava_next.py b/tests/models/multimodal/processing/test_llava_next.py index ffe7ca17b5d61..cd01002a32af2 100644 --- a/tests/models/multimodal/processing/test_llava_next.py +++ b/tests/models/multimodal/processing/test_llava_next.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_llava_onevision.py b/tests/models/multimodal/processing/test_llava_onevision.py index f5c552fe6476a..be505d95a500f 100644 --- a/tests/models/multimodal/processing/test_llava_onevision.py +++ b/tests/models/multimodal/processing/test_llava_onevision.py @@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id): mm_processor_kwargs=None, limit_mm_per_prompt={"image": 1}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) info = processor.info seen_aspect_ratios = set[float]() @@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), @@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = 
MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) seen_aspect_ratios = set[float]() image_sizes = list[ImageSize]() diff --git a/tests/models/multimodal/processing/test_minimax_vl_01.py b/tests/models/multimodal/processing/test_minimax_vl_01.py index 11e0001235110..17ac54fdd0a49 100644 --- a/tests/models/multimodal/processing/test_minimax_vl_01.py +++ b/tests/models/multimodal/processing/test_minimax_vl_01.py @@ -24,7 +24,7 @@ def test_processor_override( mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) prompt = "" * num_imgs image = Image.new("RGB", size=(364, 364)) mm_data = {"image": [image] * num_imgs} @@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs): mm_processor_kwargs=None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) image_ratios = [ (171, 152), diff --git a/tests/models/multimodal/processing/test_mllama4.py b/tests/models/multimodal/processing/test_mllama4.py index e5ff2d1391b62..9a65e2ddc85c6 100644 --- a/tests/models/multimodal/processing/test_mllama4.py +++ b/tests/models/multimodal/processing/test_mllama4.py @@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int): limit_mm_per_prompt=mm_counts, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) profiler = MultiModalProfiler(processor) decoder_dummy_data = profiler.get_decoder_dummy_data( diff --git a/tests/models/multimodal/processing/test_nemotron_vl.py b/tests/models/multimodal/processing/test_nemotron_vl.py index 5311ab1b78c69..f3609743b7c85 100644 --- a/tests/models/multimodal/processing/test_nemotron_vl.py +++ b/tests/models/multimodal/processing/test_nemotron_vl.py @@ -118,7 +118,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": len(size_factors)}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs min_num = min_dynamic_patch if dynamic_image_size else 1 diff --git a/tests/models/multimodal/processing/test_phi3v.py b/tests/models/multimodal/processing/test_phi3v.py index 8faff2611e6fe..f51bd97861783 100644 --- a/tests/models/multimodal/processing/test_phi3v.py +++ b/tests/models/multimodal/processing/test_phi3v.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_phi4mm.py b/tests/models/multimodal/processing/test_phi4mm.py index 5391555c26675..271357b0d1507 100644 --- a/tests/models/multimodal/processing/test_phi4mm.py +++ b/tests/models/multimodal/processing/test_phi4mm.py @@ -39,7 +39,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, 
limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py index 9f4cdb6789b2c..d65a270a7da3b 100644 --- a/tests/models/multimodal/processing/test_qwen2_vl.py +++ b/tests/models/multimodal/processing/test_qwen2_vl.py @@ -34,7 +34,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) tokenizer = processor.info.get_tokenizer() hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs diff --git a/tests/models/multimodal/processing/test_smolvlm.py b/tests/models/multimodal/processing/test_smolvlm.py index 6f77d5516d147..e0e6264de4e3a 100644 --- a/tests/models/multimodal/processing/test_smolvlm.py +++ b/tests/models/multimodal/processing/test_smolvlm.py @@ -38,7 +38,7 @@ def test_processor_override( mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None, limit_mm_per_prompt={"image": num_imgs}, ) - processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config) hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs # Build the image str / prompt based on the number of images we pass diff --git a/tests/models/multimodal/processing/test_tensor_schema.py b/tests/models/multimodal/processing/test_tensor_schema.py index 5d489549c5b46..24959fa48ad6d 100644 --- a/tests/models/multimodal/processing/test_tensor_schema.py +++ b/tests/models/multimodal/processing/test_tensor_schema.py @@ -11,7 +11,7 @@ import pytest import torch.nn as nn from PIL import Image -from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config +from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config from vllm.config.multimodal import ( AudioDummyOptions, BaseDummyOptions, @@ -31,7 +31,6 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext from vllm.multimodal.utils import group_mm_kwargs_by_modality from vllm.platforms import current_platform -from vllm.tokenizers import cached_tokenizer_from_config from vllm.utils.collection_utils import is_list_of from vllm.utils.torch_utils import set_default_torch_dtype @@ -150,7 +149,10 @@ def initialize_dummy_model( backend="nccl", ) initialize_model_parallel(tensor_model_parallel_size=1) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) with set_current_vllm_config(vllm_config=vllm_config): with set_default_torch_dtype(model_config.dtype): model = model_cls(vllm_config=vllm_config) @@ -182,19 +184,12 @@ def test_model_tensor_schema(model_id: str): else: dtype = model_info.dtype - model_config = ModelConfig( + renderer_config = model_info.build_renderer_config( model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - 
trust_remote_code=model_info.trust_remote_code, hf_overrides=hf_overrides_fn, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, dtype=dtype, ) + model_config = renderer_config.model_config model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) assert supports_multimodal(model_cls) @@ -212,10 +207,7 @@ def test_model_tensor_schema(model_id: str): if not any(inputs_parse_methods): pytest.skip(f"{model_arch} does not support tensor schema validation.") - ctx = InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + ctx = InputProcessingContext.from_config(renderer_config) processing_info = factories.info(ctx) supported_mm_limits = processing_info.get_supported_mm_limits() limit_mm_per_prompt = { diff --git a/tests/models/multimodal/processing/test_transformers.py b/tests/models/multimodal/processing/test_transformers.py index e2a2186f470b4..c9a90eb882da4 100644 --- a/tests/models/multimodal/processing/test_transformers.py +++ b/tests/models/multimodal/processing/test_transformers.py @@ -3,7 +3,7 @@ import pytest from vllm.assets.image import ImageAsset -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY @@ -13,8 +13,9 @@ def test_multimodal_processor(model_id): model=model_id, model_impl="transformers", ) + renderer_config = RendererConfig(model_config=model_config) - mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config) + mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) image_pil = ImageAsset("cherry_blossom").pil_image mm_data = {"image": image_pil} diff --git a/tests/models/multimodal/test_mapping.py b/tests/models/multimodal/test_mapping.py index 0d2eaca95504e..73de6b5f7d2f9 100644 --- a/tests/models/multimodal/test_mapping.py +++ b/tests/models/multimodal/test_mapping.py @@ -7,7 +7,6 @@ import torch import transformers from transformers import AutoConfig, PreTrainedModel -from vllm.config import ModelConfig from vllm.model_executor.models.utils import WeightsMapper from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.transformers_utils.config import try_get_safetensors_metadata @@ -50,37 +49,11 @@ def test_hf_model_weights_mapper(model_arch: str): model_info.check_available_online(on_fail="skip") model_info.check_transformers_version(on_fail="skip") - is_mistral_model = model_arch in [ - "Mistral3ForConditionalGeneration", - "PixtralForConditionalGeneration", - "VoxtralForConditionalGeneration", - ] - - if not is_mistral_model or model_info.tokenizer_mode == "mistral": - tokenizer_mode = model_info.tokenizer_mode - else: - tokenizer_mode = "hf" - - model_id = model_info.default - - model_config = ModelConfig( - model_id, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=tokenizer_mode, - config_format="hf", - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, - dtype=model_info.dtype, - ) + model_config = model_info.build_model_config(config_format="hf") model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config) - original_weights = create_repo_dummy_weights(model_id) - 
hf_dummy_model = create_dummy_model(model_id, model_arch) + original_weights = create_repo_dummy_weights(model_config.model) + hf_dummy_model = create_dummy_model(model_config.model, model_arch) hf_converted_weights = hf_dummy_model.named_parameters() hf_converted_buffers = hf_dummy_model.named_buffers() mapper: WeightsMapper = model_cls.hf_to_vllm_mapper diff --git a/tests/models/registry.py b/tests/models/registry.py index 020cb749341a6..e2cb5bcbc6c91 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -9,7 +9,8 @@ import pytest from packaging.version import Version from transformers import __version__ as TRANSFORMERS_VERSION -from vllm.config.model import ModelDType, TokenizerMode +from vllm.config.model import ModelConfig, ModelDType +from vllm.config.renderer import RendererConfig, TokenizerMode @dataclass(frozen=True) @@ -170,6 +171,36 @@ class _HfExamplesInfo: else: pytest.skip(msg) + def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig: + if model is None: + model = self.default + + return ModelConfig( + **{ + "model": model, + "revision": self.revision, + "trust_remote_code": self.trust_remote_code, + "hf_overrides": self.hf_overrides, + "enable_prompt_embeds": self.require_embed_inputs, + "enable_mm_embeds": self.require_embed_inputs, + "enforce_eager": self.enforce_eager, + "dtype": self.dtype, + **kwargs, + } + ) + + def build_renderer_config( + self, model: str | None = None, **kwargs + ) -> RendererConfig: + model_config = self.build_model_config(model, **kwargs) + + return RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer or model_config.model, + tokenizer_mode=self.tokenizer_mode, + skip_tokenizer_init=self.require_embed_inputs, + ) + _TEXT_GENERATION_EXAMPLE_MODELS = { # [Decoder-only] diff --git a/tests/models/utils.py b/tests/models/utils.py index d84b4b820533e..87292cc4538d9 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -13,7 +13,6 @@ from transformers import PretrainedConfig from vllm.config.model import ModelConfig, ModelDType, RunnerOption from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs from vllm.multimodal.processing import InputProcessingContext -from vllm.tokenizers import cached_tokenizer_from_config from .. 
import ci_envs from .registry import HF_EXAMPLE_MODELS @@ -296,30 +295,18 @@ def build_model_context( model_config_kwargs = model_config_kwargs or {} limit_mm_per_prompt = limit_mm_per_prompt or {} - model_config = ModelConfig( + renderer_config = model_info.build_renderer_config( model_id, runner=runner, - tokenizer=model_info.tokenizer or model_id, - tokenizer_mode=model_info.tokenizer_mode, - revision=model_info.revision, - trust_remote_code=model_info.trust_remote_code, dtype=dtype, seed=0, mm_processor_kwargs=mm_processor_kwargs, limit_mm_per_prompt=limit_mm_per_prompt, mm_processor_cache_gb=mm_processor_cache_gb, - hf_overrides=model_info.hf_overrides, - skip_tokenizer_init=model_info.require_embed_inputs, - enable_prompt_embeds=model_info.require_embed_inputs, - enable_mm_embeds=model_info.require_embed_inputs, - enforce_eager=model_info.enforce_eager, **model_config_kwargs, ) - return InputProcessingContext( - model_config, - tokenizer=cached_tokenizer_from_config(model_config), - ) + return InputProcessingContext.from_config(renderer_config) def check_embeddings_close( diff --git a/tests/multimodal/test_cache.py b/tests/multimodal/test_cache.py index e641b1111abaf..ce16d90130aa4 100644 --- a/tests/multimodal/test_cache.py +++ b/tests/multimodal/test_cache.py @@ -6,7 +6,7 @@ import numpy as np import pytest import torch -from vllm.config import ModelConfig, ParallelConfig, VllmConfig +from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.cache import ( BaseMultiModalProcessorCache, @@ -110,11 +110,14 @@ def _create_vllm_config( mm_processor_cache_gb: float, enable_ipc: bool, ): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_gb=mm_processor_cache_gb, + ) + return VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_gb=mm_processor_cache_gb, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2), ) @@ -506,13 +509,15 @@ def _run_test_cache_eviction_shm( def test_cache_eviction_shm_cache(): + model_config = ModelConfig( + model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", + mm_processor_cache_type="shm", + mm_shm_cache_max_object_size_mb=6, + mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, + ) vllm_config = VllmConfig( - model_config=ModelConfig( - model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", - mm_processor_cache_type="shm", - mm_shm_cache_max_object_size_mb=6, - mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes, - ), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), ) sender_cache = ShmObjectStoreSenderCache(vllm_config) receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock()) diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 262ea42e4d0fa..adff572524a9b 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -7,7 +7,7 @@ from contextlib import nullcontext import numpy as np import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.processing import ( InputProcessingContext, @@ -920,8 +920,9 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid): model=model_id, 
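For reference, the tests/models/utils.py and test_processing.py hunks above converge on InputProcessingContext.from_config; a minimal sketch of that pattern (the model id is only a placeholder):

from vllm.config import ModelConfig, RendererConfig
from vllm.multimodal.processing import InputProcessingContext

model_config = ModelConfig("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
renderer_config = RendererConfig(
    model_config=model_config,
    tokenizer="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
)
# Replaces InputProcessingContext(model_config, tokenizer=cached_tokenizer_from_config(model_config))
ctx = InputProcessingContext.from_config(renderer_config)

Tests that construct a full VllmConfig follow the same shape, additionally passing renderer_config=RendererConfig(model_config=model_config), as in the test_cache.py hunk above.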
limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) processor._supported_mm_limits = {"image": num_supported} profiler = MultiModalProfiler(processor) @@ -955,8 +956,9 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid): model=model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) + renderer_config = RendererConfig(model_config=model_config) - processor = MULTIMODAL_REGISTRY.create_processor(model_config) + processor = MULTIMODAL_REGISTRY.create_processor(renderer_config) rng = np.random.RandomState(0) image = random_image(rng, min_wh=128, max_wh=256) @@ -1012,11 +1014,13 @@ def test_hf_processor_init_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor( DummyProcessor, # type: ignore[arg-type] **inference_kwargs, @@ -1045,11 +1049,13 @@ def test_hf_processor_call_kwargs( inference_kwargs, expected_kwargs, ): - ctx = InputProcessingContext( - model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs), - tokenizer=None, + model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs) + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=model_id, ) + ctx = InputProcessingContext.from_config(renderer_config) processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type] result = ctx.call_hf_processor(processor, {}, inference_kwargs) diff --git a/tests/multimodal/test_registry.py b/tests/multimodal/test_registry.py index 3b01bda7f54c8..8127fac09968b 100644 --- a/tests/multimodal/test_registry.py +++ b/tests/multimodal/test_registry.py @@ -31,4 +31,6 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected): model_id, limit_mm_per_prompt=limit_mm_per_prompt, ) - assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected + assert ( + MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected + ) diff --git a/tests/test_config.py b/tests/test_config.py index 203447cd531fb..7464fcd1e9fe5 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -13,6 +13,7 @@ from vllm.config import ( CompilationConfig, ModelConfig, PoolerConfig, + RendererConfig, SchedulerConfig, VllmConfig, update_config, @@ -476,27 +477,41 @@ def test_load_config_pt_load_map_location(pt_load_map_location): ("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True), ], ) -def test_get_and_verify_max_len( +def test_recalculate_max_model_len( model_id, max_model_len, expected_max_len, should_raise ): - """Test get_and_verify_max_len with different configurations.""" + """Test recalculate_max_model_len with different configurations.""" model_config = ModelConfig(model_id) if should_raise: with pytest.raises(ValueError): - model_config.get_and_verify_max_len(max_model_len) + model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) else: - actual_max_len = model_config.get_and_verify_max_len(max_model_len) - assert actual_max_len == expected_max_len + 
model_config.recalculate_max_model_len( + max_model_len, + tokenizer=model_id, + tokenizer_revision=None, + ) + assert model_config.max_model_len == expected_max_len -class MockConfig: - """Simple mock object for testing maybe_pull_model_tokenizer_for_runai""" +class MockModelConfig: + """Simple mock object for testing maybe_pull_model_for_runai""" - def __init__(self, model: str, tokenizer: str): + def __init__(self, model: str): self.model = model - self.tokenizer = tokenizer - self.model_weights = None + + +class MockRendererConfig: + """Simple mock object for testing maybe_pull_tokenizer_for_runai""" + + def __init__(self, model_config: MockModelConfig): + self.model_config = model_config + self.tokenizer = model_config.model @pytest.mark.parametrize( @@ -514,59 +529,65 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url): mock_pull_files.return_value = None # Create first mock and run the method - config1 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url) + model_config1 = MockModelConfig(model=s3_url) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url) # Check that model and tokenizer point to existing directories - assert os.path.exists(config1.model), ( - f"Model directory does not exist: {config1.model}" + assert os.path.exists(model_config1.model), ( + f"Model directory does not exist: {model_config1.model}" ) - assert os.path.isdir(config1.model), ( - f"Model path is not a directory: {config1.model}" + assert os.path.isdir(model_config1.model), ( + f"Model path is not a directory: {model_config1.model}" ) - assert os.path.exists(config1.tokenizer), ( - f"Tokenizer directory does not exist: {config1.tokenizer}" + assert os.path.exists(renderer_config1.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config1.tokenizer}" ) - assert os.path.isdir(config1.tokenizer), ( - f"Tokenizer path is not a directory: {config1.tokenizer}" + assert os.path.isdir(renderer_config1.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config1.tokenizer}" ) # Verify that the paths are different from the original S3 URL - assert config1.model != s3_url, "Model path should be converted to local directory" - assert config1.tokenizer != s3_url, ( + assert model_config1.model != s3_url, ( + "Model path should be converted to local directory" + ) + assert renderer_config1.tokenizer != s3_url, ( "Tokenizer path should be converted to local directory" ) # Store the original paths - created_model_dir = config1.model - create_tokenizer_dir = config1.tokenizer + created_model_dir = model_config1.model + create_tokenizer_dir = renderer_config1.tokenizer # Create a new mock and run the method with the same S3 URL - config2 = MockConfig(model=s3_url, tokenizer=s3_url) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url) + model_config2 = MockModelConfig(model=s3_url) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url) # Check that the new directories exist - assert os.path.exists(config2.model), ( - f"Model directory does not exist: {config2.model}" + assert os.path.exists(model_config2.model), ( + f"Model directory does not exist: {model_config2.model}" ) - assert 
os.path.isdir(config2.model), ( - f"Model path is not a directory: {config2.model}" + assert os.path.isdir(model_config2.model), ( + f"Model path is not a directory: {model_config2.model}" ) - assert os.path.exists(config2.tokenizer), ( - f"Tokenizer directory does not exist: {config2.tokenizer}" + assert os.path.exists(renderer_config2.tokenizer), ( + f"Tokenizer directory does not exist: {renderer_config2.tokenizer}" ) - assert os.path.isdir(config2.tokenizer), ( - f"Tokenizer path is not a directory: {config2.tokenizer}" + assert os.path.isdir(renderer_config2.tokenizer), ( + f"Tokenizer path is not a directory: {renderer_config2.tokenizer}" ) # Verify that the paths are deterministic (same as before) - assert config2.model == created_model_dir, ( + assert model_config2.model == created_model_dir, ( f"Model paths are not deterministic. " - f"Original: {created_model_dir}, New: {config2.model}" + f"Original: {created_model_dir}, New: {model_config2.model}" ) - assert config2.tokenizer == create_tokenizer_dir, ( + assert renderer_config2.tokenizer == create_tokenizer_dir, ( f"Tokenizer paths are not deterministic. " - f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}" + f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}" ) @@ -580,28 +601,36 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files): s3_url2 = "s3://example-bucket-2/model/" # Create mocks with different S3 URLs and run the method - config1 = MockConfig(model=s3_url1, tokenizer=s3_url1) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1) + model_config1 = MockModelConfig(model=s3_url1) + renderer_config1 = MockRendererConfig(model_config=model_config1) + ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1) - config2 = MockConfig(model=s3_url2, tokenizer=s3_url2) - ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2) + model_config2 = MockModelConfig(model=s3_url2) + renderer_config2 = MockRendererConfig(model_config=model_config2) + ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2) + RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2) # Verify that different URLs produce different directories - assert config1.model != config2.model, ( + assert model_config1.model != model_config2.model, ( f"Different S3 URLs should create different model directories. " - f"URL1 model: {config1.model}, URL2 model: {config2.model}" + f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}" ) - assert config1.tokenizer != config2.tokenizer, ( + assert renderer_config1.tokenizer != renderer_config2.tokenizer, ( f"Different S3 URLs should create different tokenizer directories. 
" - f"URL1 tokenizer: {config1.tokenizer}, " - f"URL2 tokenizer: {config2.tokenizer}" + f"URL1 tokenizer: {renderer_config1.tokenizer}, " + f"URL2 tokenizer: {renderer_config2.tokenizer}" ) # Verify that both sets of directories exist - assert os.path.exists(config1.model) and os.path.isdir(config1.model) - assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer) - assert os.path.exists(config2.model) and os.path.isdir(config2.model) - assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer) + assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model) + assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir( + renderer_config1.tokenizer + ) + assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model) + assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir( + renderer_config2.tokenizer + ) @pytest.mark.parametrize( diff --git a/tests/test_inputs.py b/tests/test_inputs.py index c4339827de8b6..48fd076ab3c67 100644 --- a/tests/test_inputs.py +++ b/tests/test_inputs.py @@ -3,7 +3,7 @@ import pytest -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.inputs import zip_enc_dec_prompts from vllm.inputs.parse import parse_raw_prompts from vllm.inputs.preprocess import InputPreprocessor @@ -108,8 +108,9 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs): ) def test_preprocessor_always_mm_code_path(model_id, prompt): model_config = ModelConfig(model=model_id) - tokenizer = init_tokenizer_from_config(model_config) - input_preprocessor = InputPreprocessor(model_config, tokenizer) + renderer_config = RendererConfig(model_config=model_config) + tokenizer = init_tokenizer_from_config(renderer_config) + input_preprocessor = InputPreprocessor(renderer_config, tokenizer) # HF processor adds sep token sep_token_id = tokenizer.vocab[tokenizer.sep_token] diff --git a/tests/v1/attention/utils.py b/tests/v1/attention/utils.py index 6cab129c116c5..49307e3e5437d 100644 --- a/tests/v1/attention/utils.py +++ b/tests/v1/attention/utils.py @@ -16,6 +16,7 @@ from vllm.config import ( LoadConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -216,6 +217,7 @@ def create_vllm_config( return VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/tests/v1/core/test_kv_cache_utils.py b/tests/v1/core/test_kv_cache_utils.py index fd5cf6d3e74aa..4a414bca591d1 100644 --- a/tests/v1/core/test_kv_cache_utils.py +++ b/tests/v1/core/test_kv_cache_utils.py @@ -8,7 +8,7 @@ import pytest import torch import vllm.v1.core.kv_cache_utils as kv_cache_utils -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import ( MultiModalFeatureSpec, @@ -667,7 +667,10 @@ def test_metrics_empty_stats(): def test_get_kv_cache_configs_multiple_workers(): model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) ref_kv_cache_spec = new_kv_cache_spec() same_kv_cache_specs = [ @@ -1136,6 +1139,7 @@ def test_estimate_max_model_len(model_id, max_model_len, 
want_estimated_max_len) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1175,6 +1179,7 @@ def test_get_max_concurrency_for_kv_cache_config(): vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), scheduler_config=scheduler_config, ) @@ -1293,7 +1298,10 @@ def test_allocate_with_lookahead(): def test_get_kv_cache_config_one_worker(): # pass max_model_len to pass check_enough_kv_cache_memory model_config = ModelConfig(max_model_len=16) - vllm_config = VllmConfig(model_config=model_config) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2 # all layers are full attention -> single group @@ -1584,7 +1592,11 @@ def test_get_kv_cache_config_one_worker(): def test_get_kv_cache_configs_attention_free(): kv_cache_specs: dict[str, KVCacheSpec] = {} - vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16)) + model_config = ModelConfig(max_model_len=16) + vllm_config = VllmConfig( + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), + ) kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0]) assert kv_cache_configs == [ KVCacheConfig( diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index c6c4a5085bff7..1505415a63619 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -11,6 +11,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -1563,6 +1564,7 @@ def create_scheduler_with_priority( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/core/utils.py b/tests/v1/core/utils.py index f5ba613d38db1..086885c298145 100644 --- a/tests/v1/core/utils.py +++ b/tests/v1/core/utils.py @@ -9,6 +9,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -132,6 +133,7 @@ def create_scheduler( vllm_config = VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, speculative_config=speculative_config, diff --git a/tests/v1/engine/test_engine_core.py b/tests/v1/engine/test_engine_core.py index 48be8c15aba9e..c606100a12bf1 100644 --- a/tests/v1/engine/test_engine_core.py +++ b/tests/v1/engine/test_engine_core.py @@ -15,6 +15,7 @@ from vllm.config import ( ECTransferConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -522,6 +523,7 @@ def test_encoder_instance_zero_kv_cache( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, kv_transfer_config=kv_transfer_config, diff --git a/tests/v1/engine/test_process_multi_modal_uuids.py b/tests/v1/engine/test_process_multi_modal_uuids.py index 1b11b8af49d17..85fab3a855fd7 100644 --- a/tests/v1/engine/test_process_multi_modal_uuids.py +++ 
b/tests/v1/engine/test_process_multi_modal_uuids.py @@ -5,7 +5,14 @@ import pytest from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig +from vllm.config import ( + CacheConfig, + DeviceConfig, + ModelConfig, + MultiModalConfig, + RendererConfig, + VllmConfig, +) from vllm.sampling_params import SamplingParams from vllm.v1.engine import input_processor as input_processor_mod from vllm.v1.engine.input_processor import InputProcessor @@ -44,22 +51,21 @@ def _mock_input_processor( monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True) model_config = ModelConfig( - skip_tokenizer_init=True, max_model_len=128, mm_processor_cache_gb=mm_cache_gb, generation_config="vllm", + ) + model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb) + + renderer_config = RendererConfig( + model_config=model_config, tokenizer="dummy", + skip_tokenizer_init=True, ) - # Minimal multimodal_config to satisfy references in - # Processor.process_inputs. - class _MockMMConfig: - def __init__(self, gb: float): - self.mm_processor_cache_gb = gb - - model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined] vllm_config = VllmConfig( model_config=model_config, + renderer_config=renderer_config, cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching), device_config=DeviceConfig(device="cpu"), ) diff --git a/tests/v1/kv_connector/unit/utils.py b/tests/v1/kv_connector/unit/utils.py index 58f1a7282352b..768b338b5fe53 100644 --- a/tests/v1/kv_connector/unit/utils.py +++ b/tests/v1/kv_connector/unit/utils.py @@ -15,6 +15,7 @@ from vllm.config import ( DeviceConfig, KVTransferConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, ) @@ -127,6 +128,7 @@ def create_vllm_config( return VllmConfig( scheduler_config=scheduler_config, model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, kv_transfer_config=kv_transfer_config, device_config=DeviceConfig("cpu"), diff --git a/tests/v1/spec_decode/test_eagle.py b/tests/v1/spec_decode/test_eagle.py index 616e57de339e2..888ea0169b759 100644 --- a/tests/v1/spec_decode/test_eagle.py +++ b/tests/v1/spec_decode/test_eagle.py @@ -19,6 +19,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -61,6 +62,7 @@ def _create_proposer( vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_mtp.py b/tests/v1/spec_decode/test_mtp.py index 3b8813ceb818a..4483c82438530 100644 --- a/tests/v1/spec_decode/test_mtp.py +++ b/tests/v1/spec_decode/test_mtp.py @@ -18,6 +18,7 @@ from vllm.config import ( DeviceConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, SpeculativeConfig, VllmConfig, @@ -46,6 +47,7 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer: vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=CacheConfig(), speculative_config=speculative_config, device_config=DeviceConfig(device=current_platform.device_type), diff --git a/tests/v1/spec_decode/test_ngram.py b/tests/v1/spec_decode/test_ngram.py index 
6bc412abe8695..2e365e08a4e72 100644 --- a/tests/v1/spec_decode/test_ngram.py +++ b/tests/v1/spec_decode/test_ngram.py @@ -4,6 +4,7 @@ import numpy as np from vllm.config import ( ModelConfig, + RendererConfig, SpeculativeConfig, VllmConfig, ) @@ -69,6 +70,7 @@ def test_ngram_proposer(): return NgramProposer( vllm_config=VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), speculative_config=SpeculativeConfig( prompt_lookup_min=min_n, prompt_lookup_max=max_n, diff --git a/tests/v1/structured_output/test_backend_guidance.py b/tests/v1/structured_output/test_backend_guidance.py index 4c01560fc88c3..baef2459f8df0 100644 --- a/tests/v1/structured_output/test_backend_guidance.py +++ b/tests/v1/structured_output/test_backend_guidance.py @@ -6,7 +6,7 @@ from concurrent.futures import Future import pytest from transformers import AutoTokenizer -from vllm.config import StructuredOutputsConfig, VllmConfig +from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig from vllm.config.model import ModelConfig from vllm.config.parallel import ParallelConfig from vllm.config.speculative import SpeculativeConfig @@ -72,8 +72,11 @@ def test_backend_guidance_rollback_terminated(): def test_grammar_bitmask_with_specdec(): tokenizer = AutoTokenizer.from_pretrained(TOKENIZER) prompt = tokenizer.encode('{"a": "b"}') + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3), ) @@ -137,8 +140,11 @@ def test_grammar_init_async_and_sync(async_grammar): # Use "external_launcher" for sync mode, None for async mode executor_backend = None if async_grammar else "external_launcher" + + model_config = ModelConfig(tokenizer=TOKENIZER) vllm_config = VllmConfig( - model_config=ModelConfig(tokenizer=TOKENIZER), + model_config=model_config, + renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER), structured_outputs_config=StructuredOutputsConfig(backend="guidance"), parallel_config=ParallelConfig(distributed_executor_backend=executor_backend), ) diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py index 70047a993c3f9..5901d38d1b78b 100644 --- a/tests/v1/structured_output/test_reasoning_structured_output.py +++ b/tests/v1/structured_output/test_reasoning_structured_output.py @@ -7,7 +7,7 @@ from unittest.mock import Mock import pytest -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig from vllm.reasoning import ReasoningParser from vllm.v1.request import Request from vllm.v1.structured_output import StructuredOutputManager @@ -17,19 +17,26 @@ class TestReasoningStructuredOutput: """Test reasoning-aware structured output functionality.""" @pytest.fixture - def mock_model_config(self): - """Create a mock ModelConfig.""" - config = Mock(spec=ModelConfig) - config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls - config.get_vocab_size = Mock(return_value=50000) + def mock_renderer_config(self): + """Create a mock RendererConfig.""" + renderer_config = Mock(spec=RendererConfig) + renderer_config.skip_tokenizer_init = 
( + True # Skip tokenizer init to avoid network calls + ) + + model_config = Mock(spec=ModelConfig) + model_config.get_vocab_size = Mock(return_value=50000) + model_config.trust_remote_code = False # Add missing runner_type attribute that tokenizer initialization expects - config.runner_type = "generate" + model_config.runner_type = "generate" + renderer_config.model_config = model_config + # Add other attributes that tokenizer initialization might need - config.tokenizer = "test-tokenizer" - config.tokenizer_mode = "auto" - config.trust_remote_code = False - config.tokenizer_revision = None - return config + renderer_config.tokenizer = "test-tokenizer" + renderer_config.tokenizer_mode = "auto" + renderer_config.tokenizer_revision = None + + return renderer_config @pytest.fixture def mock_scheduler_config(self): @@ -39,10 +46,10 @@ class TestReasoningStructuredOutput: return config @pytest.fixture - def mock_vllm_config(self, mock_model_config, mock_scheduler_config): + def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config): """Create a mock VllmConfig.""" config = Mock(spec=VllmConfig) - config.model_config = mock_model_config + config.renderer_config = mock_renderer_config config.scheduler_config = mock_scheduler_config config.structured_outputs_config = Mock() config.structured_outputs_config.reasoning_parser = None diff --git a/tests/v1/tpu/worker/test_tpu_model_runner.py b/tests/v1/tpu/worker/test_tpu_model_runner.py index cfc06666e7984..080d23863652d 100644 --- a/tests/v1/tpu/worker/test_tpu_model_runner.py +++ b/tests/v1/tpu/worker/test_tpu_model_runner.py @@ -7,6 +7,7 @@ from vllm.attention.layer import Attention from vllm.config import ( CacheConfig, ModelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -45,6 +46,7 @@ def get_vllm_config(): ) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, ) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 7b8c4268a5237..464e3ab99c76d 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -13,6 +13,7 @@ from vllm.config import ( CacheConfig, ModelConfig, ParallelConfig, + RendererConfig, SchedulerConfig, VllmConfig, set_current_vllm_config, @@ -101,6 +102,7 @@ def get_vllm_config(): parallel_config = ParallelConfig() vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, @@ -811,6 +813,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER) vllm_config = VllmConfig( model_config=model_config, + renderer_config=RendererConfig(model_config=model_config), cache_config=cache_config, scheduler_config=scheduler_config, parallel_config=parallel_config, diff --git a/vllm/config/__init__.py b/vllm/config/__init__.py index 0f84f3ca9d3e3..a4f9fd8d28292 100644 --- a/vllm/config/__init__.py +++ b/vllm/config/__init__.py @@ -24,6 +24,7 @@ from vllm.config.multimodal import MultiModalConfig from vllm.config.observability import ObservabilityConfig from vllm.config.parallel import EPLBConfig, ParallelConfig from vllm.config.pooler import PoolerConfig +from vllm.config.renderer import RendererConfig from vllm.config.scheduler import SchedulerConfig from 
vllm.config.speculative import SpeculativeConfig from vllm.config.speech_to_text import SpeechToTextConfig @@ -81,6 +82,8 @@ __all__ = [ "ParallelConfig", # From vllm.config.pooler "PoolerConfig", + # From vllm.config.renderer + "RendererConfig", # From vllm.config.scheduler "SchedulerConfig", # From vllm.config.speculative diff --git a/vllm/config/model.py b/vllm/config/model.py index 509a9c5e162f7..b0d4fb8e01e64 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -36,7 +36,6 @@ from vllm.transformers_utils.config import ( uses_xdrope_dim, ) from vllm.transformers_utils.gguf_utils import ( - is_gguf, is_remote_gguf, maybe_patch_hf_config_from_gguf, split_remote_gguf, @@ -83,7 +82,6 @@ TaskOption = Literal[ "transcription", "draft", ] -TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"] LogprobsMode = Literal[ "raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs" @@ -131,18 +129,6 @@ class ModelConfig: Note that the model may support other tasks using the same model runner. """ - tokenizer: SkipValidation[str] = None # type: ignore - """Name or path of the Hugging Face tokenizer to use. If unspecified, model - name or path will be used.""" - tokenizer_mode: TokenizerMode | str = "auto" - """Tokenizer mode:\n - - "auto" will use the tokenizer from `mistral_common` for Mistral models - if available, otherwise it will use the "hf" tokenizer.\n - - "hf" will use the fast tokenizer if available.\n - - "slow" will always use the slow tokenizer.\n - - "mistral" will always use the tokenizer from `mistral_common`.\n - - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n - - Other custom values can be supported via plugins.""" trust_remote_code: bool = False """Trust remote code (e.g., from HuggingFace) when downloading the model and tokenizer.""" @@ -168,13 +154,6 @@ class ModelConfig: hf_config_path: str | None = None """Name or path of the Hugging Face config to use. If unspecified, model name or path will be used.""" - allowed_local_media_path: str = "" - """Allowing API requests to read local images or videos from directories - specified by the server file system. This is a security risk. Should only - be enabled in trusted environments.""" - allowed_media_domains: list[str] | None = None - """If set, only media URLs that belong to this domain can be used for - multi-modal inputs. """ revision: str | None = None """The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" @@ -182,10 +161,6 @@ class ModelConfig: """The specific revision to use for the model code on the Hugging Face Hub. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.""" - tokenizer_revision: str | None = None - """The specific revision to use for the tokenizer on the Hugging Face Hub. - It can be a branch name, a tag name, or a commit id. If unspecified, will - use the default version.""" max_model_len: SkipValidation[int] = None # type: ignore """Model context length (prompt and output). If unspecified, will be automatically derived from the model config. @@ -230,10 +205,6 @@ class ModelConfig: preventing potential numerical issues. Note that even if this is set to False, cascade attention will be only used when the heuristic tells that it's beneficial.""" - skip_tokenizer_init: bool = False - """Skip initialization of tokenizer and detokenizer. 
Expects valid - `prompt_token_ids` and `None` for prompt from the input. The generated - output will contain token ids.""" enable_prompt_embeds: bool = False """If `True`, enables passing text embeddings as inputs via the `prompt_embeds` key. @@ -294,8 +265,6 @@ class ModelConfig: logits_processors: list[str | type[LogitsProcessor]] | None = None """One or more logits processors' fully-qualified class names or class definitions""" - io_processor_plugin: str | None = None - """IOProcessor plugin name to load at model startup""" # Pooler config pooler_config: PoolerConfig | None = None @@ -308,7 +277,6 @@ class ModelConfig: from the architecture of `self.model`.""" limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None enable_mm_embeds: InitVar[bool | None] = None - media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None mm_processor_kwargs: InitVar[dict[str, Any] | None] = None mm_processor_cache_gb: InitVar[float | None] = None mm_processor_cache_type: InitVar[MMCacheType | None] = None @@ -335,18 +303,12 @@ class ModelConfig: "runner", "convert", "task", - "tokenizer", - "tokenizer_mode", "seed", "hf_config_path", - "allowed_local_media_path", - "allowed_media_domains", - "tokenizer_revision", "spec_target_max_model_len", "enforce_eager", "logprobs_mode", "disable_cascade_attn", - "skip_tokenizer_init", "served_model_name", "config_format", "hf_token", @@ -354,11 +316,9 @@ class ModelConfig: "logits_processor_pattern", "override_attention_dtype", "logits_processors", - "io_processor_plugin", "pooler_config", "multimodal_config", "limit_mm_per_prompt", - "media_io_kwargs", "mm_processor_kwargs", "mm_processor_cache_gb", "mm_processor_cache_type", @@ -423,7 +383,6 @@ class ModelConfig: # Multimodal config init vars limit_mm_per_prompt: dict[str, int | dict[str, int]] | None, enable_mm_embeds: bool | None, - media_io_kwargs: dict[str, dict[str, Any]] | None, mm_processor_kwargs: dict[str, Any] | None, mm_processor_cache_gb: float | None, mm_processor_cache_type: MMCacheType | None, @@ -438,13 +397,8 @@ class ModelConfig: self.served_model_name = get_served_model_name( self.model, self.served_model_name ) - self.model = maybe_model_redirect(self.model) - # The tokenizer is consistent with the model by default. 
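The tokenizer- and media-related options removed from ModelConfig in this hunk move to the new RendererConfig introduced later in the patch; a hedged sketch of where those arguments now go (all values are placeholders):

from vllm.config import ModelConfig, RendererConfig

model_config = ModelConfig("facebook/opt-125m")
renderer_config = RendererConfig(
    model_config=model_config,
    tokenizer="facebook/opt-125m",
    tokenizer_mode="auto",
    tokenizer_revision=None,
    skip_tokenizer_init=False,
    io_processor_plugin=None,
    media_io_kwargs={"video": {"num_frames": 40}},
    allowed_local_media_path="",
    allowed_media_domains=None,
)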
- if self.tokenizer is None: - self.tokenizer = self.model - if self.tokenizer_revision is None: - self.tokenizer_revision = self.revision - self.tokenizer = maybe_model_redirect(self.tokenizer) + self.original_model = self.model + self.model = maybe_model_redirect(self.original_model) if isinstance(self.hf_config_path, str): self.hf_config_path = maybe_model_redirect(self.hf_config_path) @@ -465,7 +419,7 @@ class ModelConfig: hf_overrides_kw[key] = value hf_overrides_fn = None - self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer) + self.maybe_pull_model_for_runai(self.model) from vllm.platforms import current_platform @@ -648,7 +602,8 @@ class ModelConfig: ) self.original_max_model_len = self.max_model_len - self.max_model_len = self.get_and_verify_max_len(self.max_model_len) + self.recalculate_max_model_len(self.original_max_model_len) + # Init multimodal config if needed if self._model_info.supports_multimodal: if ( @@ -664,7 +619,6 @@ class ModelConfig: mm_config_kwargs = dict( limit_per_prompt=limit_mm_per_prompt, enable_mm_embeds=enable_mm_embeds, - media_io_kwargs=media_io_kwargs, mm_processor_kwargs=mm_processor_kwargs, mm_processor_cache_gb=mm_processor_cache_gb, mm_processor_cache_type=mm_processor_cache_type, @@ -682,16 +636,8 @@ class ModelConfig: self.multimodal_config = MultiModalConfig(**mm_config_kwargs) - # Multimodal GGUF models must use original repo for mm processing - if is_gguf(self.tokenizer) and self.is_multimodal_model: - raise ValueError( - "Loading a multimodal GGUF model needs to use original " - "tokenizer. Please specify the unquantized hf model's " - "repo name or path using the --tokenizer argument." - ) - if self.disable_sliding_window: - # Set after get_and_verify_max_len to ensure that max_model_len + # Set after recalculate_max_model_len to ensure that max_model_len # can be correctly capped to sliding window size self.hf_text_config.sliding_window = None @@ -715,10 +661,9 @@ class ModelConfig: @model_validator(mode="after") def validate_model_config_after(self: "ModelConfig") -> "ModelConfig": - if not isinstance(self.tokenizer, str): - raise ValueError("tokenizer must be a string after __post_init__.") if not isinstance(self.max_model_len, int): raise ValueError("max_model_len must be an integer after __post_init__.") + return self def _get_transformers_backend_cls(self) -> str: @@ -767,49 +712,17 @@ class ModelConfig: """The architecture vllm actually used.""" return self._architecture - def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None: - """Pull model/tokenizer from Object Storage to temporary - directory when needed. 
- - Args: - model: Model name or path - tokenizer: Tokenizer name or path - """ - - if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)): + def maybe_pull_model_for_runai(self, model: str) -> None: + """Pull model from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(model): return - if is_runai_obj_uri(model): - object_storage_model = ObjectStorageModel(url=model) - object_storage_model.pull_files( - model, allow_pattern=["*.model", "*.py", "*.json"] - ) - self.model_weights = model - self.model = object_storage_model.dir - - # If tokenizer is same as model, download to same directory - if model == tokenizer: - object_storage_model.pull_files( - model, - ignore_pattern=[ - "*.pt", - "*.safetensors", - "*.bin", - "*.tensors", - "*.pth", - ], - ) - self.tokenizer = object_storage_model.dir - return - - # Only download tokenizer if needed and not already handled - if is_runai_obj_uri(tokenizer): - object_storage_tokenizer = ObjectStorageModel(url=tokenizer) - object_storage_tokenizer.pull_files( - model, - ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], - ) - self.tokenizer = object_storage_tokenizer.dir + object_storage_model = ObjectStorageModel(url=model) + object_storage_model.pull_files( + model, allow_pattern=["*.model", "*.py", "*.json"] + ) + self.model_weights = model + self.model = object_storage_model.dir def _get_encoder_config(self): model = self.model @@ -1712,30 +1625,38 @@ class ModelConfig: return dense_modules[-1]["out_features"] return self.get_hidden_size() - def get_and_verify_max_len(self, max_model_len: int): + def recalculate_max_model_len( + self, + original_max_model_len: int | None, + *, + tokenizer: str | None = None, + tokenizer_revision: str | None = None, + ) -> None: # Consider max_model_len in tokenizer_config only when # pooling models use absolute position_embedding. + # NOTE: For simplicity we assume `args.model == args.tokenizer` + # since this is tokenizer_config = None if ( self.runner_type == "pooling" and getattr(self.hf_config, "position_embedding_type", "") == "absolute" ): tokenizer_config = try_get_tokenizer_config( - self.tokenizer, + tokenizer or self.model, trust_remote_code=self.trust_remote_code, - revision=self.tokenizer_revision, + revision=tokenizer_revision or self.revision, ) - max_model_len = _get_and_verify_max_len( + + self.max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, tokenizer_config=tokenizer_config, - max_model_len=max_model_len, + max_model_len=original_max_model_len, disable_sliding_window=self.disable_sliding_window, sliding_window=self.get_sliding_window(), spec_target_max_model_len=self.spec_target_max_model_len, encoder_config=self.encoder_config, ) - logger.info("Using max model len %s", max_model_len) - return max_model_len + logger.info("Using max model len %s", self.max_model_len) @property def attn_type(self) -> AttnTypeStr: diff --git a/vllm/config/multimodal.py b/vllm/config/multimodal.py index 8a2936de96d6f..37e2f6b4d419a 100644 --- a/vllm/config/multimodal.py +++ b/vllm/config/multimodal.py @@ -79,10 +79,6 @@ class MultiModalConfig: WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed. Only enable this flag for trusted users!""" - media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) - """Additional args passed to process media inputs, keyed by modalities. 
- For example, to set num_frames for video, set - `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" mm_processor_kwargs: dict[str, object] | None = None """Arguments to be forwarded to the model's processor for multi-modal data, e.g., image processor. Overrides for the multi-modal processor obtained diff --git a/vllm/config/renderer.py b/vllm/config/renderer.py new file mode 100644 index 0000000000000..36a922b93ca05 --- /dev/null +++ b/vllm/config/renderer.py @@ -0,0 +1,109 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any, Literal + +from pydantic import Field, SkipValidation +from pydantic.dataclasses import dataclass + +from vllm.config.model import ModelConfig +from vllm.config.utils import config +from vllm.transformers_utils.gguf_utils import is_gguf +from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri +from vllm.transformers_utils.utils import maybe_model_redirect + +TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"] + + +@config +@dataclass +class RendererConfig: + """Configuration for the renderer.""" + + # NOTE: In reality, this is a required argument. + # We provide a dummy default value here to generate the CLI args. + model_config: SkipValidation[ModelConfig] = None # type: ignore + """Provides model context to the renderer.""" + + tokenizer: str = "" + """Name or path of the Hugging Face tokenizer to use. If unspecified, model + name or path will be used.""" + tokenizer_mode: TokenizerMode | str = "auto" + """Tokenizer mode:\n + - "auto" will use the tokenizer from `mistral_common` for Mistral models + if available, otherwise it will use the "hf" tokenizer.\n + - "hf" will use the fast tokenizer if available.\n + - "slow" will always use the slow tokenizer.\n + - "mistral" will always use the tokenizer from `mistral_common`.\n + - "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n + - Other custom values can be supported via plugins.""" + tokenizer_revision: str | None = None + """The specific revision to use for the tokenizer on the Hugging Face Hub. + It can be a branch name, a tag name, or a commit id. If unspecified, will + use the default version.""" + skip_tokenizer_init: bool = False + """Skip initialization of tokenizer and detokenizer. Expects valid + `prompt_token_ids` and `None` for prompt from the input. The generated + output will contain token ids.""" + + io_processor_plugin: str | None = None + """IOProcessor plugin name to load at model startup.""" + + media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict) + """Additional args passed to process media inputs, keyed by modalities. + For example, to set num_frames for video, set + `--media-io-kwargs '{"video": {"num_frames": 40} }'`""" + allowed_local_media_path: str = "" + """Allowing API requests to read local images or videos from directories + specified by the server file system. This is a security risk. Should only + be enabled in trusted environments.""" + allowed_media_domains: list[str] | None = None + """If set, only media URLs that belong to this domain can be used for + multi-modal inputs. """ + + @property + def trust_remote_code(self) -> bool: + return self.model_config.trust_remote_code + + def __post_init__(self) -> None: + model_config = self.model_config + + # The tokenizer is consistent with the model by default. 
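Per the defaulting logic that follows this comment, an unset tokenizer falls back to the (pre-redirect) model; a short sketch, assuming no HF model redirect applies to this id:

from vllm.config import ModelConfig, RendererConfig

model_config = ModelConfig("facebook/opt-125m")
renderer_config = RendererConfig(model_config=model_config)
assert renderer_config.tokenizer == "facebook/opt-125m"  # defaults to the model
assert renderer_config.tokenizer_revision == model_config.revision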
+ if not self.tokenizer: + self.tokenizer = ( + ModelConfig.model + if model_config is None + else model_config.original_model + ) + if not self.tokenizer_revision: + self.tokenizer_revision = ( + ModelConfig.revision if model_config is None else model_config.revision + ) + + self.original_tokenizer = self.tokenizer + self.tokenizer = maybe_model_redirect(self.original_tokenizer) + self.maybe_pull_tokenizer_for_runai(self.tokenizer) + + # Multimodal GGUF models must use original repo for mm processing + is_multimodal_model = ( + ModelConfig.is_multimodal_model + if model_config is None + else model_config.is_multimodal_model + ) + if is_gguf(self.tokenizer) and is_multimodal_model: + raise ValueError( + "Loading a multimodal GGUF model needs to use original " + "tokenizer. Please specify the unquantized hf model's " + "repo name or path using the --tokenizer argument." + ) + + def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None: + """Pull tokenizer from Object Storage to temporary directory when needed.""" + if not is_runai_obj_uri(tokenizer): + return + + object_storage_tokenizer = ObjectStorageModel(url=tokenizer) + object_storage_tokenizer.pull_files( + tokenizer, + ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"], + ) + self.tokenizer = object_storage_tokenizer.dir diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index bf533bf14e55c..63b63eac907d2 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -322,16 +322,11 @@ class SpeculativeConfig: self.draft_model_config = ModelConfig( model=self.model, runner="draft", - tokenizer=self.target_model_config.tokenizer, - tokenizer_mode=self.target_model_config.tokenizer_mode, trust_remote_code=self.target_model_config.trust_remote_code, - allowed_local_media_path=self.target_model_config.allowed_local_media_path, - allowed_media_domains=self.target_model_config.allowed_media_domains, dtype=self.target_model_config.dtype, seed=self.target_model_config.seed, revision=self.revision, code_revision=self.code_revision, - tokenizer_revision=self.target_model_config.tokenizer_revision, spec_target_max_model_len=self.target_model_config.max_model_len, quantization=self.quantization, enforce_eager=self.target_model_config.enforce_eager, diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 36e4bd159dc72..417797c445b95 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -39,6 +39,7 @@ from .lora import LoRAConfig from .model import ModelConfig from .observability import ObservabilityConfig from .parallel import ParallelConfig +from .renderer import RendererConfig from .scheduler import SchedulerConfig from .speculative import SpeculativeConfig from .structured_outputs import StructuredOutputsConfig @@ -181,6 +182,8 @@ class VllmConfig: # try to download a model model_config: ModelConfig = Field(default=None) """Model configuration.""" + renderer_config: RendererConfig = Field(default_factory=RendererConfig) + """Renderer configuration.""" cache_config: CacheConfig = Field(default_factory=CacheConfig) """Cache configuration.""" parallel_config: ParallelConfig = Field(default_factory=ParallelConfig) @@ -741,7 +744,7 @@ class VllmConfig: from vllm.multimodal import MULTIMODAL_REGISTRY self.scheduler_config.max_num_encoder_input_tokens = ( - MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config) + MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config) ) logger.debug( "Encoder-decoder model detected: setting " @@ -1186,11 +1189,13 @@ class 
VllmConfig: computed_compile_ranges_split_points ) - def recalculate_max_model_len(self, max_model_len: int): - # Can only be called in try_verify_and_update_config - model_config = self.model_config - max_model_len = model_config.get_and_verify_max_len(max_model_len) - self.model_config.max_model_len = max_model_len + def recalculate_max_model_len(self, original_max_model_len: int | None) -> None: + # Can only be called during try_verify_and_update_config + self.model_config.recalculate_max_model_len( + original_max_model_len, + tokenizer=self.renderer_config.tokenizer, + tokenizer_revision=self.renderer_config.tokenizer_revision, + ) def try_verify_and_update_config(self): if self.model_config is None: @@ -1264,11 +1269,11 @@ class VllmConfig: return ( f"model={self.model_config.model!r}, " f"speculative_config={self.speculative_config!r}, " - f"tokenizer={self.model_config.tokenizer!r}, " - f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, " - f"tokenizer_mode={self.model_config.tokenizer_mode}, " + f"tokenizer={self.renderer_config.tokenizer!r}, " + f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, " + f"tokenizer_mode={self.renderer_config.tokenizer_mode}, " f"revision={self.model_config.revision}, " - f"tokenizer_revision={self.model_config.tokenizer_revision}, " + f"tokenizer_revision={self.renderer_config.tokenizer_revision}, " f"trust_remote_code={self.model_config.trust_remote_code}, " f"dtype={self.model_config.dtype}, " f"max_seq_len={self.model_config.max_model_len}, " diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index ceac5407af6e2..bd398abb0bf81 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -71,11 +71,11 @@ from vllm.config.model import ( ModelDType, RunnerOption, TaskOption, - TokenizerMode, ) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode from vllm.config.observability import DetailedTraceModules from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy +from vllm.config.renderer import RendererConfig, TokenizerMode from vllm.config.scheduler import SchedulerPolicy from vllm.config.utils import get_field from vllm.config.vllm import OptimizationLevel @@ -355,17 +355,12 @@ class EngineArgs: model: str = ModelConfig.model served_model_name: str | list[str] | None = ModelConfig.served_model_name - tokenizer: str | None = ModelConfig.tokenizer hf_config_path: str | None = ModelConfig.hf_config_path runner: RunnerOption = ModelConfig.runner convert: ConvertOption = ModelConfig.convert task: TaskOption | None = ModelConfig.task - skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds - tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode trust_remote_code: bool = ModelConfig.trust_remote_code - allowed_local_media_path: str = ModelConfig.allowed_local_media_path - allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains download_dir: str | None = LoadConfig.download_dir safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy load_format: str | LoadFormats = LoadConfig.load_format @@ -449,7 +444,6 @@ class EngineArgs: code_revision: str | None = ModelConfig.code_revision hf_token: bool | str | None = ModelConfig.hf_token hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides") - tokenizer_revision: str | None = ModelConfig.tokenizer_revision quantization: QuantizationMethods | None = ModelConfig.quantization enforce_eager: bool = 
ModelConfig.enforce_eager disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce @@ -458,9 +452,6 @@ class EngineArgs: ) enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings - media_io_kwargs: dict[str, dict[str, Any]] = get_field( - MultiModalConfig, "media_io_kwargs" - ) mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs disable_mm_preprocessor_cache: bool = False # DEPRECATED mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb @@ -474,9 +465,19 @@ class EngineArgs: mm_encoder_attn_backend: AttentionBackendEnum | str | None = ( MultiModalConfig.mm_encoder_attn_backend ) - io_processor_plugin: str | None = None skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling video_pruning_rate: float = MultiModalConfig.video_pruning_rate + # Renderer fields + tokenizer: str | None = None + tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode + tokenizer_revision: str | None = RendererConfig.tokenizer_revision + skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init + io_processor_plugin: str | None = None + media_io_kwargs: dict[str, dict[str, Any]] = get_field( + RendererConfig, "media_io_kwargs" + ) + allowed_local_media_path: str = RendererConfig.allowed_local_media_path + allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains # LoRA fields enable_lora: bool = False max_loras: int = LoRAConfig.max_loras @@ -627,25 +628,14 @@ class EngineArgs: model_group.add_argument("--runner", **model_kwargs["runner"]) model_group.add_argument("--convert", **model_kwargs["convert"]) model_group.add_argument("--task", **model_kwargs["task"], deprecated=True) - model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"]) - model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"]) model_group.add_argument( "--trust-remote-code", **model_kwargs["trust_remote_code"] ) model_group.add_argument("--dtype", **model_kwargs["dtype"]) model_group.add_argument("--seed", **model_kwargs["seed"]) model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"]) - model_group.add_argument( - "--allowed-local-media-path", **model_kwargs["allowed_local_media_path"] - ) - model_group.add_argument( - "--allowed-media-domains", **model_kwargs["allowed_media_domains"] - ) model_group.add_argument("--revision", **model_kwargs["revision"]) model_group.add_argument("--code-revision", **model_kwargs["code_revision"]) - model_group.add_argument( - "--tokenizer-revision", **model_kwargs["tokenizer_revision"] - ) model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"]) model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"]) model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"]) @@ -657,9 +647,6 @@ class EngineArgs: model_group.add_argument( "--disable-cascade-attn", **model_kwargs["disable_cascade_attn"] ) - model_group.add_argument( - "--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"] - ) model_group.add_argument( "--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"] ) @@ -698,8 +685,34 @@ class EngineArgs: model_group.add_argument( "--logits-processors", **model_kwargs["logits_processors"] ) - model_group.add_argument( - "--io-processor-plugin", **model_kwargs["io_processor_plugin"] + + # Renderer arguments + renderer_kwargs = get_kwargs(RendererConfig) + renderer_group = parser.add_argument_group( + 
title="RendererConfig", + description=RendererConfig.__doc__, + ) + renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"]) + renderer_group.add_argument( + "--tokenizer-mode", **renderer_kwargs["tokenizer_mode"] + ) + renderer_group.add_argument( + "--tokenizer-revision", **renderer_kwargs["tokenizer_revision"] + ) + renderer_group.add_argument( + "--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"] + ) + renderer_group.add_argument( + "--media-io-kwargs", **renderer_kwargs["media_io_kwargs"] + ) + renderer_group.add_argument( + "--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"] + ) + renderer_group.add_argument( + "--allowed-media-domains", **renderer_kwargs["allowed_media_domains"] + ) + renderer_group.add_argument( + "--io-processor-plugin", **renderer_kwargs["io_processor_plugin"] ) # Model loading arguments @@ -949,9 +962,6 @@ class EngineArgs: multimodal_group.add_argument( "--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"] ) - multimodal_group.add_argument( - "--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"] - ) multimodal_group.add_argument( "--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"] ) @@ -1255,18 +1265,13 @@ class EngineArgs: runner=self.runner, convert=self.convert, task=self.task, - tokenizer=self.tokenizer, - tokenizer_mode=self.tokenizer_mode, trust_remote_code=self.trust_remote_code, - allowed_local_media_path=self.allowed_local_media_path, - allowed_media_domains=self.allowed_media_domains, dtype=self.dtype, seed=self.seed, revision=self.revision, code_revision=self.code_revision, hf_token=self.hf_token, hf_overrides=self.hf_overrides, - tokenizer_revision=self.tokenizer_revision, max_model_len=self.max_model_len, quantization=self.quantization, enforce_eager=self.enforce_eager, @@ -1274,13 +1279,11 @@ class EngineArgs: logprobs_mode=self.logprobs_mode, disable_sliding_window=self.disable_sliding_window, disable_cascade_attn=self.disable_cascade_attn, - skip_tokenizer_init=self.skip_tokenizer_init, enable_prompt_embeds=self.enable_prompt_embeds, served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, enable_mm_embeds=self.enable_mm_embeds, interleave_mm_strings=self.interleave_mm_strings, - media_io_kwargs=self.media_io_kwargs, skip_mm_profiling=self.skip_mm_profiling, config_format=self.config_format, mm_processor_kwargs=self.mm_processor_kwargs, @@ -1298,7 +1301,6 @@ class EngineArgs: override_attention_dtype=self.override_attention_dtype, logits_processors=self.logits_processors, video_pruning_rate=self.video_pruning_rate, - io_processor_plugin=self.io_processor_plugin, ) def validate_tensorizer_args(self): @@ -1394,9 +1396,25 @@ class EngineArgs: ) model_config = self.create_model_config() - self.model = model_config.model - self.tokenizer = model_config.tokenizer + renderer_config = RendererConfig( + model_config=model_config, + tokenizer=self.tokenizer or "", + tokenizer_mode=self.tokenizer_mode, + tokenizer_revision=self.tokenizer_revision, + skip_tokenizer_init=self.skip_tokenizer_init, + io_processor_plugin=self.io_processor_plugin, + media_io_kwargs=self.media_io_kwargs, + allowed_local_media_path=self.allowed_local_media_path, + allowed_media_domains=self.allowed_media_domains, + ) + model_config.recalculate_max_model_len( + model_config.original_max_model_len, + tokenizer=renderer_config.tokenizer, + tokenizer_revision=renderer_config.tokenizer_revision, + ) + + self.model = model_config.model 
self._check_feature_supported(model_config) self._set_default_chunked_prefill_and_prefix_caching_args(model_config) self._set_default_max_num_seqs_and_batched_tokens_args( @@ -1768,6 +1786,7 @@ class EngineArgs: ) config = VllmConfig( model_config=model_config, + renderer_config=renderer_config, cache_config=cache_config, parallel_config=parallel_config, scheduler_config=scheduler_config, diff --git a/vllm/engine/protocol.py b/vllm/engine/protocol.py index d94951a0cffc8..7b60e7f89861b 100644 --- a/vllm/engine/protocol.py +++ b/vllm/engine/protocol.py @@ -5,7 +5,7 @@ from abc import ABC, abstractmethod from collections.abc import AsyncGenerator, Iterable, Mapping from typing import Any -from vllm.config import ModelConfig, VllmConfig +from vllm.config import ModelConfig, RendererConfig, VllmConfig from vllm.inputs.data import PromptType from vllm.lora.request import LoRARequest from vllm.outputs import PoolingRequestOutput, RequestOutput @@ -22,6 +22,7 @@ class EngineClient(ABC): """Protocol class for Clients to Engine""" vllm_config: VllmConfig + renderer_config: RendererConfig model_config: ModelConfig input_processor: InputProcessor io_processor: IOProcessor | None diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index aceaa8bd45b81..5ad256c2f3eb3 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor from typing_extensions import Required, TypedDict from vllm import envs -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.logger import init_logger from vllm.model_executor.models import SupportsMultiModal from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict @@ -452,9 +452,10 @@ This is needed because `lru_cache` does not cache when an exception happens. 
def _try_get_processor_chat_template( tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast, - model_config: ModelConfig, + *, + trust_remote_code: bool, ) -> str | None: - cache_key = (tokenizer.name_or_path, model_config.trust_remote_code) + cache_key = (tokenizer.name_or_path, trust_remote_code) if cache_key in _PROCESSOR_CHAT_TEMPLATES: return _PROCESSOR_CHAT_TEMPLATES[cache_key] @@ -466,7 +467,7 @@ def _try_get_processor_chat_template( PreTrainedTokenizerFast, ProcessorMixin, ), - trust_remote_code=model_config.trust_remote_code, + trust_remote_code=trust_remote_code, ) if ( isinstance(processor, ProcessorMixin) @@ -499,7 +500,10 @@ def resolve_hf_chat_template( # 2nd priority: AutoProcessor chat template, unless tool calling is enabled if tools is None: - chat_template = _try_get_processor_chat_template(tokenizer, model_config) + chat_template = _try_get_processor_chat_template( + tokenizer, + trust_remote_code=model_config.trust_remote_code, + ) if chat_template is not None: return chat_template @@ -513,10 +517,10 @@ def resolve_hf_chat_template( exc_info=True, ) # 4th priority: Predefined fallbacks path = get_chat_template_fallback_path( model_type=model_config.hf_config.model_type, - tokenizer_name_or_path=model_config.tokenizer, + tokenizer_name_or_path=tokenizer.name_or_path, ) if path is not None: logger.info_once( @@ -538,14 +542,14 @@ def _resolve_chat_template_content_format( tools: list[dict[str, Any]] | None, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)): hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) else: hf_chat_template = None @@ -595,7 +599,7 @@ def resolve_chat_template_content_format( given_format: ChatTemplateContentFormatOption, tokenizer: TokenizerLike | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> _ChatTemplateContentFormat: if given_format != "auto": return given_format @@ -604,7 +608,7 @@ def resolve_chat_template_content_format( chat_template, tools, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _log_chat_template_content_format( @@ -627,32 +631,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): maximum per prompt.
""" - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self._model_config = model_config + self._renderer_config = renderer_config self._items_by_modality = defaultdict[str, list[_T | None]](list) self._uuids_by_modality = defaultdict[str, list[str | None]](list) @property - def model_config(self) -> ModelConfig: - return self._model_config + def renderer_config(self) -> RendererConfig: + return self._renderer_config @cached_property def model_cls(self) -> type[SupportsMultiModal]: from vllm.model_executor.model_loader import get_model_cls - model_cls = get_model_cls(self.model_config) + model_cls = get_model_cls(self.renderer_config.model_config) return cast(type[SupportsMultiModal], model_cls) @property def allowed_local_media_path(self): - return self._model_config.allowed_local_media_path + return self._renderer_config.allowed_local_media_path @property def allowed_media_domains(self): - return self._model_config.allowed_media_domains + return self._renderer_config.allowed_media_domains @property def mm_registry(self): @@ -660,7 +664,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]): @cached_property def mm_processor(self): - return self.mm_registry.create_processor(self.model_config) + return self.mm_registry.create_processor(self.renderer_config) def add( self, @@ -851,19 +855,20 @@ class MultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) - self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image = self._connector.fetch_image(image_url) if image_url else None @@ -963,18 +968,20 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser): super().__init__() self._tracker = tracker - multimodal_config = self._tracker.model_config.multimodal_config - media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None) self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load( envs.VLLM_MEDIA_CONNECTOR, - media_io_kwargs=media_io_kwargs, + media_io_kwargs=self.renderer_config.media_io_kwargs, allowed_local_media_path=tracker.allowed_local_media_path, allowed_media_domains=tracker.allowed_media_domains, ) + @property + def renderer_config(self) -> RendererConfig: + return self._tracker.renderer_config + @property def model_config(self) -> ModelConfig: - return self._tracker.model_config + return self.renderer_config.model_config def parse_image(self, image_url: str | None, uuid: str | None = None) -> None: image_coro = self._connector.fetch_image_async(image_url) if image_url else None @@ -1604,15 +1611,17 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None: def parse_chat_messages( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ 
list[ConversationMessage], MultiModalDataDict | None, MultiModalUUIDDict | None, ]: + model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1635,15 +1644,17 @@ def parse_chat_messages( def parse_chat_messages_futures( messages: list[ChatCompletionMessageParam], - model_config: ModelConfig, + renderer_config: RendererConfig, content_format: _ChatTemplateContentFormat, ) -> tuple[ list[ConversationMessage], Awaitable[MultiModalDataDict | None], MultiModalUUIDDict | None, ]: + model_config = renderer_config.model_config + conversation: list[ConversationMessage] = [] - mm_tracker = AsyncMultiModalItemTracker(model_config) + mm_tracker = AsyncMultiModalItemTracker(renderer_config) for msg in messages: sub_messages = _parse_chat_message_content( @@ -1748,14 +1759,14 @@ def apply_hf_chat_template( chat_template: str | None, tools: list[dict[str, Any]] | None, *, - model_config: ModelConfig, + renderer_config: RendererConfig, **kwargs: Any, ) -> str: hf_chat_template = resolve_hf_chat_template( tokenizer, chat_template=chat_template, tools=tools, - model_config=model_config, + model_config=renderer_config.model_config, ) if hf_chat_template is None: diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 913324fd5f9c3..6b3cb26afb626 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -29,8 +29,8 @@ from vllm.config.model import ( HfOverrides, ModelDType, RunnerOption, - TokenizerMode, ) +from vllm.config.renderer import TokenizerMode from vllm.engine.arg_utils import EngineArgs from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, @@ -343,6 +343,7 @@ class LLM: logger.info("Supported tasks: %s", supported_tasks) self.supported_tasks = supported_tasks + self.renderer_config = self.llm_engine.renderer_config self.model_config = self.llm_engine.model_config self.input_processor = self.llm_engine.input_processor self.io_processor = self.llm_engine.io_processor @@ -808,13 +809,13 @@ class LLM: list_of_messages = [cast(list[ChatCompletionMessageParam], messages)] tokenizer = self.get_tokenizer() - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tools, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) _chat_template_kwargs: dict[str, Any] = dict( @@ -833,7 +834,7 @@ class LLM: # the chat message parsing for it. 
conversation, mm_data, mm_uuids = parse_chat_messages( msgs, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -847,7 +848,7 @@ class LLM: prompt_str = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) # Special tokens are already included in chat templates so @@ -1290,6 +1291,7 @@ class LLM: lora_request: list[LoRARequest] | LoRARequest | None = None, tokenization_kwargs: dict[str, Any] | None = None, ) -> list[ScoringRequestOutput]: + renderer_config = self.renderer_config model_config = self.model_config if isinstance(tokenizer, MistralTokenizer): @@ -1317,7 +1319,7 @@ class LLM: for q, d in input_pairs: _, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=renderer_config, data_1=q, data_2=d, tokenizer=tokenizer, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 7be601d824f34..d77d611a2654d 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1099,7 +1099,7 @@ async def init_app_state( logger.info("Supported tasks: %s", supported_tasks) resolved_chat_template = await process_chat_template( - args.chat_template, engine_client, vllm_config.model_config + args.chat_template, engine_client, vllm_config.renderer_config ) if args.tool_server == "demo": diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py index 3e421e21e3e80..a9e72fb00c5bd 100644 --- a/vllm/entrypoints/openai/serving_completion.py +++ b/vllm/entrypoints/openai/serving_completion.py @@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index 99936f588f28b..d887cf48d89f9 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ -291,6 +291,7 @@ class OpenAIServing: self.input_processor = self.models.input_processor self.io_processor = self.models.io_processor + self.renderer_config = self.models.renderer_config self.model_config = self.models.model_config self.max_model_len = self.model_config.max_model_len @@ -1100,18 +1101,18 @@ class OpenAIServing: Sequence[RequestPrompt], list[EngineTokensPrompt], ]: - model_config = self.model_config + renderer_config = self.renderer_config resolved_content_format = resolve_chat_template_content_format( chat_template, tool_dicts, chat_template_content_format, tokenizer, - model_config=model_config, + renderer_config=renderer_config, ) conversation, mm_data_future, mm_uuids = parse_chat_messages_futures( messages, - model_config, + renderer_config, content_format=resolved_content_format, ) @@ -1138,14 +1139,14 @@ class OpenAIServing: request_prompt = tokenizer.apply_chat_template( conversation=conversation, messages=messages, - model_config=model_config, + model_config=renderer_config.model_config, **_chat_template_kwargs, ) else: request_prompt = apply_hf_chat_template( tokenizer=tokenizer, conversation=conversation, - model_config=model_config, + renderer_config=renderer_config, **_chat_template_kwargs, ) diff --git a/vllm/entrypoints/openai/serving_models.py b/vllm/entrypoints/openai/serving_models.py 
index 953398a9a72ae..ec65e659383d8 100644 --- a/vllm/entrypoints/openai/serving_models.py +++ b/vllm/entrypoints/openai/serving_models.py @@ -71,6 +71,7 @@ class OpenAIServingModels: self.input_processor = self.engine_client.input_processor self.io_processor = self.engine_client.io_processor + self.renderer_config = self.engine_client.renderer_config self.model_config = self.engine_client.model_config self.max_model_len = self.model_config.max_model_len diff --git a/vllm/entrypoints/openai/speech_to_text.py b/vllm/entrypoints/openai/speech_to_text.py index cea9924ebbaca..5fd79eed1909b 100644 --- a/vllm/entrypoints/openai/speech_to_text.py +++ b/vllm/entrypoints/openai/speech_to_text.py @@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing): self.task_type = task_type self.asr_config = self.model_cls.get_speech_to_text_config( - self.model_config, task_type + self.renderer_config, task_type ) self.enable_force_include_usage = enable_force_include_usage @@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing): self.tokenizer = cast( PreTrainedTokenizerBase, get_tokenizer( - tokenizer_name=self.model_config.tokenizer, - tokenizer_mode=self.model_config.tokenizer_mode, + tokenizer_name=self.renderer_config.tokenizer, + tokenizer_mode=self.renderer_config.tokenizer_mode, ), ) @@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing): prompt = self.model_cls.get_generation_prompt( audio=chunk, stt_config=self.asr_config, - model_config=self.model_config, + renderer_config=self.renderer_config, language=language, task_type=self.task_type, request_prompt=request.prompt, @@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing): if res.prompt_token_ids is not None: num_prompt_tokens = len(res.prompt_token_ids) if audio_tokens := self.model_cls.get_num_audio_tokens( - audio_duration_s, self.asr_config, self.model_config + audio_duration_s, self.asr_config, self.renderer_config ): num_prompt_tokens += audio_tokens diff --git a/vllm/entrypoints/pooling/pooling/serving.py b/vllm/entrypoints/pooling/pooling/serving.py index 7fb767e26d019..cd28ccba9ef95 100644 --- a/vllm/entrypoints/pooling/pooling/serving.py +++ b/vllm/entrypoints/pooling/pooling/serving.py @@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing): try: lora_request = self._maybe_get_adapters(request) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: tokenizer = await self.engine_client.get_tokenizer() diff --git a/vllm/entrypoints/pooling/score/serving.py b/vllm/entrypoints/pooling/score/serving.py index e5a66783005a6..f657fcefd3a86 100644 --- a/vllm/entrypoints/pooling/score/serving.py +++ b/vllm/entrypoints/pooling/score/serving.py @@ -160,10 +160,8 @@ class ServingScores(OpenAIServing): data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, ) -> tuple[str, TokensPrompt]: - model_config = self.model_config - full_prompt, engine_prompt = get_score_prompt( - model_config=model_config, + renderer_config=self.renderer_config, data_1=data_1, data_2=data_2, tokenizer=tokenizer, diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index 072ddd4c90b16..561adbe454f3c 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast from torch.nn import CosineSimilarity from typing_extensions import Required, TypedDict -from vllm.config import ModelConfig +from vllm.config import ModelConfig, RendererConfig from vllm.entrypoints.chat_utils import ( 
BaseMultiModalItemTracker, ChatCompletionContentPartImageEmbedsParam, @@ -88,9 +88,9 @@ def _validate_score_input_lens( def parse_score_data( data_1: str | ScoreContentPartParam, data_2: str | ScoreContentPartParam, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> tuple[str, str, MultiModalDataDict | None]: - mm_tracker = MultiModalItemTracker(model_config) + mm_tracker = MultiModalItemTracker(renderer_config) content_1 = _parse_score_content(data_1, mm_tracker) content_2 = _parse_score_content(data_2, mm_tracker) @@ -176,7 +176,7 @@ def post_process_tokens( def get_score_prompt( - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike, tokenization_kwargs: dict[str, Any], data_1: str | ScoreContentPartParam, @@ -185,11 +185,14 @@ def get_score_prompt( prompt_1, prompt_2, mm_data = parse_score_data( data_1, data_2, - model_config, + renderer_config, ) + from vllm.model_executor.model_loader import get_model_cls + model_config = renderer_config.model_config model = get_model_cls(model_config) + if supports_score_template(model): full_prompt = apply_score_template(model_config, prompt_1, prompt_2) prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) diff --git a/vllm/entrypoints/utils.py b/vllm/entrypoints/utils.py index daeeb995bc749..a81f73ac9e618 100644 --- a/vllm/entrypoints/utils.py +++ b/vllm/entrypoints/utils.py @@ -13,7 +13,7 @@ from fastapi import Request from fastapi.responses import JSONResponse, StreamingResponse from starlette.background import BackgroundTask, BackgroundTasks -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.engine.arg_utils import EngineArgs from vllm.engine.protocol import EngineClient from vllm.entrypoints.chat_utils import ( @@ -288,7 +288,7 @@ def process_lora_modules( async def process_chat_template( args_chat_template: Path | str | None, engine_client: EngineClient, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> str | None: resolved_chat_template = load_chat_template(args_chat_template) if resolved_chat_template is not None: @@ -305,7 +305,7 @@ async def process_chat_template( tokenizer=tokenizer, chat_template=None, tools=None, - model_config=model_config, + model_config=renderer_config.model_config, ) if hf_chat_template != resolved_chat_template: @@ -314,6 +314,6 @@ async def process_chat_template( "It is different from official chat template '%s'. 
" "This discrepancy may lead to performance degradation.", resolved_chat_template, - model_config.model, + renderer_config.model_config.model, ) return resolved_chat_template diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 0372b06d0017f..f534d102fc3b7 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing import Any, cast from typing_extensions import assert_never -from vllm.config import ModelConfig +from vllm.config import RendererConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -45,14 +45,15 @@ logger = init_logger(__name__) class InputPreprocessor: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, tokenizer: TokenizerLike | None, mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = renderer_config.model_config self.tokenizer = tokenizer self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache @@ -231,7 +232,7 @@ class InputPreprocessor: def _get_mm_processor(self) -> BaseMultiModalProcessor: if not hasattr(self, "_mm_processor"): self._mm_processor = self.mm_registry.create_processor( - self.model_config, + self.renderer_config, tokenizer=self.tokenizer, cache=self.mm_processor_cache, ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 007d847ac3b7b..a2700bd5a5016 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax( from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax( from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) false_id = tokenizer.convert_tokens_to_ids(tokens[0]) @@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.weight_utils import default_weight_loader - model_config = model.vllm_config.model_config + renderer_config = model.vllm_config.renderer_config quant_config = model.vllm_config.quant_config text_config = model.config.get_text_config() @@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te from vllm.tokenizers import get_tokenizer tokenizer = get_tokenizer( - model_config.tokenizer, - revision=model_config.tokenizer_revision, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + 
revision=renderer_config.tokenizer_revision, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, ) token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens] diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 1f07381c0cbd0..bd472474982ff 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN] self.sam_model = build_sam_vit_b() diff --git a/vllm/model_executor/models/deepseek_vl2.py b/vllm/model_executor/models/deepseek_vl2.py index 9f8faf9ed91ce..be03e1df81d22 100644 --- a/vllm/model_executor/models/deepseek_vl2.py +++ b/vllm/model_executor/models/deepseek_vl2.py @@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): self.projector_config = config.projector_config self.text_config = config.text_config - model_config = vllm_config.model_config - tokenizer = cached_tokenizer_from_config(model_config) + renderer_config = vllm_config.renderer_config + tokenizer = cached_tokenizer_from_config(renderer_config) self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN] self.vision = self._init_vision_module( diff --git a/vllm/model_executor/models/gemma3n_mm.py b/vllm/model_executor/models/gemma3n_mm.py index 7036118ada084..f82529d849a64 100644 --- a/vllm/model_executor/models/gemma3n_mm.py +++ b/vllm/model_executor/models/gemma3n_mm.py @@ -18,7 +18,7 @@ from transformers.models.gemma3n import ( ) from transformers.models.siglip import SiglipImageProcessorFast -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration( cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: Optional[str], task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -798,7 +798,9 @@ class Gemma3nForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: return SpeechToTextConfig( # Let's set this to 30 as suggested in the docs for now, although diff --git a/vllm/model_executor/models/granite_speech.py b/vllm/model_executor/models/granite_speech.py index a4e50f4086281..96645f20b79df 100644 --- a/vllm/model_executor/models/granite_speech.py +++ b/vllm/model_executor/models/granite_speech.py @@ -34,7 +34,7 @@ import torch.nn.functional as F from torch import nn from transformers import BatchFeature, PretrainedConfig -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from 
vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear @@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration( else: raise ValueError(f"Unsupported task type {task_type}") - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) chat = [dict(role="user", content=user_prompt)] prompt = tokenizer.apply_chat_template( chat, @@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """Get the number of audio tokens for an audio duration in sec.""" - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) hop_length = processor.audio_processor.melspec_kwargs["hop_length"] proj_win_size = processor.audio_processor.projector_window_size ds_rate = processor.audio_processor.projector_downsample_rate @@ -903,7 +903,9 @@ class GraniteSpeechForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: """Get the stt config for this model.""" # Default settings are reasonable for this model and we don't currently diff --git a/vllm/model_executor/models/gritlm.py b/vllm/model_executor/models/gritlm.py index 2aba626a7c737..b9f3ac8aee5f7 100644 --- a/vllm/model_executor/models/gritlm.py +++ b/vllm/model_executor/models/gritlm.py @@ -6,7 +6,7 @@ import numpy as np import torch import torch.nn as nn -from vllm.config import ModelConfig, VllmConfig +from vllm.config import RendererConfig, VllmConfig from vllm.logger import init_logger from vllm.model_executor.layers.pooler import ( DispatchPooler, @@ -29,12 +29,12 @@ logger = init_logger(__name__) class GritLMMeanPool(nn.Module): """As `MeanPool`, but only includes non-instruction tokens.""" - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = cached_tokenizer_from_config(self.renderer_config) # Collect the tokens needed for pattern matching. # "▁<" is different from "_<". 
The former uses "▁" to indicate that @@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module): class GritLMPooler(Pooler): - def __init__(self, model_config: ModelConfig): + def __init__(self, renderer_config: RendererConfig): super().__init__() - self.pooling = GritLMMeanPool(model_config) + self.pooling = GritLMMeanPool(renderer_config) self.head = PoolerHead(PoolerNormalize()) def get_supported_tasks(self) -> Set[PoolingTask]: @@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM): self.pooler = DispatchPooler( { "token_embed": Pooler.for_token_embed(pooler_config), - "embed": GritLMPooler(vllm_config.model_config), + "embed": GritLMPooler(vllm_config.renderer_config), } ) diff --git a/vllm/model_executor/models/interfaces.py b/vllm/model_executor/models/interfaces.py index 607ff55835f1d..4df91aaf8b7f2 100644 --- a/vllm/model_executor/models/interfaces.py +++ b/vllm/model_executor/models/interfaces.py @@ -19,7 +19,7 @@ from torch import Tensor from transformers.models.whisper.tokenization_whisper import LANGUAGES from typing_extensions import Self, TypeIs -from vllm.config import ModelConfig, SpeechToTextConfig +from vllm.config import RendererConfig, SpeechToTextConfig from vllm.inputs import TokensPrompt from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -887,7 +887,7 @@ class SupportsTranscription(Protocol): cls, audio: np.ndarray, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, @@ -930,7 +930,9 @@ class SupportsTranscription(Protocol): @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"] + cls, + renderer_config: RendererConfig, + task_type: Literal["transcribe", "translate"], ) -> SpeechToTextConfig: """Get the speech to text config for the ASR model.""" ... 
@@ -940,7 +942,7 @@ class SupportsTranscription(Protocol): cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR diff --git a/vllm/model_executor/models/interns1.py b/vllm/model_executor/models/interns1.py index 18985cefbf5ea..d75637da1d0e2 100644 --- a/vllm/model_executor/models/interns1.py +++ b/vllm/model_executor/models/interns1.py @@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo): def get_hf_processor(self, **kwargs: object) -> InternVLProcessor: hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs) hf_processor.video_processor = cached_video_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, processor_cls=InternVLVideoProcessor, size=hf_processor.image_processor.size, **kwargs, diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 6dfab595e5b92..4daaefd0c2709 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1169,16 +1169,17 @@ class NemotronH_Nano_VL_V2( self.mlp1 = self.mlp1.to(self.language_model.config.dtype) self.config = config - self.model_config = vllm_config.model_config # Pre-tokenize special tokens for video processing # to avoid repeated tokenization - tokenizer = cached_tokenizer_from_config(vllm_config.model_config) - self._img_start_token_ids = tokenizer.encode( + self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) + self._img_start_token_ids = self._tokenizer.encode( IMG_START, add_special_tokens=False ) - self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False) - self._img_context_token_ids = tokenizer.encode( + self._img_end_token_ids = self._tokenizer.encode( + IMG_END, add_special_tokens=False + ) + self._img_context_token_ids = self._tokenizer.encode( IMG_CONTEXT, add_special_tokens=False ) @@ -1364,7 +1365,7 @@ class NemotronH_Nano_VL_V2( input_embeds for the LLM. 
""" device = video_embeddings.device - tokenizer = cached_tokenizer_from_config(self.model_config) + tokenizer = self._tokenizer # Generate video replacement token IDs using get_video_repl # This tokenizes each frame separator independently, then uses pre-tokenized diff --git a/vllm/model_executor/models/nemotron_vl.py b/vllm/model_executor/models/nemotron_vl.py index 391980fc61f9e..797793e658f7b 100644 --- a/vllm/model_executor/models/nemotron_vl.py +++ b/vllm/model_executor/models/nemotron_vl.py @@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo): def get_image_processor(self, **kwargs: object): return cached_image_processor_from_config( - self.ctx.model_config, + self.ctx.renderer_config, **kwargs, ) diff --git a/vllm/model_executor/models/pixtral.py b/vllm/model_executor/models/pixtral.py index faf2d80d24bba..ebe743fa82a00 100644 --- a/vllm/model_executor/models/pixtral.py +++ b/vllm/model_executor/models/pixtral.py @@ -193,7 +193,7 @@ class PixtralProcessorAdapter: class PixtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") diff --git a/vllm/model_executor/models/voxtral.py b/vllm/model_executor/models/voxtral.py index 7b408248ec74c..0acd564e2e54f 100644 --- a/vllm/model_executor/models/voxtral.py +++ b/vllm/model_executor/models/voxtral.py @@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder from transformers import BatchFeature, TensorType, WhisperConfig from transformers.tokenization_utils_base import TextInput -from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.inputs.data import PromptType from vllm.logger import init_logger @@ -176,7 +176,7 @@ class VoxtralProcessorAdapter: class VoxtralProcessingInfo(BaseProcessingInfo): def get_tokenizer(self) -> MistralTokenizer: - tokenizer = cached_tokenizer_from_config(self.ctx.model_config) + tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config) if not isinstance(tokenizer, MistralTokenizer): raise ValueError("This model requires `--tokenizer-mode mistral`") @@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration( def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): super().__init__() - self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config) + self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config) # update quant config to so that ignored module and target module names # match the vLLM model names @@ -450,9 +450,11 @@ class VoxtralForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio_config = tokenizer.instruct.audio_encoder.audio_config max_audio_clip_s = audio_config.chunk_length_s sample_rate = audio_config.sampling_rate @@ -468,17 +470,17 @@ class VoxtralForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, + renderer_config: RendererConfig, # not needed here 
stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], request_prompt: str, to_language: str | None, ) -> PromptType: - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless req = TranscriptionRequest( - model=model_config.model, + model=renderer_config.model_config.model, audio=RawAudio.from_audio(audio), language=language, ) @@ -494,14 +496,14 @@ class VoxtralForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: """ Map from audio duration to number of audio tokens produced by the ASR model, without running a forward pass. This is used for estimating the amount of processing for this audio. """ - tokenizer = cached_tokenizer_from_config(model_config) + tokenizer = cached_tokenizer_from_config(renderer_config) adapter = VoxtralProcessorAdapter(tokenizer) return adapter.get_num_audio_tokens( int(audio_duration_s * stt_config.sample_rate) diff --git a/vllm/model_executor/models/whisper.py b/vllm/model_executor/models/whisper.py index b2feff1335151..6f526e3956fff 100644 --- a/vllm/model_executor/models/whisper.py +++ b/vllm/model_executor/models/whisper.py @@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids from vllm.attention.backends.abstract import AttentionType from vllm.attention.layer import Attention, MultiHeadAttention from vllm.attention.layers.cross_attention import CrossAttention -from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig +from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig from vllm.config.multimodal import BaseDummyOptions from vllm.distributed import get_tensor_model_parallel_world_size from vllm.inputs.data import PromptType @@ -811,7 +811,7 @@ class WhisperForConditionalGeneration( def get_generation_prompt( cls, audio: np.ndarray, - model_config: ModelConfig, # not needed here + renderer_config: RendererConfig, # not needed here stt_config: SpeechToTextConfig, language: str | None, task_type: Literal["transcribe", "translate"], @@ -847,9 +847,11 @@ class WhisperForConditionalGeneration( @classmethod def get_speech_to_text_config( - cls, model_config: ModelConfig, task_type: str + cls, + renderer_config: RendererConfig, + task_type: str, ) -> SpeechToTextConfig: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) return SpeechToTextConfig( max_audio_clip_s=processor.feature_extractor.chunk_length, @@ -861,9 +863,9 @@ class WhisperForConditionalGeneration( cls, audio_duration_s: float, stt_config: SpeechToTextConfig, - model_config: ModelConfig, + renderer_config: RendererConfig, ) -> int | None: - processor = cached_processor_from_config(model_config) + processor = cached_processor_from_config(renderer_config) hop_length = processor.feature_extractor.hop_length assert hop_length is not None # NOTE(NickLucche) user can't pass encoder diff --git a/vllm/multimodal/cache.py b/vllm/multimodal/cache.py index 67bdf5e1557f9..9c838fe679582 100644 --- a/vllm/multimodal/cache.py +++ b/vllm/multimodal/cache.py @@ -31,7 +31,7 @@ from .inputs import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig, VllmConfig + from vllm.config import ModelConfig, RendererConfig, VllmConfig from .processing import ResolvedPromptUpdate 
from .registry import MultiModalRegistry @@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache): def _enable_processor_cache( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ) -> bool: - if not mm_registry.supports_multimodal_inputs(model_config): + if not mm_registry.supports_multimodal_inputs(renderer_config): return False - mm_config = model_config.get_multimodal_config() + mm_config = renderer_config.model_config.get_multimodal_config() return mm_config.mm_processor_cache_gb > 0 @@ -599,7 +599,7 @@ def processor_cache_from_config( """Return a `BaseMultiModalProcessorCache`, if enabled.""" model_config = vllm_config.model_config - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -611,14 +611,14 @@ def processor_cache_from_config( def processor_only_cache_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", mm_registry: "MultiModalRegistry", ): """Return a `MultiModalProcessorOnlyCache`, if enabled.""" - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(renderer_config, mm_registry): return None - return MultiModalProcessorOnlyCache(model_config) + return MultiModalProcessorOnlyCache(renderer_config.model_config) class BaseMultiModalReceiverCache( @@ -787,7 +787,7 @@ def engine_receiver_cache_from_config( """ model_config = vllm_config.model_config - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): @@ -809,9 +809,7 @@ def worker_receiver_cache_from_config( Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and mm_processor_cache_type=="shm". """ - model_config = vllm_config.model_config - - if not _enable_processor_cache(model_config, mm_registry): + if not _enable_processor_cache(vllm_config.renderer_config, mm_registry): return None if not _enable_ipc_cache(vllm_config): diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 0390773783961..81ceb76a4b961 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -23,7 +23,7 @@ import torch from typing_extensions import TypeVar, assert_never from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike +from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config from vllm.transformers_utils.processor import cached_processor_from_config from vllm.utils.collection_utils import flatten_2d_lists, full_groupby from vllm.utils.func_utils import get_allowed_kwarg_only_overrides @@ -53,7 +53,7 @@ if TYPE_CHECKING: from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder @@ -63,6 +63,7 @@ else: ProcessorMixin = object ModelConfig = object + RendererConfig = object BaseMultiModalProcessorCache = object @@ -945,12 +946,29 @@ class InputProcessingContext: modify the inputs. 
""" - model_config: ModelConfig - """The configuration of the model.""" + renderer_config: RendererConfig + """The configuration of the renderer.""" tokenizer: TokenizerLike | None """The tokenizer used to tokenize the inputs.""" + @classmethod + def from_config( + cls, + renderer_config: RendererConfig, + *, + tokenizer: TokenizerLike | None = None, + ): + if tokenizer is None and not renderer_config.skip_tokenizer_init: + tokenizer = cached_tokenizer_from_config(renderer_config) + + return cls(renderer_config, tokenizer) + + @property + def model_config(self) -> ModelConfig: + """The configuration of the model.""" + return self.renderer_config.model_config + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( @@ -1047,7 +1065,7 @@ class InputProcessingContext: typ = ProcessorMixin return cached_processor_from_config( - self.model_config, + self.renderer_config, processor_cls=typ, tokenizer=self.tokenizer, **kwargs, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 00a84f9dec4f7..e49aaa5045c62 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions from vllm.logger import init_logger -from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config +from vllm.tokenizers import TokenizerLike from .cache import BaseMultiModalProcessorCache from .processing import ( @@ -22,7 +22,7 @@ from .profiling import ( ) if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig from vllm.model_executor.models.interfaces import SupportsMultiModal logger = init_logger(__name__) @@ -114,17 +114,18 @@ class MultiModalRegistry: return mm_options if len(mm_options) > 0 else None - def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool: + def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool: """ Checks if the model supports multimodal inputs. Returns True if the model is multimodal with any non-zero supported modalities, otherwise returns False, effectively running in text-only mode. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return False - info = self._create_processing_info(model_config, tokenizer=None) + info = self._create_processing_info(renderer_config, tokenizer=None) supported_modalities = info.get_supported_mm_limits() mm_config = model_config.get_multimodal_config() @@ -144,7 +145,7 @@ class MultiModalRegistry: def get_max_tokens_per_item_by_modality( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, profiler_limits: Mapping[str, int] | None = None, @@ -153,10 +154,11 @@ class MultiModalRegistry: Get the maximum number of tokens per data item from each modality based on underlying model configuration. 
""" + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len @@ -171,7 +173,7 @@ class MultiModalRegistry: def get_mm_limits_per_prompt( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, cache: BaseMultiModalProcessorCache | None = None, ) -> Mapping[str, int]: @@ -179,10 +181,11 @@ class MultiModalRegistry: Get the maximum number of multi-modal input instances for each modality that are allowed per prompt for a model class. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -228,30 +231,21 @@ class MultiModalRegistry: assert hasattr(model_cls, "_processor_factory") return cast("SupportsMultiModal", model_cls) - def _create_processing_ctx( - self, - model_config: "ModelConfig", - tokenizer: TokenizerLike | None = None, - ) -> InputProcessingContext: - if tokenizer is None and not model_config.skip_tokenizer_init: - tokenizer = cached_tokenizer_from_config(model_config) - - return InputProcessingContext(model_config, tokenizer) - def _create_processing_info( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: - model_cls = self._get_model_cls(model_config) + model_cls = self._get_model_cls(renderer_config.model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) + + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.info(ctx) def create_processor( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", *, tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, @@ -259,19 +253,19 @@ class MultiModalRegistry: """ Create a multi-modal processor for a specific model and tokenizer. """ + model_config = renderer_config.model_config if not model_config.is_multimodal_model: raise ValueError(f"{model_config.model} is not a multimodal model") model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) - + ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer) return factories.build_processor(ctx, cache=cache) def get_decoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -280,15 +274,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. 
- mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options) @@ -304,7 +298,7 @@ class MultiModalRegistry: def get_encoder_dummy_data( self, - model_config: "ModelConfig", + renderer_config: "RendererConfig", seq_len: int, mm_counts: Mapping[str, int] | None = None, *, @@ -313,15 +307,15 @@ class MultiModalRegistry: """ Create dummy data for profiling the memory usage of a model. - The model is identified by `model_config`. + The model is identified by `renderer_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor(renderer_config, cache=cache) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. # Only include modalities that use advanced option types so legacy # count-only behavior remains unchanged. - mm_options = self._extract_mm_options(model_config) + mm_options = self._extract_mm_options(renderer_config.model_config) dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options) @@ -336,13 +330,15 @@ class MultiModalRegistry: return dummy_data - def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: + def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int: """ Get the maximum length of the encoder input for encoder-decoder models. """ + model_config = renderer_config.model_config if not model_config.is_encoder_decoder: return 0 - max_tokens = self.get_max_tokens_per_item_by_modality(model_config) + + max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config) if not max_tokens: # TODO - this function assumes encoder-decoder models are # multimodal. 
This will need to change when adding support for more diff --git a/vllm/tokenizers/registry.py b/vllm/tokenizers/registry.py index 1d44feeee500f..c9575511af8c9 100644 --- a/vllm/tokenizers/registry.py +++ b/vllm/tokenizers/registry.py @@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname from .protocol import TokenizerLike if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import RendererConfig logger = init_logger(__name__) @@ -205,18 +205,18 @@ def get_tokenizer( cached_get_tokenizer = lru_cache(get_tokenizer) -def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs): +def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs): return cached_get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - revision=model_config.tokenizer_revision, - trust_remote_code=model_config.trust_remote_code, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + revision=renderer_config.tokenizer_revision, + trust_remote_code=renderer_config.trust_remote_code, **kwargs, ) -def init_tokenizer_from_config(model_config: "ModelConfig"): - runner_type = model_config.runner_type +def init_tokenizer_from_config(renderer_config: "RendererConfig"): + runner_type = renderer_config.model_config.runner_type if runner_type == "generate" or runner_type == "draft": truncation_side = "left" elif runner_type == "pooling": @@ -225,9 +225,9 @@ def init_tokenizer_from_config(model_config: "ModelConfig"): assert_never(runner_type) return get_tokenizer( - model_config.tokenizer, - tokenizer_mode=model_config.tokenizer_mode, - trust_remote_code=model_config.trust_remote_code, - revision=model_config.tokenizer_revision, + renderer_config.tokenizer, + tokenizer_mode=renderer_config.tokenizer_mode, + trust_remote_code=renderer_config.trust_remote_code, + revision=renderer_config.tokenizer_revision, truncation_side=truncation_side, ) diff --git a/vllm/transformers_utils/processor.py b/vllm/transformers_utils/processor.py index e9864b0c1531d..bdebd2686bae2 100644 --- a/vllm/transformers_utils/processor.py +++ b/vllm/transformers_utils/processor.py @@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path from vllm.utils.func_utils import get_allowed_kwarg_only_overrides if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, RendererConfig _P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin) _V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor) @@ -233,17 +233,18 @@ def cached_get_processor_without_dynamic_kwargs( def cached_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin, **kwargs: Any, ) -> _P: + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load processor." 
) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -297,9 +298,11 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor) def cached_feature_extractor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_feature_extractor( model_config.model, revision=model_config.revision, @@ -348,16 +351,17 @@ cached_get_image_processor = lru_cache(get_image_processor) def cached_image_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", **kwargs: Any, ): + model_config = renderer_config.model_config if is_gguf(model_config.model): - assert not is_gguf(model_config.tokenizer), ( + assert not is_gguf(renderer_config.tokenizer), ( "For multimodal GGUF models, the original tokenizer " "should be used to correctly load image processor." ) - model = model_config.tokenizer - revision = model_config.tokenizer_revision + model = renderer_config.tokenizer + revision = renderer_config.tokenizer_revision else: model = model_config.model revision = model_config.revision @@ -411,10 +415,12 @@ cached_get_video_processor = lru_cache(get_video_processor) def cached_video_processor_from_config( - model_config: "ModelConfig", + renderer_config: "RendererConfig", processor_cls: type[_V] | None = None, **kwargs: Any, ): + model_config = renderer_config.model_config + return cached_get_video_processor( model_config.model, revision=model_config.revision, diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py index 3959e9a59a53b..21315b85f22aa 100644 --- a/vllm/v1/core/encoder_cache_manager.py +++ b/vllm/v1/core/encoder_cache_manager.py @@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry from vllm.v1.request import Request if TYPE_CHECKING: - from vllm.config import ModelConfig, SchedulerConfig + from vllm.config import RendererConfig, SchedulerConfig logger = init_logger(__name__) @@ -250,7 +250,7 @@ class EncoderCacheManager: def compute_encoder_budget( - model_config: "ModelConfig", + renderer_config: "RendererConfig", scheduler_config: "SchedulerConfig", mm_registry: MultiModalRegistry, ) -> tuple[int, int]: @@ -263,9 +263,9 @@ def compute_encoder_budget( - Space budget for encoder cache size, measured in number of tokens from the input sequence. """ - if mm_registry.supports_multimodal_inputs(model_config): + if mm_registry.supports_multimodal_inputs(renderer_config): max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config + renderer_config ) return compute_mm_encoder_budget( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 0a8efa2fd512f..96073efc5f56a 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface): # This can be changed when we make encoder cache for embedding caching # across requests. 
encoder_compute_budget, encoder_cache_size = compute_encoder_budget( - model_config=vllm_config.model_config, + renderer_config=vllm_config.renderer_config, scheduler_config=vllm_config.scheduler_config, mm_registry=mm_registry, ) diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index fd7e04dc02082..b76f9c0595d61 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -91,6 +91,7 @@ class AsyncLLM(EngineClient): # Ensure we can serialize custom transformer configs maybe_register_config_serialize_by_value() + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config @@ -108,15 +109,15 @@ class AsyncLLM(EngineClient): "enabling logging without default stat loggers." ) - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + self.renderer_config.io_processor_plugin, ) # OutputProcessor (converts EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index e6a94f4e3de5d..a2f6ba5be8c17 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -43,6 +43,7 @@ class InputProcessor: mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, ) -> None: self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -54,7 +55,7 @@ class InputProcessor: self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry) self.input_preprocessor = InputPreprocessor( - self.model_config, + self.renderer_config, tokenizer, mm_registry, mm_processor_cache=self.mm_processor_cache, @@ -252,7 +253,7 @@ class InputProcessor: if not params.structured_outputs or not self.structured_outputs_config: return - if self.model_config.skip_tokenizer_init and params.structured_outputs: + if self.renderer_config.skip_tokenizer_init and params.structured_outputs: raise ValueError( "Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501 ) @@ -582,7 +583,7 @@ class InputProcessor: if prompt_type == "encoder" and model_config.is_multimodal_model: mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( - model_config, + self.renderer_config, tokenizer=tokenizer, ) assert isinstance(mm_processor, EncDecMultiModalProcessor) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 4c31291005477..ba0e1cf25cb08 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -60,6 +60,7 @@ class LLMEngine: ) -> None: self.vllm_config = vllm_config self.observability_config = vllm_config.observability_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config @@ -83,15 +84,15 @@ class LLMEngine: self.dp_group = None self.should_execute_dummy_batch = False - if self.model_config.skip_tokenizer_init: + if self.renderer_config.skip_tokenizer_init: tokenizer = 
None else: - tokenizer = init_tokenizer_from_config(self.model_config) + tokenizer = init_tokenizer_from_config(self.renderer_config) self.input_processor = InputProcessor(self.vllm_config, tokenizer) self.io_processor = get_io_processor( self.vllm_config, - self.model_config.io_processor_plugin, + self.renderer_config.io_processor_plugin, ) # OutputProcessor (convert EngineCoreOutputs --> RequestOutput). diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 31428db2d3afc..7976418510aab 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -85,7 +85,7 @@ class EagleProposer: # Multi-modal data support self.mm_registry = MULTIMODAL_REGISTRY self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - vllm_config.model_config + vllm_config.renderer_config ) self.attn_metadata_builder: AttentionMetadataBuilder | None = None diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 5ee88178cdf60..36aa3d9bb3f94 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -63,7 +63,7 @@ class StructuredOutputManager: max_workers = max(1, min(multiprocessing.cpu_count() // 2, 8)) self.executor_for_fillmask = ThreadPoolExecutor(max_workers=max_workers) - if not self.vllm_config.model_config.skip_tokenizer_init: + if not vllm_config.renderer_config.skip_tokenizer_init: # The default max_workers if not specified is the number of # CPUs * 5, which is way too high since these tasks are CPU-bound, # not I/O bound. We also know we would never dominate CPU usage @@ -71,21 +71,15 @@ class StructuredOutputManager: # of CPUs. max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2) self.executor = ThreadPoolExecutor(max_workers=max_workers) - self.tokenizer = init_tokenizer_from_config( - model_config=self.vllm_config.model_config - ) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + self.tokenizer = init_tokenizer_from_config(vllm_config.renderer_config) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser reasoning_parser_plugin = ( - self.vllm_config.structured_outputs_config.reasoning_parser_plugin + vllm_config.structured_outputs_config.reasoning_parser_plugin ) if reasoning_parser_plugin and len(reasoning_parser_plugin) > 3: ReasoningParserManager.import_reasoning_parser(reasoning_parser_plugin) - reasoning_parser = ( - self.vllm_config.structured_outputs_config.reasoning_parser - ) + reasoning_parser = vllm_config.structured_outputs_config.reasoning_parser if reasoning_parser: reasoner_cls = ReasoningParserManager.get_reasoning_parser( reasoning_parser @@ -93,7 +87,7 @@ class StructuredOutputManager: self.reasoner = reasoner_cls(tokenizer=self.tokenizer) self.enable_in_reasoning = ( - self.vllm_config.structured_outputs_config.enable_in_reasoning + vllm_config.structured_outputs_config.enable_in_reasoning ) def grammar_init(self, request: Request) -> None: diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index a50360ab08694..b3c8d4da22b63 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -271,6 +271,7 @@ class GPUModelRunner( device: torch.device, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.compilation_config = vllm_config.compilation_config @@ -335,7 +336,7 @@ class GPUModelRunner( 
self.uses_mrope = model_config.uses_mrope self.uses_xdrope_dim = model_config.uses_xdrope_dim self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.renderer_config ) if self.model_config.is_encoder_decoder: @@ -558,7 +559,7 @@ class GPUModelRunner( self.mm_budget = ( MultiModalBudget( - self.model_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -3873,7 +3874,7 @@ class GPUModelRunner( assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/tpu_model_runner.py b/vllm/v1/worker/tpu_model_runner.py index 283f21b779e38..7e2a6af68e6af 100644 --- a/vllm/v1/worker/tpu_model_runner.py +++ b/vllm/v1/worker/tpu_model_runner.py @@ -143,6 +143,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): original_parallel_config: ParallelConfig | None = None, ): self.vllm_config = vllm_config + self.renderer_config = vllm_config.renderer_config self.model_config = vllm_config.model_config self.cache_config = vllm_config.cache_config self.lora_config = vllm_config.lora_config @@ -222,7 +223,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_registry = MULTIMODAL_REGISTRY self.uses_mrope = model_config.uses_mrope self.supports_mm_inputs = self.mm_registry.supports_multimodal_inputs( - model_config + self.renderer_config ) # TODO: Support M-RoPE (e.g, Qwen2-VL) assert not self.uses_mrope, "TPU does not support M-RoPE yet." @@ -353,7 +354,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): self.mm_budget = ( MultiModalBudget( - self.model_config, + self.renderer_config, self.scheduler_config, self.mm_registry, ) @@ -2038,7 +2039,7 @@ class TPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin): assert self.mm_budget is not None dummy_decoder_data = self.mm_registry.get_decoder_dummy_data( - model_config=self.model_config, + renderer_config=self.renderer_config, seq_len=self.max_model_len, mm_counts={modality: 1}, cache=self.mm_budget.cache, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 0b0e2006d73d2..44418b9985e2e 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -7,7 +7,7 @@ import torch from vllm.attention.backends.abstract import AttentionBackend from vllm.attention.layer import Attention -from vllm.config import ModelConfig, SchedulerConfig, VllmConfig +from vllm.config import RendererConfig, SchedulerConfig, VllmConfig from vllm.model_executor.models.interfaces import MultiModalEmbeddings from vllm.model_executor.models.utils import extract_layer_index from vllm.multimodal.cache import processor_only_cache_from_config @@ -23,24 +23,29 @@ class MultiModalBudget: def __init__( self, - model_config: ModelConfig, + renderer_config: RendererConfig, scheduler_config: SchedulerConfig, mm_registry: MultiModalRegistry, ) -> None: super().__init__() - self.model_config = model_config + self.renderer_config = renderer_config + self.model_config = renderer_config.model_config self.scheduler_config = scheduler_config self.mm_registry = mm_registry - self.cache = cache = processor_only_cache_from_config(model_config, mm_registry) + self.cache = cache = processor_only_cache_from_config( + renderer_config, mm_registry + ) - self.max_model_len = model_config.max_model_len + self.max_model_len = 
self.model_config.max_model_len self.max_num_reqs = scheduler_config.max_num_seqs - self.mm_limits = mm_registry.get_mm_limits_per_prompt(model_config, cache=cache) + self.mm_limits = mm_registry.get_mm_limits_per_prompt( + renderer_config, cache=cache + ) max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality( - model_config, + renderer_config, cache=cache, profiler_limits=self.mm_limits, )
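
For reference, a minimal usage sketch of the renderer-centric entry points introduced above. This is not part of the patch: the signatures used (create_processor, cached_tokenizer_from_config, skip_tokenizer_init, renderer_config.model_config, is_multimodal_model) are taken from the hunks in this change, while the import of MULTIMODAL_REGISTRY from vllm.multimodal and the helper name build_mm_processor are assumptions for illustration only.

    from vllm.multimodal import MULTIMODAL_REGISTRY          # assumed import path
    from vllm.tokenizers import cached_tokenizer_from_config

    def build_mm_processor(vllm_config):
        # Call sites now start from the renderer-level config ...
        renderer_config = vllm_config.renderer_config

        # ... and reach the model config through it when they still need it.
        if not renderer_config.model_config.is_multimodal_model:
            return None  # text-only model: nothing to build

        tokenizer = None
        if not renderer_config.skip_tokenizer_init:
            # Tokenizer settings (tokenizer, tokenizer_mode, tokenizer_revision,
            # trust_remote_code) now live on RendererConfig.
            tokenizer = cached_tokenizer_from_config(renderer_config)

        # Multimodal registry entry points take RendererConfig instead of ModelConfig.
        return MULTIMODAL_REGISTRY.create_processor(
            renderer_config,
            tokenizer=tokenizer,
        )

The same substitution applies to the other registry helpers touched here (get_mm_limits_per_prompt, get_max_tokens_per_item_by_modality, get_decoder_dummy_data, get_encoder_dummy_data): callers pass vllm_config.renderer_config, and each helper dereferences renderer_config.model_config internally.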