Revert "[Renderer] Separate out RendererConfig from ModelConfig (#30145)" (#30199)

Cyrus Leung 2025-12-07 16:00:22 +08:00 committed by GitHub
parent 27f4c2fd46
commit e83b7e379c
105 changed files with 797 additions and 969 deletions
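
At a high level, the revert folds the tokenizer and rendering settings back into `ModelConfig` and removes the standalone `RendererConfig`, so `VllmConfig` and the various helpers once again take a `ModelConfig` directly. A minimal before/after sketch based on the call sites changed below; the model id is illustrative:

```python
from vllm.config import ModelConfig, VllmConfig

# Before this revert, tokenizer settings lived on a separate RendererConfig:
#     renderer_config = RendererConfig(model_config=model_config, tokenizer=...)
#     vllm_config = VllmConfig(model_config=model_config,
#                              renderer_config=renderer_config)

# After this revert, they live on ModelConfig again, and VllmConfig only
# needs the model config.
model_config = ModelConfig(
    "facebook/opt-125m",            # illustrative model id
    tokenizer="facebook/opt-125m",
    tokenizer_mode="auto",
    trust_remote_code=False,
)
vllm_config = VllmConfig(model_config=model_config)
```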

View File

@ -22,7 +22,7 @@ Declare supported languages and capabilities:
import torch
from torch import nn
from vllm.config import RendererConfig, SpeechToTextConfig
from vllm.config import ModelConfig, SpeechToTextConfig
from vllm.inputs.data import PromptType
from vllm.model_executor.models.interfaces import SupportsTranscription
@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
model_config: ModelConfig,
task_type: Literal["transcribe", "translate"],
) -> SpeechToTextConfig:
return SpeechToTextConfig(
@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> int | None:
# Return None if unknown; otherwise return an estimate.
return int(audio_duration_s * stt_config.sample_rate // 320) # example
@ -216,7 +216,7 @@ Relevant server logic:
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
renderer_config=self.renderer_config,
model_config=self.model_config,
language=language,
task_type=self.task_type,
request_prompt=request.prompt,
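
For orientation, a condensed sketch of the transcription hooks after the revert, mirroring the signatures in the documentation hunks above. The class name and method bodies are placeholders rather than the real Whisper implementation, and the name of the duration-to-token hook is taken from the interface rather than from the hunk:

```python
from typing import Literal

import numpy as np

from vllm.config import ModelConfig, SpeechToTextConfig
from vllm.inputs.data import PromptType
from vllm.model_executor.models.interfaces import SupportsTranscription


class MyASRModel(SupportsTranscription):  # placeholder model class
    @classmethod
    def get_speech_to_text_config(
        cls,
        model_config: ModelConfig,
        task_type: Literal["transcribe", "translate"],
    ) -> SpeechToTextConfig:
        # Placeholder: a real model derives these values from model_config.
        return SpeechToTextConfig(sample_rate=16_000)

    @classmethod
    def get_generation_prompt(
        cls,
        audio: np.ndarray,
        stt_config: SpeechToTextConfig,
        model_config: ModelConfig,
        language: str | None,
        task_type: Literal["transcribe", "translate"],
        request_prompt: str,
    ) -> PromptType:
        # Placeholder: hand the audio to the model as multi-modal data.
        return {"prompt": request_prompt, "multi_modal_data": {"audio": audio}}

    @classmethod
    def get_num_audio_tokens(  # method name assumed; the hunk above only shows its parameters
        cls,
        audio_duration_s: float,
        stt_config: SpeechToTextConfig,
        model_config: ModelConfig,
    ) -> int | None:
        # Return None if unknown; otherwise return an estimate.
        return int(audio_duration_s * stt_config.sample_rate // 320)
```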

View File

@ -17,7 +17,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
get_current_vllm_config,
set_current_vllm_config,
@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model(
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
device_config=device_config,
compilation_config=compilation_config,
)

View File

@ -15,7 +15,6 @@ from vllm.config import (
CompilationConfig,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@ -220,11 +219,8 @@ def test_fix_functionalization(
torch.set_default_device("cuda")
torch.set_default_dtype(dtype)
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
custom_ops=["all"],
pass_config=PassConfig(

View File

@ -15,7 +15,6 @@ from vllm.config import (
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
)
from vllm.model_executor.layers.layernorm import RMSNorm
@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant(
custom_ops.append("+rms_norm")
if enable_quant_fp8_custom_op:
custom_ops.append("+quant_fp8")
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops,

View File

@ -24,7 +24,6 @@ from vllm.config import (
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
@ -326,7 +325,6 @@ def test_attention_quant_pattern(
)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=SchedulerConfig(
max_num_seqs=1024,
max_model_len=model_config.max_model_len,

View File

@ -7,7 +7,7 @@ import torch
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
from vllm.compilation.pass_manager import PostGradPassManager
from vllm.config import ModelConfig, RendererConfig, VllmConfig
from vllm.config import ModelConfig, VllmConfig
# dummy custom pass that doesn't inherit
@ -43,11 +43,7 @@ class ProperPass(InductorPass):
)
def test_pass_manager_uuid(callable):
# Some passes need dtype to be set
model_config = ModelConfig(dtype=torch.bfloat16)
config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
pass_manager = PostGradPassManager()
pass_manager.configure(config)

View File

@ -19,7 +19,6 @@ from vllm.config import (
CompilationMode,
ModelConfig,
PassConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion(
if enable_rope_custom_op:
custom_ops.append("+rotary_embedding")
model_config = ModelConfig(dtype=dtype)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
model_config=ModelConfig(dtype=dtype),
compilation_config=CompilationConfig(
mode=CompilationMode.VLLM_COMPILE,
custom_ops=custom_ops,

View File

@ -5,7 +5,6 @@ from vllm.config import (
DeviceConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
VllmConfig,
set_current_vllm_config,
)
@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):
@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
vllm_config = VllmConfig(
device_config=DeviceConfig("cpu"),
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
kv_transfer_config=kv_transfer_config,
)
with set_current_vllm_config(vllm_config):

View File

@ -3,6 +3,7 @@
import pytest
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.tokenizers import get_tokenizer
@ -106,11 +107,24 @@ def test_get_gen_prompt(
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model)
model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
trust_remote_code=model_info.trust_remote_code,
revision=model_info.revision,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Initialize the tokenizer
tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
tokenizer_name=model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
template_content = load_chat_template(chat_template=template)
@ -129,7 +143,7 @@ def test_get_gen_prompt(
tokenizer=tokenizer,
conversation=mock_request.messages,
chat_template=mock_request.chat_template or template_content,
renderer_config=renderer_config,
model_config=model_config,
tools=None,
add_generation_prompt=mock_request.add_generation_prompt,
continue_final_message=mock_request.continue_final_message,
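
Outside the test parametrization, the resulting call shape looks roughly like the sketch below: `get_tokenizer` and `apply_hf_chat_template` both read tokenizer settings from `ModelConfig` after the revert. The model id and message are illustrative, and a model that ships a chat template is assumed:

```python
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
from vllm.tokenizers import get_tokenizer

model_config = ModelConfig("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative chat model

tokenizer = get_tokenizer(
    tokenizer_name=model_config.tokenizer,
    trust_remote_code=model_config.trust_remote_code,
)
# With chat_template=None the tokenizer's own template is used downstream.
template_content = load_chat_template(chat_template=None)

prompt = apply_hf_chat_template(
    tokenizer=tokenizer,
    conversation=[{"role": "user", "content": "Hello!"}],
    chat_template=template_content,
    model_config=model_config,
    tools=None,
    add_generation_prompt=True,
    continue_final_message=False,
)
```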

View File

@ -33,34 +33,26 @@ class MockModelConfig:
"""Minimal mock ModelConfig for testing."""
model: str = MODEL_NAME
tokenizer: str = MODEL_NAME
trust_remote_code: bool = False
tokenizer_mode: str = "auto"
max_model_len: int = 100
tokenizer_revision: str | None = None
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
logits_processors: list[str] | None = None
logits_processor_pattern: str | None = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
skip_tokenizer_init: bool = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
@dataclass
class MockRendererConfig:
"""Minimal mock RendererConfig for testing."""
model_config: MockModelConfig
tokenizer: str = MODEL_NAME
tokenizer_mode: str = "auto"
tokenizer_revision: str | None = None
skip_tokenizer_init: bool = False
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
class MockLoRAResolver(LoRAResolver):
async def resolve_lora(
self, base_model_name: str, lora_name: str
@ -122,7 +114,6 @@ def mock_serving_setup():
mock_engine.add_lora.reset_mock()
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

View File

@ -346,33 +346,27 @@ class MockHFConfig:
class MockModelConfig:
task = "generate"
runner_type = "generate"
tokenizer = MODEL_NAME
trust_remote_code = False
tokenizer_mode = "auto"
max_model_len = 100
tokenizer_revision = None
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()
logits_processors: list[str] | None = None
logits_processor_pattern = None
diff_sampling_param: dict | None = None
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
encoder_config = None
generation_config: str = "auto"
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
skip_tokenizer_init = False
def get_diff_sampling_param(self):
return self.diff_sampling_param or {}
@dataclass
class MockRendererConfig:
model_config: MockModelConfig = field(default_factory=MockModelConfig)
tokenizer = MODEL_NAME
tokenizer_mode = "auto"
tokenizer_revision = None
skip_tokenizer_init = False
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
allowed_local_media_path: str = ""
allowed_media_domains: list[str] | None = None
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
models = OpenAIServingModels(
engine_client=engine,
@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
@dataclass
class MockEngine:
model_config: MockModelConfig = field(default_factory=MockModelConfig)
renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
input_processor: MagicMock = field(default_factory=MagicMock)
io_processor: MagicMock = field(default_factory=MagicMock)
@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = mock_model_config
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()
@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
mock_engine.errored = False
mock_engine.model_config = MockModelConfig()
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
mock_engine.input_processor = MagicMock()
mock_engine.io_processor = MagicMock()

View File

@ -7,7 +7,7 @@ from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.entrypoints.openai.serving_engine import OpenAIServing
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
from vllm.tokenizers import MistralTokenizer
@ -19,16 +19,10 @@ def serving() -> OpenAIServing:
# Create minimal mocks
engine_client = Mock()
model_config = Mock(spec=ModelConfig)
model_config.max_model_len = 32768
renderer_config = Mock(spec=RendererConfig)
renderer_config.model_config = model_config
models = Mock(spec=OpenAIServingModels)
models.model_config = model_config
models.renderer_config = renderer_config
models.input_processor = Mock()
models.io_processor = Mock()

View File

@ -6,7 +6,7 @@ from unittest.mock import MagicMock
import pytest
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.openai.protocol import (
ErrorResponse,
@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
async def _async_serving_models_init() -> OpenAIServingModels:
mock_engine_client = MagicMock(spec=EngineClient)
# Set the max_model_len attribute to avoid missing attribute
mock_model_config = MagicMock(spec=ModelConfig)
mock_model_config.max_model_len = 2048
mock_renderer_config = MagicMock(spec=RendererConfig)
mock_renderer_config.model_config = mock_model_config
mock_engine_client.model_config = mock_model_config
mock_engine_client.renderer_config = mock_renderer_config
mock_engine_client.input_processor = MagicMock()
mock_engine_client.io_processor = MagicMock()

View File

@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
_try_extract_ast,
apply_mistral_chat_template,
@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
"content": [{"type": "text", "text": "Who are you?"}],
},
],
RendererConfig(model_config=mistral_model_config),
mistral_model_config,
content_format="string",
)
assert conversation == [
@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
"content": [{"type": "text", "text": "Who are you?"}],
},
],
RendererConfig(model_config=mistral_model_config),
mistral_model_config,
content_format="openai",
)
assert conversation == [
@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
],
}
],
RendererConfig(model_config=phi3v_model_config_image_embeds),
phi3v_model_config_image_embeds,
content_format="string",
)
@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
],
}
],
RendererConfig(model_config=audio_embeds_model_config),
audio_embeds_model_config,
content_format="string",
)
@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
],
}
],
RendererConfig(model_config=audio_embeds_model_config),
audio_embeds_model_config,
content_format="string",
)
@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
],
}
],
RendererConfig(model_config=audio_embeds_model_config),
audio_embeds_model_config,
content_format="string",
)
@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
],
}
],
RendererConfig(model_config=phi3v_model_config_image_embeds),
phi3v_model_config_image_embeds,
content_format="string",
)
@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
assert conversation == [
@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
],
},
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
],
},
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
{"role": "assistant", "content": "Some stuff."},
{"role": "user", "content": "What about this one?"},
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="openai",
)
@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
],
},
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
],
}
],
RendererConfig(model_config=phi3v_model_config),
phi3v_model_config,
content_format="string",
)
@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
],
}
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
],
}
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
],
}
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
],
},
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
],
},
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
],
},
],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
qwen25omni_model_config_mm_interleaved,
content_format="string",
)
@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
],
},
],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
qwen25omni_model_config_mm_interleaved,
content_format="string",
)
@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
],
},
],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
qwen25omni_model_config_mm_interleaved,
content_format="string",
)
@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
],
},
],
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
qwen25omni_model_config_mm_interleaved,
content_format="string",
)
@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
],
}
],
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
phi3v_model_config_mm_interleaved,
content_format="string",
)
@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model)
model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Build the tokenizer
tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
model,
trust_remote_code=model_config.trust_remote_code,
)
tools = (
@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
tokenizer,
chat_template=None,
tools=tools,
model_config=renderer_config.model_config,
model_config=model_config,
)
assert isinstance(chat_template, str)
@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
"enable_thinking": True,
}
renderer_config = model_info.build_renderer_config(model)
model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
# Build the tokenizer
tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
model,
trust_remote_code=model_config.trust_remote_code,
)
# Test detecting the tokenizer's chat_template
@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
tokenizer,
chat_template=None,
tools=tools,
model_config=renderer_config.model_config,
model_config=model_config,
)
with pytest.raises(
ValueError, match="Found unexpected chat template kwargs from request"
@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model)
model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
model,
trust_remote_code=model_config.trust_remote_code,
)
# Test detecting the tokenizer's chat_template
@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
tokenizer,
chat_template=None,
tools=None,
model_config=renderer_config.model_config,
model_config=model_config,
)
assert isinstance(chat_template, str)
@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
None,
"auto",
tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
assert resolved_format == expected_format
@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format):
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
renderer_config = model_info.build_renderer_config(model)
model_config = ModelConfig(
model,
tokenizer=model_info.tokenizer or model,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
model_config.tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
# Test detecting the tokenizer's chat_template
@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
tokenizer,
chat_template=None,
tools=None,
model_config=renderer_config.model_config,
model_config=model_config,
)
assert isinstance(chat_template, str)
@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
None,
"auto",
tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
assert resolved_format == expected_format
@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format):
],
)
def test_resolve_content_format_examples(template_path, expected_format):
model = PHI3V_MODEL_ID # Dummy
model_config = ModelConfig(model, trust_remote_code=True)
renderer_config = RendererConfig(model_config=model_config, tokenizer=model)
model_config = ModelConfig(
PHI3V_MODEL_ID, # Dummy
tokenizer=PHI3V_MODEL_ID, # Dummy
trust_remote_code=True,
)
dummy_tokenizer = get_tokenizer(
renderer_config.tokenizer,
trust_remote_code=renderer_config.trust_remote_code,
PHI3V_MODEL_ID, # Dummy
trust_remote_code=model_config.trust_remote_code,
)
dummy_tokenizer.chat_template = None
@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
None,
"auto",
dummy_tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
assert resolved_format == expected_format
@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
conversation_with_thinking, _, _ = parse_chat_messages(
messages,
RendererConfig(model_config=mistral_model_config),
mistral_model_config,
content_format="openai",
)
@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
],
}
],
RendererConfig(model_config=qwen2_audio_model_config),
qwen2_audio_model_config,
content_format="string",
)
@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
],
}
],
RendererConfig(model_config=qwen2_audio_model_config),
qwen2_audio_model_config,
content_format="string",
)
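
All of the hunks in this file make the same substitution: `parse_chat_messages` now receives the `ModelConfig` directly as its second argument. A minimal sketch of the call with an illustrative model and message; the names of the second and third return values are illustrative as well:

```python
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import parse_chat_messages

phi3v_model_config = ModelConfig(
    "microsoft/Phi-3.5-vision-instruct",  # illustrative multimodal model
    trust_remote_code=True,
)

conversation, mm_data, mm_uuids = parse_chat_messages(
    [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                {"type": "text", "text": "What's in this image?"},
            ],
        }
    ],
    phi3v_model_config,      # ModelConfig, no RendererConfig wrapper
    content_format="string",
)
```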

View File

@ -8,7 +8,7 @@ import torch
from safetensors.torch import load_file
from torch import nn
from vllm.config import ModelConfig, RendererConfig, VllmConfig
from vllm.config import ModelConfig, VllmConfig
from vllm.config.lora import LoRAConfig
from vllm.lora.layers import (
ColumnParallelLinearWithLoRA,
@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
)
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
lora_config=lora_config,
)
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2
@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
)
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
lora_config=lora_config,
)
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
vllm_config.scheduler_config.max_num_seqs = 4
vllm_config.scheduler_config.max_num_batched_tokens = 2

View File

@ -11,7 +11,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files):
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
load_config=LoadConfig(
download_dir=None,
load_format="dummy",

View File

@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n"
)
llm_engine = vllm_model.llm.llm_engine
model_config = llm_engine.model_config
renderer_config = llm_engine.renderer_config
tokenizer = llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
assert model_config.pooler_config.normalize
# asserts on the tokenizer loaded
assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5"
assert tokenizer.model_max_length == 512
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
assert model_tokenizer.model_max_length == 512
def check_model(model):
assert isinstance(model, BertEmbeddingModel)
@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n"
)
llm_engine = vllm_model.llm.llm_engine
model_config = llm_engine.model_config
renderer_config = llm_engine.renderer_config
tokenizer = llm_engine.tokenizer
model_config = vllm_model.llm.llm_engine.model_config
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
# asserts on the bert model config file
assert model_config.encoder_config["max_seq_length"] == 512
@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
assert model_config.pooler_config.normalize
# asserts on the tokenizer loaded
assert renderer_config.tokenizer == "intfloat/multilingual-e5-base"
assert tokenizer.model_max_length == 512
assert model_config.tokenizer == "intfloat/multilingual-e5-base"
assert model_tokenizer.model_max_length == 512
def check_model(model):
assert isinstance(model, RobertaEmbeddingModel)
@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
"Write a short story about a robot that dreams for the first time.\n"
)
assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name
assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
def check_model(model):
assert isinstance(model, RobertaEmbeddingModel)
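
After the revert, the engine exposes the tokenizer name through `model_config.tokenizer` again, which is what these assertions check. A rough usage sketch; the fixture arguments the tests pass (dtype, monkeypatched env, and so on) are omitted, and loading the model is assumed to succeed in the current environment:

```python
from vllm import LLM

llm = LLM(model="BAAI/bge-base-en-v1.5")  # embedding model used in the test above
engine = llm.llm_engine

assert engine.model_config.tokenizer == "BAAI/bge-base-en-v1.5"
assert engine.tokenizer.model_max_length == 512
```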

View File

@ -6,7 +6,7 @@ import pytest
from scipy.spatial.distance import cosine
from vllm import LLM, SamplingParams
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from ....utils import RemoteOpenAIServer
@ -31,8 +31,7 @@ def test_find_array():
dtype="bfloat16",
seed=0,
)
renderer_config = RendererConfig(model_config=model_config)
pooling = GritLMMeanPool(renderer_config=renderer_config)
pooling = GritLMMeanPool(model_config=model_config)
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

View File

@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
from vllm.tokenizers import (
MistralTokenizer,
TokenizerLike,
cached_tokenizer_from_config,
)
from ....multimodal.utils import random_audio, random_image, random_video
@ -211,20 +212,31 @@ def _test_processing_correctness(
else:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
model_id = model_id_or_arch
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
renderer_config = model_info.build_renderer_config(
model=model_id,
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
# Ensure that the cache can fit all of the data
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
model_config = renderer_config.model_config
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = model_cls._processor_factory
ctx = InputProcessingContext.from_config(renderer_config)
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
cache = MultiModalProcessorOnlyCache(model_config)
processing_info = factories.info(ctx)
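
The pattern that replaces `InputProcessingContext.from_config(renderer_config)` throughout the multimodal tests is shown in the hunk above; a standalone sketch with an illustrative model id:

```python
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config

model_config = ModelConfig("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

# Build the processing context directly from the ModelConfig plus its tokenizer.
ctx = InputProcessingContext(
    model_config,
    tokenizer=cached_tokenizer_from_config(model_config),
)

# The multimodal processor is likewise created from the ModelConfig itself.
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
```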

View File

@ -40,7 +40,7 @@ def test_processor_override(
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {"fps": fps}
@ -79,7 +79,7 @@ def test_video_loader_consistency(
mm_processor_kwargs=None,
limit_mm_per_prompt={"video": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {"fps": fps}
# Build the image str / prompt based on the number of images we pass

View File

@ -162,7 +162,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1

View File

@ -38,7 +38,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass

View File

@ -116,7 +116,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1

View File

@ -30,7 +30,7 @@ def test_processor_override(
limit_mm_per_prompt={"image": num_imgs},
mm_processor_cache_gb=mm_processor_cache_gb,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
config = processor.info.get_hf_config()
tokenizer = processor.info.get_tokenizer()
hf_processor = processor.info.get_hf_processor()

View File

@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()

View File

@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": 1},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
info = processor.info
seen_aspect_ratios = set[float]()
@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),
@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
seen_aspect_ratios = set[float]()
image_sizes = list[ImageSize]()

View File

@ -24,7 +24,7 @@ def test_processor_override(
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
prompt = "<image>" * num_imgs
image = Image.new("RGB", size=(364, 364))
mm_data = {"image": [image] * num_imgs}
@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
mm_processor_kwargs=None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [
(171, 152),

View File

@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
limit_mm_per_prompt=mm_counts,
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
profiler = MultiModalProfiler(processor)
decoder_dummy_data = profiler.get_decoder_dummy_data(

View File

@ -118,7 +118,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": len(size_factors)},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
min_num = min_dynamic_patch if dynamic_image_size else 1

View File

@ -39,7 +39,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass

View File

@ -39,7 +39,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass

View File

@ -34,7 +34,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
tokenizer = processor.info.get_tokenizer()
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

View File

@ -38,7 +38,7 @@ def test_processor_override(
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
limit_mm_per_prompt={"image": num_imgs},
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass

View File

@ -11,7 +11,7 @@ import pytest
import torch.nn as nn
from PIL import Image
from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.platforms import current_platform
from vllm.tokenizers import cached_tokenizer_from_config
from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype
@ -149,10 +150,7 @@ def initialize_dummy_model(
backend="nccl",
)
initialize_model_parallel(tensor_model_parallel_size=1)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=model_config)
with set_current_vllm_config(vllm_config=vllm_config):
with set_default_torch_dtype(model_config.dtype):
model = model_cls(vllm_config=vllm_config)
@ -184,12 +182,19 @@ def test_model_tensor_schema(model_id: str):
else:
dtype = model_info.dtype
renderer_config = model_info.build_renderer_config(
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=hf_overrides_fn,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=dtype,
)
model_config = renderer_config.model_config
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
assert supports_multimodal(model_cls)
@ -207,7 +212,10 @@ def test_model_tensor_schema(model_id: str):
if not any(inputs_parse_methods):
pytest.skip(f"{model_arch} does not support tensor schema validation.")
ctx = InputProcessingContext.from_config(renderer_config)
ctx = InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
processing_info = factories.info(ctx)
supported_mm_limits = processing_info.get_supported_mm_limits()
limit_mm_per_prompt = {

View File

@ -3,7 +3,7 @@
import pytest
from vllm.assets.image import ImageAsset
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
@ -13,9 +13,8 @@ def test_multimodal_processor(model_id):
model=model_id,
model_impl="transformers",
)
renderer_config = RendererConfig(model_config=model_config)
mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
image_pil = ImageAsset("cherry_blossom").pil_image
mm_data = {"image": image_pil}

View File

@ -7,6 +7,7 @@ import torch
import transformers
from transformers import AutoConfig, PreTrainedModel
from vllm.config import ModelConfig
from vllm.model_executor.models.utils import WeightsMapper
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.transformers_utils.config import try_get_safetensors_metadata
@ -49,11 +50,37 @@ def test_hf_model_weights_mapper(model_arch: str):
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
model_config = model_info.build_model_config(config_format="hf")
is_mistral_model = model_arch in [
"Mistral3ForConditionalGeneration",
"PixtralForConditionalGeneration",
"VoxtralForConditionalGeneration",
]
if not is_mistral_model or model_info.tokenizer_mode == "mistral":
tokenizer_mode = model_info.tokenizer_mode
else:
tokenizer_mode = "hf"
model_id = model_info.default
model_config = ModelConfig(
model_id,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=tokenizer_mode,
config_format="hf",
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
original_weights = create_repo_dummy_weights(model_config.model)
hf_dummy_model = create_dummy_model(model_config.model, model_arch)
original_weights = create_repo_dummy_weights(model_id)
hf_dummy_model = create_dummy_model(model_id, model_arch)
hf_converted_weights = hf_dummy_model.named_parameters()
hf_converted_buffers = hf_dummy_model.named_buffers()
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper

View File

@ -9,8 +9,7 @@ import pytest
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm.config.model import ModelConfig, ModelDType
from vllm.config.renderer import RendererConfig, TokenizerMode
from vllm.config.model import ModelDType, TokenizerMode
@dataclass(frozen=True)
@ -171,36 +170,6 @@ class _HfExamplesInfo:
else:
pytest.skip(msg)
def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig:
if model is None:
model = self.default
return ModelConfig(
**{
"model": model,
"revision": self.revision,
"trust_remote_code": self.trust_remote_code,
"hf_overrides": self.hf_overrides,
"enable_prompt_embeds": self.require_embed_inputs,
"enable_mm_embeds": self.require_embed_inputs,
"enforce_eager": self.enforce_eager,
"dtype": self.dtype,
**kwargs,
}
)
def build_renderer_config(
self, model: str | None = None, **kwargs
) -> RendererConfig:
model_config = self.build_model_config(model, **kwargs)
return RendererConfig(
model_config=model_config,
tokenizer=self.tokenizer or model_config.model,
tokenizer_mode=self.tokenizer_mode,
skip_tokenizer_init=self.require_embed_inputs,
)
_TEXT_GENERATION_EXAMPLE_MODELS = {
# [Decoder-only]

View File

@ -13,6 +13,7 @@ from transformers import PretrainedConfig
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
from vllm.multimodal.processing import InputProcessingContext
from vllm.tokenizers import cached_tokenizer_from_config
from .. import ci_envs
from .registry import HF_EXAMPLE_MODELS
@ -295,18 +296,30 @@ def build_model_context(
model_config_kwargs = model_config_kwargs or {}
limit_mm_per_prompt = limit_mm_per_prompt or {}
renderer_config = model_info.build_renderer_config(
model_config = ModelConfig(
model_id,
runner=runner,
tokenizer=model_info.tokenizer or model_id,
tokenizer_mode=model_info.tokenizer_mode,
revision=model_info.revision,
trust_remote_code=model_info.trust_remote_code,
dtype=dtype,
seed=0,
mm_processor_kwargs=mm_processor_kwargs,
limit_mm_per_prompt=limit_mm_per_prompt,
mm_processor_cache_gb=mm_processor_cache_gb,
hf_overrides=model_info.hf_overrides,
skip_tokenizer_init=model_info.require_embed_inputs,
enable_prompt_embeds=model_info.require_embed_inputs,
enable_mm_embeds=model_info.require_embed_inputs,
enforce_eager=model_info.enforce_eager,
**model_config_kwargs,
)
return InputProcessingContext.from_config(renderer_config)
return InputProcessingContext(
model_config,
tokenizer=cached_tokenizer_from_config(model_config),
)
def check_embeddings_close(

View File

@ -6,7 +6,7 @@ import numpy as np
import pytest
import torch
from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import (
BaseMultiModalProcessorCache,
@ -110,14 +110,11 @@ def _create_vllm_config(
mm_processor_cache_gb: float,
enable_ipc: bool,
):
model_config = ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_gb=mm_processor_cache_gb,
)
return VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
model_config=ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_gb=mm_processor_cache_gb,
),
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
)
@ -509,15 +506,13 @@ def _run_test_cache_eviction_shm(
def test_cache_eviction_shm_cache():
model_config = ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_type="shm",
mm_shm_cache_max_object_size_mb=6,
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
model_config=ModelConfig(
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
mm_processor_cache_type="shm",
mm_shm_cache_max_object_size_mb=6,
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
),
)
sender_cache = ShmObjectStoreSenderCache(vllm_config)
receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())

View File

@ -7,7 +7,7 @@ from contextlib import nullcontext
import numpy as np
import pytest
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.processing import (
InputProcessingContext,
@ -920,9 +920,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
model=model_id,
limit_mm_per_prompt=limit_mm_per_prompt,
)
renderer_config = RendererConfig(model_config=model_config)
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
processor._supported_mm_limits = {"image": num_supported}
profiler = MultiModalProfiler(processor)
@ -956,9 +955,8 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
model=model_id,
limit_mm_per_prompt=limit_mm_per_prompt,
)
renderer_config = RendererConfig(model_config=model_config)
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
rng = np.random.RandomState(0)
image = random_image(rng, min_wh=128, max_wh=256)
@ -1014,13 +1012,11 @@ def test_hf_processor_init_kwargs(
inference_kwargs,
expected_kwargs,
):
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
renderer_config = RendererConfig(
model_config=model_config,
tokenizer=model_id,
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=None,
)
ctx = InputProcessingContext.from_config(renderer_config)
processor = ctx.get_hf_processor(
DummyProcessor, # type: ignore[arg-type]
**inference_kwargs,
@ -1049,13 +1045,11 @@ def test_hf_processor_call_kwargs(
inference_kwargs,
expected_kwargs,
):
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
renderer_config = RendererConfig(
model_config=model_config,
tokenizer=model_id,
ctx = InputProcessingContext(
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
tokenizer=None,
)
ctx = InputProcessingContext.from_config(renderer_config)
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
result = ctx.call_hf_processor(processor, {}, inference_kwargs)

View File

@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
model_id,
limit_mm_per_prompt=limit_mm_per_prompt,
)
assert (
MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
)
assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected

View File

@ -13,7 +13,6 @@ from vllm.config import (
CompilationConfig,
ModelConfig,
PoolerConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
update_config,
@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
],
)
def test_recalculate_max_model_len(
def test_get_and_verify_max_len(
model_id, max_model_len, expected_max_len, should_raise
):
"""Test recalculate_max_model_len with different configurations."""
"""Test get_and_verify_max_len with different configurations."""
model_config = ModelConfig(model_id)
if should_raise:
with pytest.raises(ValueError):
model_config.recalculate_max_model_len(
max_model_len,
tokenizer=model_id,
tokenizer_revision=None,
)
model_config.get_and_verify_max_len(max_model_len)
else:
model_config.recalculate_max_model_len(
max_model_len,
tokenizer=model_id,
tokenizer_revision=None,
)
assert model_config.max_model_len == expected_max_len
actual_max_len = model_config.get_and_verify_max_len(max_model_len)
assert actual_max_len == expected_max_len
class MockModelConfig:
"""Simple mock object for testing maybe_pull_model_for_runai"""
class MockConfig:
"""Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
def __init__(self, model: str):
def __init__(self, model: str, tokenizer: str):
self.model = model
class MockRendererConfig:
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
def __init__(self, model_config: MockModelConfig):
self.model_config = model_config
self.tokenizer = model_config.model
self.tokenizer = tokenizer
self.model_weights = None
@pytest.mark.parametrize(
@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
mock_pull_files.return_value = None
# Create first mock and run the method
model_config1 = MockModelConfig(model=s3_url)
renderer_config1 = MockRendererConfig(model_config=model_config1)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
config1 = MockConfig(model=s3_url, tokenizer=s3_url)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
# Check that model and tokenizer point to existing directories
assert os.path.exists(model_config1.model), (
f"Model directory does not exist: {model_config1.model}"
assert os.path.exists(config1.model), (
f"Model directory does not exist: {config1.model}"
)
assert os.path.isdir(model_config1.model), (
f"Model path is not a directory: {model_config1.model}"
assert os.path.isdir(config1.model), (
f"Model path is not a directory: {config1.model}"
)
assert os.path.exists(renderer_config1.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
assert os.path.exists(config1.tokenizer), (
f"Tokenizer directory does not exist: {config1.tokenizer}"
)
assert os.path.isdir(renderer_config1.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
assert os.path.isdir(config1.tokenizer), (
f"Tokenizer path is not a directory: {config1.tokenizer}"
)
# Verify that the paths are different from the original S3 URL
assert model_config1.model != s3_url, (
"Model path should be converted to local directory"
)
assert renderer_config1.tokenizer != s3_url, (
assert config1.model != s3_url, "Model path should be converted to local directory"
assert config1.tokenizer != s3_url, (
"Tokenizer path should be converted to local directory"
)
# Store the original paths
created_model_dir = model_config1.model
create_tokenizer_dir = renderer_config1.tokenizer
created_model_dir = config1.model
create_tokenizer_dir = config1.tokenizer
# Create a new mock and run the method with the same S3 URL
model_config2 = MockModelConfig(model=s3_url)
renderer_config2 = MockRendererConfig(model_config=model_config2)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
config2 = MockConfig(model=s3_url, tokenizer=s3_url)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
# Check that the new directories exist
assert os.path.exists(model_config2.model), (
f"Model directory does not exist: {model_config2.model}"
assert os.path.exists(config2.model), (
f"Model directory does not exist: {config2.model}"
)
assert os.path.isdir(model_config2.model), (
f"Model path is not a directory: {model_config2.model}"
assert os.path.isdir(config2.model), (
f"Model path is not a directory: {config2.model}"
)
assert os.path.exists(renderer_config2.tokenizer), (
f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
assert os.path.exists(config2.tokenizer), (
f"Tokenizer directory does not exist: {config2.tokenizer}"
)
assert os.path.isdir(renderer_config2.tokenizer), (
f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
assert os.path.isdir(config2.tokenizer), (
f"Tokenizer path is not a directory: {config2.tokenizer}"
)
# Verify that the paths are deterministic (same as before)
assert model_config2.model == created_model_dir, (
assert config2.model == created_model_dir, (
f"Model paths are not deterministic. "
f"Original: {created_model_dir}, New: {model_config2.model}"
f"Original: {created_model_dir}, New: {config2.model}"
)
assert renderer_config2.tokenizer == create_tokenizer_dir, (
assert config2.tokenizer == create_tokenizer_dir, (
f"Tokenizer paths are not deterministic. "
f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
)
@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
s3_url2 = "s3://example-bucket-2/model/"
# Create mocks with different S3 URLs and run the method
model_config1 = MockModelConfig(model=s3_url1)
renderer_config1 = MockRendererConfig(model_config=model_config1)
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
model_config2 = MockModelConfig(model=s3_url2)
renderer_config2 = MockRendererConfig(model_config=model_config2)
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
# Verify that different URLs produce different directories
assert model_config1.model != model_config2.model, (
assert config1.model != config2.model, (
f"Different S3 URLs should create different model directories. "
f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
f"URL1 model: {config1.model}, URL2 model: {config2.model}"
)
assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
assert config1.tokenizer != config2.tokenizer, (
f"Different S3 URLs should create different tokenizer directories. "
f"URL1 tokenizer: {renderer_config1.tokenizer}, "
f"URL2 tokenizer: {renderer_config2.tokenizer}"
f"URL1 tokenizer: {config1.tokenizer}, "
f"URL2 tokenizer: {config2.tokenizer}"
)
# Verify that both sets of directories exist
assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
renderer_config1.tokenizer
)
assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
renderer_config2.tokenizer
)
assert os.path.exists(config1.model) and os.path.isdir(config1.model)
assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
assert os.path.exists(config2.model) and os.path.isdir(config2.model)
assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
@pytest.mark.parametrize(

View File

@ -3,7 +3,7 @@
import pytest
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.inputs import zip_enc_dec_prompts
from vllm.inputs.parse import parse_raw_prompts
from vllm.inputs.preprocess import InputPreprocessor
@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
)
def test_preprocessor_always_mm_code_path(model_id, prompt):
model_config = ModelConfig(model=model_id)
renderer_config = RendererConfig(model_config=model_config)
tokenizer = init_tokenizer_from_config(renderer_config)
input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
tokenizer = init_tokenizer_from_config(model_config)
input_preprocessor = InputPreprocessor(model_config, tokenizer)
# HF processor adds sep token
sep_token_id = tokenizer.vocab[tokenizer.sep_token]

View File

@ -16,7 +16,6 @@ from vllm.config import (
LoadConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
@ -217,7 +216,6 @@ def create_vllm_config(
return VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,

View File

@ -8,7 +8,7 @@ import pytest
import torch
import vllm.v1.core.kv_cache_utils as kv_cache_utils
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.lora.request import LoRARequest
from vllm.multimodal.inputs import (
MultiModalFeatureSpec,
@ -667,10 +667,7 @@ def test_metrics_empty_stats():
def test_get_kv_cache_configs_multiple_workers():
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=model_config)
ref_kv_cache_spec = new_kv_cache_spec()
same_kv_cache_specs = [
@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config,
)
@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
scheduler_config=scheduler_config,
)
@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
def test_get_kv_cache_config_one_worker():
# pass max_model_len to pass check_enough_kv_cache_memory
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=model_config)
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
# all layers are full attention -> single group
@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
def test_get_kv_cache_configs_attention_free():
kv_cache_specs: dict[str, KVCacheSpec] = {}
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
)
vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
assert kv_cache_configs == [
KVCacheConfig(

View File

@ -11,7 +11,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config,

View File

@ -9,7 +9,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
@ -133,7 +132,6 @@ def create_scheduler(
vllm_config = VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
speculative_config=speculative_config,

View File

@ -15,7 +15,6 @@ from vllm.config import (
ECTransferConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
kv_transfer_config=kv_transfer_config,

View File

@ -5,14 +5,7 @@ import pytest
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.config import (
CacheConfig,
DeviceConfig,
ModelConfig,
MultiModalConfig,
RendererConfig,
VllmConfig,
)
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
from vllm.sampling_params import SamplingParams
from vllm.v1.engine import input_processor as input_processor_mod
from vllm.v1.engine.input_processor import InputProcessor
@ -51,21 +44,22 @@ def _mock_input_processor(
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
model_config = ModelConfig(
skip_tokenizer_init=True,
max_model_len=128,
mm_processor_cache_gb=mm_cache_gb,
generation_config="vllm",
)
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
renderer_config = RendererConfig(
model_config=model_config,
tokenizer="dummy",
skip_tokenizer_init=True,
)
# Minimal multimodal_config to satisfy references in
# Processor.process_inputs.
class _MockMMConfig:
def __init__(self, gb: float):
self.mm_processor_cache_gb = gb
model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined]
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=renderer_config,
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
device_config=DeviceConfig(device="cpu"),
)

View File

@ -15,7 +15,6 @@ from vllm.config import (
DeviceConfig,
KVTransferConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
)
@ -128,7 +127,6 @@ def create_vllm_config(
return VllmConfig(
scheduler_config=scheduler_config,
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
kv_transfer_config=kv_transfer_config,
device_config=DeviceConfig("cpu"),

View File

@ -19,7 +19,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
@ -62,7 +61,6 @@ def _create_proposer(
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),

View File

@ -18,7 +18,6 @@ from vllm.config import (
DeviceConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
SpeculativeConfig,
VllmConfig,
@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=CacheConfig(),
speculative_config=speculative_config,
device_config=DeviceConfig(device=current_platform.device_type),

View File

@ -4,7 +4,6 @@ import numpy as np
from vllm.config import (
ModelConfig,
RendererConfig,
SpeculativeConfig,
VllmConfig,
)
@ -70,7 +69,6 @@ def test_ngram_proposer():
return NgramProposer(
vllm_config=VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
speculative_config=SpeculativeConfig(
prompt_lookup_min=min_n,
prompt_lookup_max=max_n,

View File

@ -6,7 +6,7 @@ from concurrent.futures import Future
import pytest
from transformers import AutoTokenizer
from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
from vllm.config import StructuredOutputsConfig, VllmConfig
from vllm.config.model import ModelConfig
from vllm.config.parallel import ParallelConfig
from vllm.config.speculative import SpeculativeConfig
@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
def test_grammar_bitmask_with_specdec():
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
prompt = tokenizer.encode('{"a": "b"}')
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
model_config=ModelConfig(tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
)
@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
# Use "external_launcher" for sync mode, None for async mode
executor_backend = None if async_grammar else "external_launcher"
model_config = ModelConfig(tokenizer=TOKENIZER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
model_config=ModelConfig(tokenizer=TOKENIZER),
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
)

View File

@ -7,7 +7,7 @@ from unittest.mock import Mock
import pytest
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
from vllm.reasoning import ReasoningParser
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
"""Test reasoning-aware structured output functionality."""
@pytest.fixture
def mock_renderer_config(self):
"""Create a mock RendererConfig."""
renderer_config = Mock(spec=RendererConfig)
renderer_config.skip_tokenizer_init = (
True # Skip tokenizer init to avoid network calls
)
model_config = Mock(spec=ModelConfig)
model_config.get_vocab_size = Mock(return_value=50000)
model_config.trust_remote_code = False
def mock_model_config(self):
"""Create a mock ModelConfig."""
config = Mock(spec=ModelConfig)
config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls
config.get_vocab_size = Mock(return_value=50000)
# Add missing runner_type attribute that tokenizer initialization expects
model_config.runner_type = "generate"
renderer_config.model_config = model_config
config.runner_type = "generate"
# Add other attributes that tokenizer initialization might need
renderer_config.tokenizer = "test-tokenizer"
renderer_config.tokenizer_mode = "auto"
renderer_config.tokenizer_revision = None
return renderer_config
config.tokenizer = "test-tokenizer"
config.tokenizer_mode = "auto"
config.trust_remote_code = False
config.tokenizer_revision = None
return config
@pytest.fixture
def mock_scheduler_config(self):
@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
return config
@pytest.fixture
def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
"""Create a mock VllmConfig."""
config = Mock(spec=VllmConfig)
config.renderer_config = mock_renderer_config
config.model_config = mock_model_config
config.scheduler_config = mock_scheduler_config
config.structured_outputs_config = Mock()
config.structured_outputs_config.reasoning_parser = None

View File

@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
from vllm.config import (
CacheConfig,
ModelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
@ -46,7 +45,6 @@ def get_vllm_config():
)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
)

View File

@ -13,7 +13,6 @@ from vllm.config import (
CacheConfig,
ModelConfig,
ParallelConfig,
RendererConfig,
SchedulerConfig,
VllmConfig,
set_current_vllm_config,
@ -102,7 +101,6 @@ def get_vllm_config():
parallel_config = ParallelConfig()
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,
@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
vllm_config = VllmConfig(
model_config=model_config,
renderer_config=RendererConfig(model_config=model_config),
cache_config=cache_config,
scheduler_config=scheduler_config,
parallel_config=parallel_config,

View File

@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
from vllm.config.observability import ObservabilityConfig
from vllm.config.parallel import EPLBConfig, ParallelConfig
from vllm.config.pooler import PoolerConfig
from vllm.config.renderer import RendererConfig
from vllm.config.scheduler import SchedulerConfig
from vllm.config.speculative import SpeculativeConfig
from vllm.config.speech_to_text import SpeechToTextConfig
@ -82,8 +81,6 @@ __all__ = [
"ParallelConfig",
# From vllm.config.pooler
"PoolerConfig",
# From vllm.config.renderer
"RendererConfig",
# From vllm.config.scheduler
"SchedulerConfig",
# From vllm.config.speculative

View File

@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
uses_xdrope_dim,
)
from vllm.transformers_utils.gguf_utils import (
is_gguf,
is_remote_gguf,
maybe_patch_hf_config_from_gguf,
split_remote_gguf,
@ -82,6 +83,7 @@ TaskOption = Literal[
"transcription",
"draft",
]
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
LogprobsMode = Literal[
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
@ -129,6 +131,18 @@ class ModelConfig:
Note that the model may support other tasks using the same model runner.
"""
tokenizer: SkipValidation[str] = None # type: ignore
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
trust_remote_code: bool = False
"""Trust remote code (e.g., from HuggingFace) when downloading the model
and tokenizer."""
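A minimal sketch of these restored fields in use, assuming a placeholder model name; `tokenizer` falls back to the model path and `tokenizer_mode` selects the tokenizer implementation as described above:
from vllm.config import ModelConfig
# Placeholder model; tokenizer is left unset so __post_init__ reuses the model
# path, and tokenizer_mode="auto" picks mistral_common for Mistral models,
# otherwise the HF fast tokenizer.
model_config = ModelConfig(
    model="meta-llama/Llama-3.1-8B-Instruct",
    tokenizer_mode="auto",
    trust_remote_code=False,
)
print(model_config.tokenizer)  # same as the model path by default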
@ -154,6 +168,13 @@ class ModelConfig:
hf_config_path: str | None = None
"""Name or path of the Hugging Face config to use. If unspecified, model
name or path will be used."""
allowed_local_media_path: str = ""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains: list[str] | None = None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
revision: str | None = None
"""The specific model version to use. It can be a branch name, a tag name,
or a commit id. If unspecified, will use the default version."""
@ -161,6 +182,10 @@ class ModelConfig:
"""The specific revision to use for the model code on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
tokenizer_revision: str | None = None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
max_model_len: SkipValidation[int] = None # type: ignore
"""Model context length (prompt and output). If unspecified, will be
automatically derived from the model config.
@ -205,6 +230,10 @@ class ModelConfig:
preventing potential numerical issues. Note that even if this is set to
False, cascade attention will be only used when the heuristic tells that
it's beneficial."""
skip_tokenizer_init: bool = False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
enable_prompt_embeds: bool = False
"""If `True`, enables passing text embeddings as inputs via the
`prompt_embeds` key.
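As the `skip_tokenizer_init` docstring notes, prompts must then be supplied as token IDs; a hedged sketch of that calling pattern (model name and token IDs are placeholders):
from vllm import LLM, SamplingParams
# Assumed usage: without a tokenizer, generate() takes prompt_token_ids and the
# output only exposes token IDs (detokenize=False avoids needing a detokenizer).
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", skip_tokenizer_init=True)
outputs = llm.generate(
    {"prompt_token_ids": [1, 3087, 1139]},  # placeholder token IDs
    SamplingParams(max_tokens=8, detokenize=False),
)
print(outputs[0].outputs[0].token_ids)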
@ -265,6 +294,8 @@ class ModelConfig:
logits_processors: list[str | type[LogitsProcessor]] | None = None
"""One or more logits processors' fully-qualified class names or class
definitions"""
io_processor_plugin: str | None = None
"""IOProcessor plugin name to load at model startup"""
# Pooler config
pooler_config: PoolerConfig | None = None
@ -277,6 +308,7 @@ class ModelConfig:
from the architecture of `self.model`."""
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
enable_mm_embeds: InitVar[bool | None] = None
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
mm_processor_cache_gb: InitVar[float | None] = None
mm_processor_cache_type: InitVar[MMCacheType | None] = None
@ -303,12 +335,18 @@ class ModelConfig:
"runner",
"convert",
"task",
"tokenizer",
"tokenizer_mode",
"seed",
"hf_config_path",
"allowed_local_media_path",
"allowed_media_domains",
"tokenizer_revision",
"spec_target_max_model_len",
"enforce_eager",
"logprobs_mode",
"disable_cascade_attn",
"skip_tokenizer_init",
"served_model_name",
"config_format",
"hf_token",
@ -316,9 +354,11 @@ class ModelConfig:
"logits_processor_pattern",
"override_attention_dtype",
"logits_processors",
"io_processor_plugin",
"pooler_config",
"multimodal_config",
"limit_mm_per_prompt",
"media_io_kwargs",
"mm_processor_kwargs",
"mm_processor_cache_gb",
"mm_processor_cache_type",
@ -383,6 +423,7 @@ class ModelConfig:
# Multimodal config init vars
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
enable_mm_embeds: bool | None,
media_io_kwargs: dict[str, dict[str, Any]] | None,
mm_processor_kwargs: dict[str, Any] | None,
mm_processor_cache_gb: float | None,
mm_processor_cache_type: MMCacheType | None,
@ -397,8 +438,13 @@ class ModelConfig:
self.served_model_name = get_served_model_name(
self.model, self.served_model_name
)
self.original_model = self.model
self.model = maybe_model_redirect(self.original_model)
self.model = maybe_model_redirect(self.model)
# The tokenizer is consistent with the model by default.
if self.tokenizer is None:
self.tokenizer = self.model
if self.tokenizer_revision is None:
self.tokenizer_revision = self.revision
self.tokenizer = maybe_model_redirect(self.tokenizer)
if isinstance(self.hf_config_path, str):
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
@ -419,7 +465,7 @@ class ModelConfig:
hf_overrides_kw[key] = value
hf_overrides_fn = None
self.maybe_pull_model_for_runai(self.model)
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
from vllm.platforms import current_platform
@ -602,8 +648,7 @@ class ModelConfig:
)
self.original_max_model_len = self.max_model_len
self.recalculate_max_model_len(self.original_max_model_len)
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
# Init multimodal config if needed
if self._model_info.supports_multimodal:
if (
@ -619,6 +664,7 @@ class ModelConfig:
mm_config_kwargs = dict(
limit_per_prompt=limit_mm_per_prompt,
enable_mm_embeds=enable_mm_embeds,
media_io_kwargs=media_io_kwargs,
mm_processor_kwargs=mm_processor_kwargs,
mm_processor_cache_gb=mm_processor_cache_gb,
mm_processor_cache_type=mm_processor_cache_type,
@ -636,8 +682,16 @@ class ModelConfig:
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
# Multimodal GGUF models must use original repo for mm processing
if is_gguf(self.tokenizer) and self.is_multimodal_model:
raise ValueError(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
if self.disable_sliding_window:
# Set after recalculate_max_model_len to ensure that max_model_len
# Set after get_and_verify_max_len to ensure that max_model_len
# can be correctly capped to sliding window size
self.hf_text_config.sliding_window = None
@ -661,9 +715,10 @@ class ModelConfig:
@model_validator(mode="after")
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
if not isinstance(self.tokenizer, str):
raise ValueError("tokenizer must be a string after __post_init__.")
if not isinstance(self.max_model_len, int):
raise ValueError("max_model_len must be an integer after __post_init__.")
return self
def _get_transformers_backend_cls(self) -> str:
@ -712,17 +767,49 @@ class ModelConfig:
"""The architecture vllm actually used."""
return self._architecture
def maybe_pull_model_for_runai(self, model: str) -> None:
"""Pull model from Object Storage to temporary directory when needed."""
if not is_runai_obj_uri(model):
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
"""Pull model/tokenizer from Object Storage to temporary
directory when needed.
Args:
model: Model name or path
tokenizer: Tokenizer name or path
"""
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
return
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
if is_runai_obj_uri(model):
object_storage_model = ObjectStorageModel(url=model)
object_storage_model.pull_files(
model, allow_pattern=["*.model", "*.py", "*.json"]
)
self.model_weights = model
self.model = object_storage_model.dir
# If tokenizer is same as model, download to same directory
if model == tokenizer:
object_storage_model.pull_files(
model,
ignore_pattern=[
"*.pt",
"*.safetensors",
"*.bin",
"*.tensors",
"*.pth",
],
)
self.tokenizer = object_storage_model.dir
return
# Only download tokenizer if needed and not already handled
if is_runai_obj_uri(tokenizer):
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
object_storage_tokenizer.pull_files(
model,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
)
self.tokenizer = object_storage_tokenizer.dir
def _get_encoder_config(self):
model = self.model
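A hedged sketch of how the restored pull path is reached from __post_init__ when object-storage URIs are supplied (bucket and paths are placeholders):
from vllm.config import ModelConfig
# Hypothetical invocation: model and tokenizer share one s3:// URI, so the pull
# downloads into a single temporary directory and rewrites both paths to it.
config = ModelConfig(
    model="s3://example-bucket/model/",
    tokenizer="s3://example-bucket/model/",
)
print(config.model)      # local temporary directory after the pull
print(config.tokenizer)  # same directory, since model == tokenizer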
@ -1625,38 +1712,30 @@ class ModelConfig:
return dense_modules[-1]["out_features"]
return self.get_hidden_size()
def recalculate_max_model_len(
self,
original_max_model_len: int | None,
*,
tokenizer: str | None = None,
tokenizer_revision: str | None = None,
) -> None:
def get_and_verify_max_len(self, max_model_len: int):
# Consider max_model_len in tokenizer_config only when
# pooling models use absolute position_embedding.
# NOTE: For simplicity we assume `args.model == args.tokenizer`
# since this is
tokenizer_config = None
if (
self.runner_type == "pooling"
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
):
tokenizer_config = try_get_tokenizer_config(
tokenizer or self.model,
self.tokenizer,
trust_remote_code=self.trust_remote_code,
revision=tokenizer_revision or self.revision,
revision=self.tokenizer_revision,
)
self.max_model_len = _get_and_verify_max_len(
max_model_len = _get_and_verify_max_len(
hf_config=self.hf_text_config,
tokenizer_config=tokenizer_config,
max_model_len=original_max_model_len,
max_model_len=max_model_len,
disable_sliding_window=self.disable_sliding_window,
sliding_window=self.get_sliding_window(),
spec_target_max_model_len=self.spec_target_max_model_len,
encoder_config=self.encoder_config,
)
logger.info("Using max model len %s", self.max_model_len)
logger.info("Using max model len %s", max_model_len)
return max_model_len
@property
def attn_type(self) -> AttnTypeStr:

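`get_and_verify_max_len` now returns the verified length instead of mutating the config, so callers assign it explicitly; a minimal sketch mirroring the call sites elsewhere in this diff (the model name is taken from the tests above):
from vllm.config import ModelConfig
# Sketch of the post-revert contract: the method only computes and logs the
# value; storing it is the caller's job (compare VllmConfig.recalculate_max_model_len
# later in this diff).
model_config = ModelConfig("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B")
verified_len = model_config.get_and_verify_max_len(4096)
model_config.max_model_len = verified_len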
View File

@ -79,6 +79,10 @@ class MultiModalConfig:
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
Only enable this flag for trusted users!"""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
mm_processor_kwargs: dict[str, object] | None = None
"""Arguments to be forwarded to the model's processor for multi-modal data,
e.g., image processor. Overrides for the multi-modal processor obtained

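Per the docstring above, the same setting can be passed programmatically; a hedged sketch via the offline LLM entrypoint (model name and frame count are placeholders):
from vllm import LLM
# Assumed pass-through: LLM forwards engine arguments, so media_io_kwargs lands
# on MultiModalConfig after this revert, equivalent to
# --media-io-kwargs '{"video": {"num_frames": 40}}' on the CLI.
llm = LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    media_io_kwargs={"video": {"num_frames": 40}},
)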
View File

@ -1,109 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from typing import Any, Literal
from pydantic import Field, SkipValidation
from pydantic.dataclasses import dataclass
from vllm.config.model import ModelConfig
from vllm.config.utils import config
from vllm.transformers_utils.gguf_utils import is_gguf
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
from vllm.transformers_utils.utils import maybe_model_redirect
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
@config
@dataclass
class RendererConfig:
"""Configuration for the renderer."""
# NOTE: In reality, this is a required argument.
# We provide a dummy default value here to generate the CLI args.
model_config: SkipValidation[ModelConfig] = None # type: ignore
"""Provides model context to the renderer."""
tokenizer: str = ""
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
name or path will be used."""
tokenizer_mode: TokenizerMode | str = "auto"
"""Tokenizer mode:\n
- "auto" will use the tokenizer from `mistral_common` for Mistral models
if available, otherwise it will use the "hf" tokenizer.\n
- "hf" will use the fast tokenizer if available.\n
- "slow" will always use the slow tokenizer.\n
- "mistral" will always use the tokenizer from `mistral_common`.\n
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
- Other custom values can be supported via plugins."""
tokenizer_revision: str | None = None
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
It can be a branch name, a tag name, or a commit id. If unspecified, will
use the default version."""
skip_tokenizer_init: bool = False
"""Skip initialization of tokenizer and detokenizer. Expects valid
`prompt_token_ids` and `None` for prompt from the input. The generated
output will contain token ids."""
io_processor_plugin: str | None = None
"""IOProcessor plugin name to load at model startup."""
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
"""Additional args passed to process media inputs, keyed by modalities.
For example, to set num_frames for video, set
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
allowed_local_media_path: str = ""
"""Allowing API requests to read local images or videos from directories
specified by the server file system. This is a security risk. Should only
be enabled in trusted environments."""
allowed_media_domains: list[str] | None = None
"""If set, only media URLs that belong to this domain can be used for
multi-modal inputs. """
@property
def trust_remote_code(self) -> bool:
return self.model_config.trust_remote_code
def __post_init__(self) -> None:
model_config = self.model_config
# The tokenizer is consistent with the model by default.
if not self.tokenizer:
self.tokenizer = (
ModelConfig.model
if model_config is None
else model_config.original_model
)
if not self.tokenizer_revision:
self.tokenizer_revision = (
ModelConfig.revision if model_config is None else model_config.revision
)
self.original_tokenizer = self.tokenizer
self.tokenizer = maybe_model_redirect(self.original_tokenizer)
self.maybe_pull_tokenizer_for_runai(self.tokenizer)
# Multimodal GGUF models must use original repo for mm processing
is_multimodal_model = (
ModelConfig.is_multimodal_model
if model_config is None
else model_config.is_multimodal_model
)
if is_gguf(self.tokenizer) and is_multimodal_model:
raise ValueError(
"Loading a multimodal GGUF model needs to use original "
"tokenizer. Please specify the unquantized hf model's "
"repo name or path using the --tokenizer argument."
)
def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
"""Pull tokenizer from Object Storage to temporary directory when needed."""
if not is_runai_obj_uri(tokenizer):
return
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
object_storage_tokenizer.pull_files(
tokenizer,
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
)
self.tokenizer = object_storage_tokenizer.dir

View File

@ -322,11 +322,16 @@ class SpeculativeConfig:
self.draft_model_config = ModelConfig(
model=self.model,
runner="draft",
tokenizer=self.target_model_config.tokenizer,
tokenizer_mode=self.target_model_config.tokenizer_mode,
trust_remote_code=self.target_model_config.trust_remote_code,
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
allowed_media_domains=self.target_model_config.allowed_media_domains,
dtype=self.target_model_config.dtype,
seed=self.target_model_config.seed,
revision=self.revision,
code_revision=self.code_revision,
tokenizer_revision=self.target_model_config.tokenizer_revision,
spec_target_max_model_len=self.target_model_config.max_model_len,
quantization=self.quantization,
enforce_eager=self.target_model_config.enforce_eager,

View File

@ -39,7 +39,6 @@ from .lora import LoRAConfig
from .model import ModelConfig
from .observability import ObservabilityConfig
from .parallel import ParallelConfig
from .renderer import RendererConfig
from .scheduler import SchedulerConfig
from .speculative import SpeculativeConfig
from .structured_outputs import StructuredOutputsConfig
@ -182,8 +181,6 @@ class VllmConfig:
# try to download a model
model_config: ModelConfig = Field(default=None)
"""Model configuration."""
renderer_config: RendererConfig = Field(default_factory=RendererConfig)
"""Renderer configuration."""
cache_config: CacheConfig = Field(default_factory=CacheConfig)
"""Cache configuration."""
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
@ -744,7 +741,7 @@ class VllmConfig:
from vllm.multimodal import MULTIMODAL_REGISTRY
self.scheduler_config.max_num_encoder_input_tokens = (
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
)
logger.debug(
"Encoder-decoder model detected: setting "
@ -1189,13 +1186,11 @@ class VllmConfig:
computed_compile_ranges_split_points
)
def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
# Can only be called during try_verify_and_update_config
self.model_config.recalculate_max_model_len(
original_max_model_len,
tokenizer=self.renderer_config.tokenizer,
tokenizer_revision=self.renderer_config.tokenizer_revision,
)
def recalculate_max_model_len(self, max_model_len: int):
# Can only be called in try_verify_and_update_config
model_config = self.model_config
max_model_len = model_config.get_and_verify_max_len(max_model_len)
self.model_config.max_model_len = max_model_len
def try_verify_and_update_config(self):
if self.model_config is None:
@ -1269,11 +1264,11 @@ class VllmConfig:
return (
f"model={self.model_config.model!r}, "
f"speculative_config={self.speculative_config!r}, "
f"tokenizer={self.renderer_config.tokenizer!r}, "
f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
f"tokenizer={self.model_config.tokenizer!r}, "
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
f"revision={self.model_config.revision}, "
f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
f"trust_remote_code={self.model_config.trust_remote_code}, "
f"dtype={self.model_config.dtype}, "
f"max_seq_len={self.model_config.max_model_len}, "

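With renderer_config removed, VllmConfig is built from model_config alone and the tokenizer settings are read back off it; a minimal sketch mirroring the simplified test fixtures above:
from vllm.config import ModelConfig, VllmConfig
# Post-revert construction: no separate renderer config, and tokenizer-related
# attributes live on model_config again.
model_config = ModelConfig(max_model_len=16)
vllm_config = VllmConfig(model_config=model_config)
print(vllm_config.model_config.tokenizer)
print(vllm_config.model_config.tokenizer_mode)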
View File

@ -71,11 +71,11 @@ from vllm.config.model import (
ModelDType,
RunnerOption,
TaskOption,
TokenizerMode,
)
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
from vllm.config.observability import DetailedTraceModules
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
from vllm.config.renderer import RendererConfig, TokenizerMode
from vllm.config.scheduler import SchedulerPolicy
from vllm.config.utils import get_field
from vllm.config.vllm import OptimizationLevel
@ -355,12 +355,17 @@ class EngineArgs:
model: str = ModelConfig.model
served_model_name: str | list[str] | None = ModelConfig.served_model_name
tokenizer: str | None = ModelConfig.tokenizer
hf_config_path: str | None = ModelConfig.hf_config_path
runner: RunnerOption = ModelConfig.runner
convert: ConvertOption = ModelConfig.convert
task: TaskOption | None = ModelConfig.task
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
trust_remote_code: bool = ModelConfig.trust_remote_code
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
download_dir: str | None = LoadConfig.download_dir
safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
load_format: str | LoadFormats = LoadConfig.load_format
@ -444,6 +449,7 @@ class EngineArgs:
code_revision: str | None = ModelConfig.code_revision
hf_token: bool | str | None = ModelConfig.hf_token
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
quantization: QuantizationMethods | None = ModelConfig.quantization
enforce_eager: bool = ModelConfig.enforce_eager
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
@ -452,6 +458,9 @@ class EngineArgs:
)
enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
MultiModalConfig, "media_io_kwargs"
)
mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
disable_mm_preprocessor_cache: bool = False # DEPRECATED
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
@ -465,19 +474,9 @@ class EngineArgs:
mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
MultiModalConfig.mm_encoder_attn_backend
)
io_processor_plugin: str | None = None
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
# Renderer fields
tokenizer: str | None = None
tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode
tokenizer_revision: str | None = RendererConfig.tokenizer_revision
skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init
io_processor_plugin: str | None = None
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
RendererConfig, "media_io_kwargs"
)
allowed_local_media_path: str = RendererConfig.allowed_local_media_path
allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains
# LoRA fields
enable_lora: bool = False
max_loras: int = LoRAConfig.max_loras
@ -628,14 +627,25 @@ class EngineArgs:
model_group.add_argument("--runner", **model_kwargs["runner"])
model_group.add_argument("--convert", **model_kwargs["convert"])
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
model_group.add_argument(
"--trust-remote-code", **model_kwargs["trust_remote_code"]
)
model_group.add_argument("--dtype", **model_kwargs["dtype"])
model_group.add_argument("--seed", **model_kwargs["seed"])
model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
model_group.add_argument(
"--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
)
model_group.add_argument(
"--allowed-media-domains", **model_kwargs["allowed_media_domains"]
)
model_group.add_argument("--revision", **model_kwargs["revision"])
model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
model_group.add_argument(
"--tokenizer-revision", **model_kwargs["tokenizer_revision"]
)
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
@ -647,6 +657,9 @@ class EngineArgs:
model_group.add_argument(
"--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
)
model_group.add_argument(
"--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
)
model_group.add_argument(
"--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
)
@ -685,34 +698,8 @@ class EngineArgs:
model_group.add_argument(
"--logits-processors", **model_kwargs["logits_processors"]
)
# Renderer arguments
renderer_kwargs = get_kwargs(RendererConfig)
renderer_group = parser.add_argument_group(
title="RendererConfig",
description=RendererConfig.__doc__,
)
renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"])
renderer_group.add_argument(
"--tokenizer-mode", **renderer_kwargs["tokenizer_mode"]
)
renderer_group.add_argument(
"--tokenizer-revision", **renderer_kwargs["tokenizer_revision"]
)
renderer_group.add_argument(
"--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"]
)
renderer_group.add_argument(
"--media-io-kwargs", **renderer_kwargs["media_io_kwargs"]
)
renderer_group.add_argument(
"--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"]
)
renderer_group.add_argument(
"--allowed-media-domains", **renderer_kwargs["allowed_media_domains"]
)
renderer_group.add_argument(
"--io-processor-plugin", **renderer_kwargs["io_processor_plugin"]
model_group.add_argument(
"--io-processor-plugin", **model_kwargs["io_processor_plugin"]
)
# Model loading arguments
@ -962,6 +949,9 @@ class EngineArgs:
multimodal_group.add_argument(
"--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
)
multimodal_group.add_argument(
"--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
)
multimodal_group.add_argument(
"--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
)
@ -1265,13 +1255,18 @@ class EngineArgs:
runner=self.runner,
convert=self.convert,
task=self.task,
tokenizer=self.tokenizer,
tokenizer_mode=self.tokenizer_mode,
trust_remote_code=self.trust_remote_code,
allowed_local_media_path=self.allowed_local_media_path,
allowed_media_domains=self.allowed_media_domains,
dtype=self.dtype,
seed=self.seed,
revision=self.revision,
code_revision=self.code_revision,
hf_token=self.hf_token,
hf_overrides=self.hf_overrides,
tokenizer_revision=self.tokenizer_revision,
max_model_len=self.max_model_len,
quantization=self.quantization,
enforce_eager=self.enforce_eager,
@ -1279,11 +1274,13 @@ class EngineArgs:
logprobs_mode=self.logprobs_mode,
disable_sliding_window=self.disable_sliding_window,
disable_cascade_attn=self.disable_cascade_attn,
skip_tokenizer_init=self.skip_tokenizer_init,
enable_prompt_embeds=self.enable_prompt_embeds,
served_model_name=self.served_model_name,
limit_mm_per_prompt=self.limit_mm_per_prompt,
enable_mm_embeds=self.enable_mm_embeds,
interleave_mm_strings=self.interleave_mm_strings,
media_io_kwargs=self.media_io_kwargs,
skip_mm_profiling=self.skip_mm_profiling,
config_format=self.config_format,
mm_processor_kwargs=self.mm_processor_kwargs,
@ -1301,6 +1298,7 @@ class EngineArgs:
override_attention_dtype=self.override_attention_dtype,
logits_processors=self.logits_processors,
video_pruning_rate=self.video_pruning_rate,
io_processor_plugin=self.io_processor_plugin,
)
def validate_tensorizer_args(self):
@ -1396,25 +1394,9 @@ class EngineArgs:
)
model_config = self.create_model_config()
renderer_config = RendererConfig(
model_config=model_config,
tokenizer=self.tokenizer or "",
tokenizer_mode=self.tokenizer_mode,
tokenizer_revision=self.tokenizer_revision,
skip_tokenizer_init=self.skip_tokenizer_init,
io_processor_plugin=self.io_processor_plugin,
media_io_kwargs=self.media_io_kwargs,
allowed_local_media_path=self.allowed_local_media_path,
allowed_media_domains=self.allowed_media_domains,
)
model_config.recalculate_max_model_len(
model_config.original_max_model_len,
tokenizer=renderer_config.tokenizer,
tokenizer_revision=renderer_config.tokenizer_revision,
)
self.model = model_config.model
self.tokenizer = model_config.tokenizer
self._check_feature_supported(model_config)
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
self._set_default_max_num_seqs_and_batched_tokens_args(
@ -1786,7 +1768,6 @@ class EngineArgs:
)
config = VllmConfig(
model_config=model_config,
renderer_config=renderer_config,
cache_config=cache_config,
parallel_config=parallel_config,
scheduler_config=scheduler_config,

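The tokenizer flags are plain EngineArgs/ModelConfig fields again, so they register under the ModelConfig argument group; a hedged sketch of the programmatic equivalent (the model name is a placeholder):
from vllm.engine.arg_utils import EngineArgs
# Assumed flow: --tokenizer, --tokenizer-mode, --tokenizer-revision,
# --skip-tokenizer-init and --io-processor-plugin map onto these fields and are
# materialized by create_model_config().
engine_args = EngineArgs(
    model="facebook/opt-125m",
    tokenizer=None,            # defaults to the model path
    tokenizer_mode="auto",
    skip_tokenizer_init=False,
)
model_config = engine_args.create_model_config()
print(model_config.tokenizer)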
View File

@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
from collections.abc import AsyncGenerator, Iterable, Mapping
from typing import Any
from vllm.config import ModelConfig, RendererConfig, VllmConfig
from vllm.config import ModelConfig, VllmConfig
from vllm.inputs.data import PromptType
from vllm.lora.request import LoRARequest
from vllm.outputs import PoolingRequestOutput, RequestOutput
@ -22,7 +22,6 @@ class EngineClient(ABC):
"""Protocol class for Clients to Engine"""
vllm_config: VllmConfig
renderer_config: RendererConfig
model_config: ModelConfig
input_processor: InputProcessor
io_processor: IOProcessor | None

View File

@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
from typing_extensions import Required, TypedDict
from vllm import envs
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.model_executor.models import SupportsMultiModal
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
def _try_get_processor_chat_template(
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
*,
trust_remote_code: bool,
model_config: ModelConfig,
) -> str | None:
cache_key = (tokenizer.name_or_path, trust_remote_code)
cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
if cache_key in _PROCESSOR_CHAT_TEMPLATES:
return _PROCESSOR_CHAT_TEMPLATES[cache_key]
@ -467,7 +466,7 @@ def _try_get_processor_chat_template(
PreTrainedTokenizerFast,
ProcessorMixin,
),
trust_remote_code=trust_remote_code,
trust_remote_code=model_config.trust_remote_code,
)
if (
isinstance(processor, ProcessorMixin)
@ -500,10 +499,7 @@ def resolve_hf_chat_template(
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
if tools is None:
chat_template = _try_get_processor_chat_template(
tokenizer,
trust_remote_code=model_config.trust_remote_code,
)
chat_template = _try_get_processor_chat_template(tokenizer, model_config)
if chat_template is not None:
return chat_template
@ -517,10 +513,10 @@ def resolve_hf_chat_template(
exc_info=True,
)
# 4th priority: Predefined fallbacks]
# 4th priority: Predefined fallbacks
path = get_chat_template_fallback_path(
model_type=model_config.hf_config.model_type,
tokenizer_name_or_path=tokenizer.name_or_path,
tokenizer_name_or_path=model_config.tokenizer,
)
if path is not None:
logger.info_once(
@ -542,14 +538,14 @@ def _resolve_chat_template_content_format(
tools: list[dict[str, Any]] | None,
tokenizer: TokenizerLike | None,
*,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
hf_chat_template = resolve_hf_chat_template(
tokenizer,
chat_template=chat_template,
tools=tools,
model_config=renderer_config.model_config,
model_config=model_config,
)
else:
hf_chat_template = None
@ -599,7 +595,7 @@ def resolve_chat_template_content_format(
given_format: ChatTemplateContentFormatOption,
tokenizer: TokenizerLike | None,
*,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> _ChatTemplateContentFormat:
if given_format != "auto":
return given_format
@ -608,7 +604,7 @@ def resolve_chat_template_content_format(
chat_template,
tools,
tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
_log_chat_template_content_format(
@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
maximum per prompt.
"""
def __init__(self, renderer_config: RendererConfig):
def __init__(self, model_config: ModelConfig):
super().__init__()
self._renderer_config = renderer_config
self._model_config = model_config
self._items_by_modality = defaultdict[str, list[_T | None]](list)
self._uuids_by_modality = defaultdict[str, list[str | None]](list)
@property
def renderer_config(self) -> RendererConfig:
return self._renderer_config
def model_config(self) -> ModelConfig:
return self._model_config
@cached_property
def model_cls(self) -> type[SupportsMultiModal]:
from vllm.model_executor.model_loader import get_model_cls
model_cls = get_model_cls(self.renderer_config.model_config)
model_cls = get_model_cls(self.model_config)
return cast(type[SupportsMultiModal], model_cls)
@property
def allowed_local_media_path(self):
return self._renderer_config.allowed_local_media_path
return self._model_config.allowed_local_media_path
@property
def allowed_media_domains(self):
return self._renderer_config.allowed_media_domains
return self._model_config.allowed_media_domains
@property
def mm_registry(self):
@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
@cached_property
def mm_processor(self):
return self.mm_registry.create_processor(self.renderer_config)
return self.mm_registry.create_processor(self.model_config)
def add(
self,
@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
super().__init__()
self._tracker = tracker
multimodal_config = self._tracker.model_config.multimodal_config
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
envs.VLLM_MEDIA_CONNECTOR,
media_io_kwargs=self.renderer_config.media_io_kwargs,
media_io_kwargs=media_io_kwargs,
allowed_local_media_path=tracker.allowed_local_media_path,
allowed_media_domains=tracker.allowed_media_domains,
)
@property
def renderer_config(self) -> RendererConfig:
return self._tracker.renderer_config
@property
def model_config(self) -> ModelConfig:
return self.renderer_config.model_config
return self._tracker.model_config
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
image = self._connector.fetch_image(image_url) if image_url else None
@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
super().__init__()
self._tracker = tracker
multimodal_config = self._tracker.model_config.multimodal_config
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
envs.VLLM_MEDIA_CONNECTOR,
media_io_kwargs=self.renderer_config.media_io_kwargs,
media_io_kwargs=media_io_kwargs,
allowed_local_media_path=tracker.allowed_local_media_path,
allowed_media_domains=tracker.allowed_media_domains,
)
@property
def renderer_config(self) -> RendererConfig:
return self._tracker.renderer_config
@property
def model_config(self) -> ModelConfig:
return self.renderer_config.model_config
return self._tracker.model_config
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
image_coro = self._connector.fetch_image_async(image_url) if image_url else None
@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
def parse_chat_messages(
messages: list[ChatCompletionMessageParam],
renderer_config: RendererConfig,
model_config: ModelConfig,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
MultiModalDataDict | None,
MultiModalUUIDDict | None,
]:
model_config = renderer_config.model_config
conversation: list[ConversationMessage] = []
mm_tracker = MultiModalItemTracker(renderer_config)
mm_tracker = MultiModalItemTracker(model_config)
for msg in messages:
sub_messages = _parse_chat_message_content(
@ -1644,17 +1635,15 @@ def parse_chat_messages(
def parse_chat_messages_futures(
messages: list[ChatCompletionMessageParam],
renderer_config: RendererConfig,
model_config: ModelConfig,
content_format: _ChatTemplateContentFormat,
) -> tuple[
list[ConversationMessage],
Awaitable[MultiModalDataDict | None],
MultiModalUUIDDict | None,
]:
model_config = renderer_config.model_config
conversation: list[ConversationMessage] = []
mm_tracker = AsyncMultiModalItemTracker(renderer_config)
mm_tracker = AsyncMultiModalItemTracker(model_config)
for msg in messages:
sub_messages = _parse_chat_message_content(
@ -1759,14 +1748,14 @@ def apply_hf_chat_template(
chat_template: str | None,
tools: list[dict[str, Any]] | None,
*,
renderer_config: RendererConfig,
model_config: ModelConfig,
**kwargs: Any,
) -> str:
hf_chat_template = resolve_hf_chat_template(
tokenizer,
chat_template=chat_template,
tools=tools,
model_config=renderer_config.model_config,
model_config=model_config,
)
if hf_chat_template is None:
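
For downstream callers, the net effect of the `vllm.entrypoints.chat_utils` hunks above is that chat parsing and templating take a `ModelConfig` directly again. Below is a minimal sketch of the post-revert call pattern; the argument order and keyword names follow the call sites shown in this diff, while the model name, the use of `AutoTokenizer`, and the `add_generation_prompt` flag are illustrative assumptions rather than anything the commit prescribes.

```python
from transformers import AutoTokenizer

from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
    apply_hf_chat_template,
    parse_chat_messages,
    resolve_chat_template_content_format,
)

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # illustrative choice of model
model_config = ModelConfig(model=model_name)  # resolves the HF config for the repo
tokenizer = AutoTokenizer.from_pretrained(model_name)

messages = [{"role": "user", "content": "Hello!"}]

# Decide whether the chat template expects plain strings or OpenAI-style parts.
content_format = resolve_chat_template_content_format(
    None,    # chat_template: fall back to the tokenizer's own template
    None,    # tools
    "auto",  # chat_template_content_format
    tokenizer,
    model_config=model_config,
)

# Parse the messages and collect any multimodal data they reference.
conversation, mm_data, mm_uuids = parse_chat_messages(
    messages,
    model_config,
    content_format=content_format,
)

# Render the final prompt string with the HF chat template.
prompt = apply_hf_chat_template(
    tokenizer=tokenizer,
    conversation=conversation,
    chat_template=None,
    tools=None,
    model_config=model_config,
    add_generation_prompt=True,  # assumed kwarg, forwarded to the HF template
)
```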

View File

@ -29,8 +29,8 @@ from vllm.config.model import (
HfOverrides,
ModelDType,
RunnerOption,
TokenizerMode,
)
from vllm.config.renderer import TokenizerMode
from vllm.engine.arg_utils import EngineArgs
from vllm.entrypoints.chat_utils import (
ChatCompletionMessageParam,
@ -343,7 +343,6 @@ class LLM:
logger.info("Supported tasks: %s", supported_tasks)
self.supported_tasks = supported_tasks
self.renderer_config = self.llm_engine.renderer_config
self.model_config = self.llm_engine.model_config
self.input_processor = self.llm_engine.input_processor
self.io_processor = self.llm_engine.io_processor
@ -809,13 +808,13 @@ class LLM:
list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
tokenizer = self.get_tokenizer()
renderer_config = self.renderer_config
model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format(
chat_template,
tools,
chat_template_content_format,
tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
_chat_template_kwargs: dict[str, Any] = dict(
@ -834,7 +833,7 @@ class LLM:
# the chat message parsing for it.
conversation, mm_data, mm_uuids = parse_chat_messages(
msgs,
renderer_config,
model_config,
content_format=resolved_content_format,
)
@ -848,7 +847,7 @@ class LLM:
prompt_str = apply_hf_chat_template(
tokenizer=tokenizer,
conversation=conversation,
renderer_config=renderer_config,
model_config=model_config,
**_chat_template_kwargs,
)
# Special tokens are already included in chat templates so
@ -1291,7 +1290,6 @@ class LLM:
lora_request: list[LoRARequest] | LoRARequest | None = None,
tokenization_kwargs: dict[str, Any] | None = None,
) -> list[ScoringRequestOutput]:
renderer_config = self.renderer_config
model_config = self.model_config
if isinstance(tokenizer, MistralTokenizer):
@ -1319,7 +1317,7 @@ class LLM:
for q, d in input_pairs:
_, engine_prompt = get_score_prompt(
renderer_config=renderer_config,
model_config=model_config,
data_1=q,
data_2=d,
tokenizer=tokenizer,

View File

@ -1099,7 +1099,7 @@ async def init_app_state(
logger.info("Supported tasks: %s", supported_tasks)
resolved_chat_template = await process_chat_template(
args.chat_template, engine_client, vllm_config.renderer_config
args.chat_template, engine_client, vllm_config.model_config
)
if args.tool_server == "demo":

View File

@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
try:
lora_request = self._maybe_get_adapters(request)
if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = await self.engine_client.get_tokenizer()

View File

@ -291,7 +291,6 @@ class OpenAIServing:
self.input_processor = self.models.input_processor
self.io_processor = self.models.io_processor
self.renderer_config = self.models.renderer_config
self.model_config = self.models.model_config
self.max_model_len = self.model_config.max_model_len
@ -1101,18 +1100,18 @@ class OpenAIServing:
Sequence[RequestPrompt],
list[EngineTokensPrompt],
]:
renderer_config = self.renderer_config
model_config = self.model_config
resolved_content_format = resolve_chat_template_content_format(
chat_template,
tool_dicts,
chat_template_content_format,
tokenizer,
renderer_config=renderer_config,
model_config=model_config,
)
conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
messages,
renderer_config,
model_config,
content_format=resolved_content_format,
)
@ -1139,14 +1138,14 @@ class OpenAIServing:
request_prompt = tokenizer.apply_chat_template(
conversation=conversation,
messages=messages,
model_config=renderer_config.model_config,
model_config=model_config,
**_chat_template_kwargs,
)
else:
request_prompt = apply_hf_chat_template(
tokenizer=tokenizer,
conversation=conversation,
renderer_config=renderer_config,
model_config=model_config,
**_chat_template_kwargs,
)

View File

@ -71,7 +71,6 @@ class OpenAIServingModels:
self.input_processor = self.engine_client.input_processor
self.io_processor = self.engine_client.io_processor
self.renderer_config = self.engine_client.renderer_config
self.model_config = self.engine_client.model_config
self.max_model_len = self.model_config.max_model_len

View File

@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
self.task_type = task_type
self.asr_config = self.model_cls.get_speech_to_text_config(
self.renderer_config, task_type
self.model_config, task_type
)
self.enable_force_include_usage = enable_force_include_usage
@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
self.tokenizer = cast(
PreTrainedTokenizerBase,
get_tokenizer(
tokenizer_name=self.renderer_config.tokenizer,
tokenizer_mode=self.renderer_config.tokenizer_mode,
tokenizer_name=self.model_config.tokenizer,
tokenizer_mode=self.model_config.tokenizer_mode,
),
)
@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
prompt = self.model_cls.get_generation_prompt(
audio=chunk,
stt_config=self.asr_config,
renderer_config=self.renderer_config,
model_config=self.model_config,
language=language,
task_type=self.task_type,
request_prompt=request.prompt,
@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
if res.prompt_token_ids is not None:
num_prompt_tokens = len(res.prompt_token_ids)
if audio_tokens := self.model_cls.get_num_audio_tokens(
audio_duration_s, self.asr_config, self.renderer_config
audio_duration_s, self.asr_config, self.model_config
):
num_prompt_tokens += audio_tokens

View File

@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
try:
lora_request = self._maybe_get_adapters(request)
if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = await self.engine_client.get_tokenizer()

View File

@ -160,8 +160,10 @@ class ServingScores(OpenAIServing):
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
) -> tuple[str, TokensPrompt]:
model_config = self.model_config
full_prompt, engine_prompt = get_score_prompt(
renderer_config=self.renderer_config,
model_config=model_config,
data_1=data_1,
data_2=data_2,
tokenizer=tokenizer,

View File

@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
from torch.nn import CosineSimilarity
from typing_extensions import Required, TypedDict
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.entrypoints.chat_utils import (
BaseMultiModalItemTracker,
ChatCompletionContentPartImageEmbedsParam,
@ -88,9 +88,9 @@ def _validate_score_input_lens(
def parse_score_data(
data_1: str | ScoreContentPartParam,
data_2: str | ScoreContentPartParam,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> tuple[str, str, MultiModalDataDict | None]:
mm_tracker = MultiModalItemTracker(renderer_config)
mm_tracker = MultiModalItemTracker(model_config)
content_1 = _parse_score_content(data_1, mm_tracker)
content_2 = _parse_score_content(data_2, mm_tracker)
@ -176,7 +176,7 @@ def post_process_tokens(
def get_score_prompt(
renderer_config: RendererConfig,
model_config: ModelConfig,
tokenizer: TokenizerLike,
tokenization_kwargs: dict[str, Any],
data_1: str | ScoreContentPartParam,
@ -185,14 +185,11 @@ def get_score_prompt(
prompt_1, prompt_2, mm_data = parse_score_data(
data_1,
data_2,
renderer_config,
model_config,
)
from vllm.model_executor.model_loader import get_model_cls
model_config = renderer_config.model_config
model = get_model_cls(model_config)
if supports_score_template(model):
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)

View File

@ -13,7 +13,7 @@ from fastapi import Request
from fastapi.responses import JSONResponse, StreamingResponse
from starlette.background import BackgroundTask, BackgroundTasks
from vllm.config import RendererConfig
from vllm.config import ModelConfig
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.protocol import EngineClient
from vllm.entrypoints.chat_utils import (
@ -288,7 +288,7 @@ def process_lora_modules(
async def process_chat_template(
args_chat_template: Path | str | None,
engine_client: EngineClient,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> str | None:
resolved_chat_template = load_chat_template(args_chat_template)
if resolved_chat_template is not None:
@ -305,7 +305,7 @@ async def process_chat_template(
tokenizer=tokenizer,
chat_template=None,
tools=None,
model_config=renderer_config.model_config,
model_config=model_config,
)
if hf_chat_template != resolved_chat_template:
@ -314,6 +314,6 @@ async def process_chat_template(
"It is different from official chat template '%s'. "
"This discrepancy may lead to performance degradation.",
resolved_chat_template,
renderer_config.model_config.model,
model_config.model,
)
return resolved_chat_template

View File

@ -6,7 +6,7 @@ from typing import Any, cast
from typing_extensions import assert_never
from vllm.config import RendererConfig
from vllm.config import ModelConfig
from vllm.logger import init_logger
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
from vllm.multimodal.cache import BaseMultiModalProcessorCache
@ -45,15 +45,14 @@ logger = init_logger(__name__)
class InputPreprocessor:
def __init__(
self,
renderer_config: RendererConfig,
model_config: ModelConfig,
tokenizer: TokenizerLike | None,
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
) -> None:
super().__init__()
self.renderer_config = renderer_config
self.model_config = renderer_config.model_config
self.model_config = model_config
self.tokenizer = tokenizer
self.mm_registry = mm_registry
self.mm_processor_cache = mm_processor_cache
@ -232,7 +231,7 @@ class InputPreprocessor:
def _get_mm_processor(self) -> BaseMultiModalProcessor:
if not hasattr(self, "_mm_processor"):
self._mm_processor = self.mm_registry.create_processor(
self.renderer_config,
self.model_config,
tokenizer=self.tokenizer,
cache=self.mm_processor_cache,
)

View File

@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
renderer_config = model.vllm_config.renderer_config
model_config = model.vllm_config.model_config
quant_config = model.vllm_config.quant_config
text_config = model.config.get_text_config()
@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
from vllm.tokenizers import get_tokenizer
tokenizer = get_tokenizer(
renderer_config.tokenizer,
revision=renderer_config.tokenizer_revision,
tokenizer_mode=renderer_config.tokenizer_mode,
trust_remote_code=renderer_config.trust_remote_code,
model_config.tokenizer,
revision=model_config.tokenizer_revision,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
)
false_id = tokenizer.convert_tokens_to_ids(tokens[0])
@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
renderer_config = model.vllm_config.renderer_config
model_config = model.vllm_config.model_config
quant_config = model.vllm_config.quant_config
text_config = model.config.get_text_config()
@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
from vllm.tokenizers import get_tokenizer
tokenizer = get_tokenizer(
renderer_config.tokenizer,
revision=renderer_config.tokenizer_revision,
tokenizer_mode=renderer_config.tokenizer_mode,
trust_remote_code=renderer_config.trust_remote_code,
model_config.tokenizer,
revision=model_config.tokenizer_revision,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
)
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]

View File

@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.projector_config = config.projector_config
self.text_config = config.text_config
renderer_config = vllm_config.renderer_config
tokenizer = cached_tokenizer_from_config(renderer_config)
model_config = vllm_config.model_config
tokenizer = cached_tokenizer_from_config(model_config)
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
self.sam_model = build_sam_vit_b()

View File

@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
self.projector_config = config.projector_config
self.text_config = config.text_config
renderer_config = vllm_config.renderer_config
tokenizer = cached_tokenizer_from_config(renderer_config)
model_config = vllm_config.model_config
tokenizer = cached_tokenizer_from_config(model_config)
self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
self.vision = self._init_vision_module(

View File

@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
)
from transformers.models.siglip import SiglipImageProcessorFast
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
language: Optional[str],
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@ -798,9 +798,7 @@ class Gemma3nForConditionalGeneration(
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
task_type: str,
cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
return SpeechToTextConfig(
# Let's set this to 30 as suggested in the docs for now, although

View File

@ -34,7 +34,7 @@ import torch.nn.functional as F
from torch import nn
from transformers import BatchFeature, PretrainedConfig
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs.data import PromptType
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
def get_generation_prompt(
cls,
audio: np.ndarray,
renderer_config: RendererConfig,
model_config: ModelConfig,
stt_config: SpeechToTextConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
else:
raise ValueError(f"Unsupported task type {task_type}")
tokenizer = cached_tokenizer_from_config(renderer_config)
tokenizer = cached_tokenizer_from_config(model_config)
chat = [dict(role="user", content=user_prompt)]
prompt = tokenizer.apply_chat_template(
chat,
@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> int | None:
"""Get the number of audio tokens for an audio duration in sec."""
processor = cached_processor_from_config(renderer_config)
processor = cached_processor_from_config(model_config)
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
proj_win_size = processor.audio_processor.projector_window_size
ds_rate = processor.audio_processor.projector_downsample_rate
@ -903,9 +903,7 @@ class GraniteSpeechForConditionalGeneration(
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
task_type: str,
cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
"""Get the stt config for this model."""
# Default settings are reasonable for this model and we don't currently

View File

@ -6,7 +6,7 @@ import numpy as np
import torch
import torch.nn as nn
from vllm.config import RendererConfig, VllmConfig
from vllm.config import ModelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.model_executor.layers.pooler import (
DispatchPooler,
@ -29,12 +29,12 @@ logger = init_logger(__name__)
class GritLMMeanPool(nn.Module):
"""As `MeanPool`, but only includes non-instruction tokens."""
def __init__(self, renderer_config: RendererConfig):
def __init__(self, model_config: ModelConfig):
super().__init__()
self.renderer_config = renderer_config
self.model_config = model_config
tokenizer = cached_tokenizer_from_config(self.renderer_config)
tokenizer = cached_tokenizer_from_config(self.model_config)
# Collect the tokens needed for pattern matching.
# "▁<" is different from "_<". The former uses "▁" to indicate that
@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
class GritLMPooler(Pooler):
def __init__(self, renderer_config: RendererConfig):
def __init__(self, model_config: ModelConfig):
super().__init__()
self.pooling = GritLMMeanPool(renderer_config)
self.pooling = GritLMMeanPool(model_config)
self.head = PoolerHead(PoolerNormalize())
def get_supported_tasks(self) -> Set[PoolingTask]:
@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
self.pooler = DispatchPooler(
{
"token_embed": Pooler.for_token_embed(pooler_config),
"embed": GritLMPooler(vllm_config.renderer_config),
"embed": GritLMPooler(vllm_config.model_config),
}
)

View File

@ -19,7 +19,7 @@ from torch import Tensor
from transformers.models.whisper.tokenization_whisper import LANGUAGES
from typing_extensions import Self, TypeIs
from vllm.config import RendererConfig, SpeechToTextConfig
from vllm.config import ModelConfig, SpeechToTextConfig
from vllm.inputs import TokensPrompt
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
cls,
audio: np.ndarray,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
@ -930,9 +930,7 @@ class SupportsTranscription(Protocol):
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
task_type: Literal["transcribe", "translate"],
cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
) -> SpeechToTextConfig:
"""Get the speech to text config for the ASR model."""
...
@ -942,7 +940,7 @@ class SupportsTranscription(Protocol):
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> int | None:
"""
Map from audio duration to number of audio tokens produced by the ASR

View File

@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
hf_processor.video_processor = cached_video_processor_from_config(
self.ctx.renderer_config,
self.ctx.model_config,
processor_cls=InternVLVideoProcessor,
size=hf_processor.image_processor.size,
**kwargs,

View File

@ -1169,17 +1169,16 @@ class NemotronH_Nano_VL_V2(
self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
self.config = config
self.model_config = vllm_config.model_config
# Pre-tokenize special tokens for video processing
# to avoid repeated tokenization
self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
self._img_start_token_ids = self._tokenizer.encode(
tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
self._img_start_token_ids = tokenizer.encode(
IMG_START, add_special_tokens=False
)
self._img_end_token_ids = self._tokenizer.encode(
IMG_END, add_special_tokens=False
)
self._img_context_token_ids = self._tokenizer.encode(
self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
self._img_context_token_ids = tokenizer.encode(
IMG_CONTEXT, add_special_tokens=False
)
@ -1365,7 +1364,7 @@ class NemotronH_Nano_VL_V2(
input_embeds for the LLM.
"""
device = video_embeddings.device
tokenizer = self._tokenizer
tokenizer = cached_tokenizer_from_config(self.model_config)
# Generate video replacement token IDs using get_video_repl
# This tokenizes each frame separator independently, then uses pre-tokenized

View File

@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
def get_image_processor(self, **kwargs: object):
return cached_image_processor_from_config(
self.ctx.renderer_config,
self.ctx.model_config,
**kwargs,
)

View File

@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
class PixtralProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self) -> MistralTokenizer:
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
if not isinstance(tokenizer, MistralTokenizer):
raise ValueError("This model requires `--tokenizer-mode mistral`")

View File

@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
from transformers import BatchFeature, TensorType, WhisperConfig
from transformers.tokenization_utils_base import TextInput
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.inputs.data import PromptType
from vllm.logger import init_logger
@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
class VoxtralProcessingInfo(BaseProcessingInfo):
def get_tokenizer(self) -> MistralTokenizer:
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
if not isinstance(tokenizer, MistralTokenizer):
raise ValueError("This model requires `--tokenizer-mode mistral`")
@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
# update quant config to so that ignored module and target module names
# match the vLLM model names
@ -450,11 +450,9 @@ class VoxtralForConditionalGeneration(
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
task_type: str,
cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
tokenizer = cached_tokenizer_from_config(renderer_config)
tokenizer = cached_tokenizer_from_config(model_config)
audio_config = tokenizer.instruct.audio_encoder.audio_config
max_audio_clip_s = audio_config.chunk_length_s
sample_rate = audio_config.sampling_rate
@ -470,17 +468,17 @@ class VoxtralForConditionalGeneration(
def get_generation_prompt(
cls,
audio: np.ndarray,
renderer_config: RendererConfig, # not needed here
model_config: ModelConfig,
stt_config: SpeechToTextConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
request_prompt: str,
to_language: str | None,
) -> PromptType:
tokenizer = cached_tokenizer_from_config(renderer_config)
tokenizer = cached_tokenizer_from_config(model_config)
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
req = TranscriptionRequest(
model=renderer_config.model_config.model,
model=model_config.model,
audio=RawAudio.from_audio(audio),
language=language,
)
@ -496,14 +494,14 @@ class VoxtralForConditionalGeneration(
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> int | None:
"""
Map from audio duration to number of audio tokens produced by the ASR
model, without running a forward pass.
This is used for estimating the amount of processing for this audio.
"""
tokenizer = cached_tokenizer_from_config(renderer_config)
tokenizer = cached_tokenizer_from_config(model_config)
adapter = VoxtralProcessorAdapter(tokenizer)
return adapter.get_num_audio_tokens(
int(audio_duration_s * stt_config.sample_rate)

View File

@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
from vllm.attention.backends.abstract import AttentionType
from vllm.attention.layer import Attention, MultiHeadAttention
from vllm.attention.layers.cross_attention import CrossAttention
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
from vllm.config.multimodal import BaseDummyOptions
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.inputs.data import PromptType
@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
def get_generation_prompt(
cls,
audio: np.ndarray,
renderer_config: RendererConfig, # not needed here
model_config: ModelConfig, # not needed here
stt_config: SpeechToTextConfig,
language: str | None,
task_type: Literal["transcribe", "translate"],
@ -847,11 +847,9 @@ class WhisperForConditionalGeneration(
@classmethod
def get_speech_to_text_config(
cls,
renderer_config: RendererConfig,
task_type: str,
cls, model_config: ModelConfig, task_type: str
) -> SpeechToTextConfig:
processor = cached_processor_from_config(renderer_config)
processor = cached_processor_from_config(model_config)
return SpeechToTextConfig(
max_audio_clip_s=processor.feature_extractor.chunk_length,
@ -863,9 +861,9 @@ class WhisperForConditionalGeneration(
cls,
audio_duration_s: float,
stt_config: SpeechToTextConfig,
renderer_config: RendererConfig,
model_config: ModelConfig,
) -> int | None:
processor = cached_processor_from_config(renderer_config)
processor = cached_processor_from_config(model_config)
hop_length = processor.feature_extractor.hop_length
assert hop_length is not None
# NOTE(NickLucche) user can't pass encoder

View File

@ -31,7 +31,7 @@ from .inputs import (
)
if TYPE_CHECKING:
from vllm.config import ModelConfig, RendererConfig, VllmConfig
from vllm.config import ModelConfig, VllmConfig
from .processing import ResolvedPromptUpdate
from .registry import MultiModalRegistry
@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
def _enable_processor_cache(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
mm_registry: "MultiModalRegistry",
) -> bool:
if not mm_registry.supports_multimodal_inputs(renderer_config):
if not mm_registry.supports_multimodal_inputs(model_config):
return False
mm_config = renderer_config.model_config.get_multimodal_config()
mm_config = model_config.get_multimodal_config()
return mm_config.mm_processor_cache_gb > 0
@ -599,7 +599,7 @@ def processor_cache_from_config(
"""Return a `BaseMultiModalProcessorCache`, if enabled."""
model_config = vllm_config.model_config
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
if not _enable_processor_cache(model_config, mm_registry):
return None
if not _enable_ipc_cache(vllm_config):
@ -611,14 +611,14 @@ def processor_cache_from_config(
def processor_only_cache_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
mm_registry: "MultiModalRegistry",
):
"""Return a `MultiModalProcessorOnlyCache`, if enabled."""
if not _enable_processor_cache(renderer_config, mm_registry):
if not _enable_processor_cache(model_config, mm_registry):
return None
return MultiModalProcessorOnlyCache(renderer_config.model_config)
return MultiModalProcessorOnlyCache(model_config)
class BaseMultiModalReceiverCache(
@ -787,7 +787,7 @@ def engine_receiver_cache_from_config(
"""
model_config = vllm_config.model_config
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
if not _enable_processor_cache(model_config, mm_registry):
return None
if not _enable_ipc_cache(vllm_config):
@ -809,7 +809,9 @@ def worker_receiver_cache_from_config(
Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
mm_processor_cache_type=="shm".
"""
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
model_config = vllm_config.model_config
if not _enable_processor_cache(model_config, mm_registry):
return None
if not _enable_ipc_cache(vllm_config):
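
Taken together, the cache helpers in this file consume either the full `VllmConfig` or a bare `ModelConfig` after the revert. A small sketch of how a caller might wire them up; the wrapper function is illustrative, and the `vllm.multimodal.cache` import path is inferred from the relative imports in this file.

```python
from vllm.config import VllmConfig
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.cache import (
    processor_cache_from_config,
    processor_only_cache_from_config,
)


def build_mm_caches(vllm_config: VllmConfig):
    """Return (engine-side cache, processor-only cache); either may be None."""
    # Full config: may select an IPC-backed cache depending on _enable_ipc_cache.
    engine_cache = processor_cache_from_config(vllm_config, MULTIMODAL_REGISTRY)

    # ModelConfig only: plain in-process cache, or None when caching is disabled.
    local_cache = processor_only_cache_from_config(
        vllm_config.model_config, MULTIMODAL_REGISTRY
    )
    return engine_cache, local_cache
```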

View File

@ -23,7 +23,7 @@ import torch
from typing_extensions import TypeVar, assert_never
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
@ -53,7 +53,7 @@ if TYPE_CHECKING:
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from .cache import BaseMultiModalProcessorCache
from .profiling import BaseDummyInputsBuilder
@ -63,7 +63,6 @@ else:
ProcessorMixin = object
ModelConfig = object
RendererConfig = object
BaseMultiModalProcessorCache = object
@ -946,29 +945,12 @@ class InputProcessingContext:
modify the inputs.
"""
renderer_config: RendererConfig
"""The configuration of the renderer."""
model_config: ModelConfig
"""The configuration of the model."""
tokenizer: TokenizerLike | None
"""The tokenizer used to tokenize the inputs."""
@classmethod
def from_config(
cls,
renderer_config: RendererConfig,
*,
tokenizer: TokenizerLike | None = None,
):
if tokenizer is None and not renderer_config.skip_tokenizer_init:
tokenizer = cached_tokenizer_from_config(renderer_config)
return cls(renderer_config, tokenizer)
@property
def model_config(self) -> ModelConfig:
"""The configuration of the model."""
return self.renderer_config.model_config
def get_tokenizer(self) -> TokenizerLike:
if self.tokenizer is None:
raise ValueError(
@ -1065,7 +1047,7 @@ class InputProcessingContext:
typ = ProcessorMixin
return cached_processor_from_config(
self.renderer_config,
self.model_config,
processor_cls=typ,
tokenizer=self.tokenizer,
**kwargs,

View File

@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from .cache import BaseMultiModalProcessorCache
from .processing import (
@ -22,7 +22,7 @@ from .profiling import (
)
if TYPE_CHECKING:
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.model_executor.models.interfaces import SupportsMultiModal
logger = init_logger(__name__)
@ -114,18 +114,17 @@ class MultiModalRegistry:
return mm_options if len(mm_options) > 0 else None
def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool:
def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
"""
Checks if the model supports multimodal inputs.
Returns True if the model is multimodal with any non-zero supported
modalities, otherwise returns False, effectively running in
text-only mode.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return False
info = self._create_processing_info(renderer_config, tokenizer=None)
info = self._create_processing_info(model_config, tokenizer=None)
supported_modalities = info.get_supported_mm_limits()
mm_config = model_config.get_multimodal_config()
@ -145,7 +144,7 @@ class MultiModalRegistry:
def get_max_tokens_per_item_by_modality(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
cache: BaseMultiModalProcessorCache | None = None,
profiler_limits: Mapping[str, int] | None = None,
@ -154,11 +153,10 @@ class MultiModalRegistry:
Get the maximum number of tokens per data item from each modality based
on underlying model configuration.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return {}
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
seq_len = model_config.max_model_len
@ -173,7 +171,7 @@ class MultiModalRegistry:
def get_mm_limits_per_prompt(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
cache: BaseMultiModalProcessorCache | None = None,
) -> Mapping[str, int]:
@ -181,11 +179,10 @@ class MultiModalRegistry:
Get the maximum number of multi-modal input instances for each modality
that are allowed per prompt for a model class.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return {}
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
return profiler.get_mm_limits()
@ -231,21 +228,30 @@ class MultiModalRegistry:
assert hasattr(model_cls, "_processor_factory")
return cast("SupportsMultiModal", model_cls)
def _create_processing_ctx(
self,
model_config: "ModelConfig",
tokenizer: TokenizerLike | None = None,
) -> InputProcessingContext:
if tokenizer is None and not model_config.skip_tokenizer_init:
tokenizer = cached_tokenizer_from_config(model_config)
return InputProcessingContext(model_config, tokenizer)
def _create_processing_info(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
tokenizer: TokenizerLike | None = None,
) -> BaseProcessingInfo:
model_cls = self._get_model_cls(renderer_config.model_config)
model_cls = self._get_model_cls(model_config)
factories = model_cls._processor_factory
ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
ctx = self._create_processing_ctx(model_config, tokenizer)
return factories.info(ctx)
def create_processor(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
tokenizer: TokenizerLike | None = None,
cache: BaseMultiModalProcessorCache | None = None,
@ -253,19 +259,19 @@ class MultiModalRegistry:
"""
Create a multi-modal processor for a specific model and tokenizer.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model")
model_cls = self._get_model_cls(model_config)
factories = model_cls._processor_factory
ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
ctx = self._create_processing_ctx(model_config, tokenizer)
return factories.build_processor(ctx, cache=cache)
def get_decoder_dummy_data(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
*,
@ -274,15 +280,15 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.
The model is identified by `renderer_config`.
The model is identified by `model_config`.
"""
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
# Extract configurable options from multimodal config.
# Only include modalities that use advanced option types so legacy
# count-only behavior remains unchanged.
mm_options = self._extract_mm_options(renderer_config.model_config)
mm_options = self._extract_mm_options(model_config)
dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
@ -298,7 +304,7 @@ class MultiModalRegistry:
def get_encoder_dummy_data(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
*,
@ -307,15 +313,15 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.
The model is identified by `renderer_config`.
The model is identified by `model_config`.
"""
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
# Extract configurable options from multimodal config.
# Only include modalities that use advanced option types so legacy
# count-only behavior remains unchanged.
mm_options = self._extract_mm_options(renderer_config.model_config)
mm_options = self._extract_mm_options(model_config)
dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
@ -330,15 +336,13 @@ class MultiModalRegistry:
return dummy_data
def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int:
def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
"""
Get the maximum length of the encoder input for encoder-decoder models.
"""
model_config = renderer_config.model_config
if not model_config.is_encoder_decoder:
return 0
max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config)
max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
if not max_tokens:
# TODO - this function assumes encoder-decoder models are
# multimodal. This will need to change when adding support for more
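
The registry-level picture after this revert: every public entry point here (`supports_multimodal_inputs`, `get_mm_limits_per_prompt`, `create_processor`, the dummy-data helpers) keys off `ModelConfig`, and the tokenizer is resolved internally by the new `_create_processing_ctx`. A minimal sketch; the wrapper function is illustrative and not part of the diff.

```python
from collections.abc import Mapping

from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY


def summarize_mm_support(model_config: ModelConfig) -> Mapping[str, int]:
    """Return per-prompt multimodal limits, or {} for text-only models."""
    if not MULTIMODAL_REGISTRY.supports_multimodal_inputs(model_config):
        return {}
    # The registry builds its own InputProcessingContext (and tokenizer)
    # from the ModelConfig, so no tokenizer has to be passed explicitly.
    return MULTIMODAL_REGISTRY.get_mm_limits_per_prompt(model_config)
```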

View File

@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike
if TYPE_CHECKING:
from vllm.config import RendererConfig
from vllm.config import ModelConfig
logger = init_logger(__name__)
@ -205,18 +205,18 @@ def get_tokenizer(
cached_get_tokenizer = lru_cache(get_tokenizer)
def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs):
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
return cached_get_tokenizer(
renderer_config.tokenizer,
tokenizer_mode=renderer_config.tokenizer_mode,
revision=renderer_config.tokenizer_revision,
trust_remote_code=renderer_config.trust_remote_code,
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
**kwargs,
)
def init_tokenizer_from_config(renderer_config: "RendererConfig"):
runner_type = renderer_config.model_config.runner_type
def init_tokenizer_from_config(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
@ -225,9 +225,9 @@ def init_tokenizer_from_config(renderer_config: "RendererConfig"):
assert_never(runner_type)
return get_tokenizer(
renderer_config.tokenizer,
tokenizer_mode=renderer_config.tokenizer_mode,
trust_remote_code=renderer_config.trust_remote_code,
revision=renderer_config.tokenizer_revision,
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=truncation_side,
)
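
With the revert in place, both tokenizer helpers read `tokenizer`, `tokenizer_mode`, `tokenizer_revision`, and `trust_remote_code` straight off `ModelConfig`. A sketch of the two entry points; the wrapper function is illustrative, and re-exporting `init_tokenizer_from_config` from `vllm.tokenizers` is assumed based on the imports seen elsewhere in this diff.

```python
from vllm.config import ModelConfig
from vllm.tokenizers import (
    cached_tokenizer_from_config,
    init_tokenizer_from_config,  # assumed to be exported alongside the cached helper
)


def load_tokenizers(model_config: ModelConfig):
    # Cached variant: memoized on tokenizer name, mode, and revision.
    shared = cached_tokenizer_from_config(model_config)
    # Fresh variant: additionally sets truncation_side based on runner_type.
    fresh = init_tokenizer_from_config(model_config)
    return shared, fresh
```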

View File

@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
if TYPE_CHECKING:
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
@ -233,18 +233,17 @@ def cached_get_processor_without_dynamic_kwargs(
def cached_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
**kwargs: Any,
) -> _P:
model_config = renderer_config.model_config
if is_gguf(model_config.model):
assert not is_gguf(renderer_config.tokenizer), (
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load processor."
)
model = renderer_config.tokenizer
revision = renderer_config.tokenizer_revision
model = model_config.tokenizer
revision = model_config.tokenizer_revision
else:
model = model_config.model
revision = model_config.revision
@ -298,11 +297,9 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor)
def cached_feature_extractor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
**kwargs: Any,
):
model_config = renderer_config.model_config
return cached_get_feature_extractor(
model_config.model,
revision=model_config.revision,
@ -351,17 +348,16 @@ cached_get_image_processor = lru_cache(get_image_processor)
def cached_image_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
**kwargs: Any,
):
model_config = renderer_config.model_config
if is_gguf(model_config.model):
assert not is_gguf(renderer_config.tokenizer), (
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load image processor."
)
model = renderer_config.tokenizer
revision = renderer_config.tokenizer_revision
model = model_config.tokenizer
revision = model_config.tokenizer_revision
else:
model = model_config.model
revision = model_config.revision
@ -415,12 +411,10 @@ cached_get_video_processor = lru_cache(get_video_processor)
def cached_video_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
processor_cls: type[_V] | None = None,
**kwargs: Any,
):
model_config = renderer_config.model_config
return cached_get_video_processor(
model_config.model,
revision=model_config.revision,
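
The processor helpers follow the same pattern: a bare `ModelConfig` carries both the model repo and, for GGUF checkpoints, the tokenizer repo to fall back to. A brief usage sketch; the wrapper function is illustrative, while the `vllm.transformers_utils.processor` import path matches the one used earlier in this diff.

```python
from vllm.config import ModelConfig
from vllm.transformers_utils.processor import (
    cached_image_processor_from_config,
    cached_processor_from_config,
)


def load_hf_processors(model_config: ModelConfig):
    # Both helpers resolve the repo from ModelConfig; for GGUF checkpoints they
    # fall back to model_config.tokenizer, as asserted in the hunks above.
    processor = cached_processor_from_config(model_config)
    image_processor = cached_image_processor_from_config(model_config)
    return processor, image_processor
```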

View File

@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry
from vllm.v1.request import Request
if TYPE_CHECKING:
from vllm.config import RendererConfig, SchedulerConfig
from vllm.config import ModelConfig, SchedulerConfig
logger = init_logger(__name__)
@ -250,7 +250,7 @@ class EncoderCacheManager:
def compute_encoder_budget(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
scheduler_config: "SchedulerConfig",
mm_registry: MultiModalRegistry,
) -> tuple[int, int]:
@ -263,9 +263,9 @@ def compute_encoder_budget(
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
"""
if mm_registry.supports_multimodal_inputs(renderer_config):
if mm_registry.supports_multimodal_inputs(model_config):
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
renderer_config
model_config
)
return compute_mm_encoder_budget(

View File

@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface):
# This can be changed when we make encoder cache for embedding caching
# across requests.
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
renderer_config=vllm_config.renderer_config,
model_config=vllm_config.model_config,
scheduler_config=vllm_config.scheduler_config,
mm_registry=mm_registry,
)

View File

@ -91,7 +91,6 @@ class AsyncLLM(EngineClient):
# Ensure we can serialize custom transformer configs
maybe_register_config_serialize_by_value()
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
@ -109,15 +108,15 @@ class AsyncLLM(EngineClient):
"enabling logging without default stat loggers."
)
if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.renderer_config)
tokenizer = init_tokenizer_from_config(self.model_config)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.renderer_config.io_processor_plugin,
self.model_config.io_processor_plugin,
)
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).

View File

@ -43,7 +43,6 @@ class InputProcessor:
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
@ -55,7 +54,7 @@ class InputProcessor:
self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
self.input_preprocessor = InputPreprocessor(
self.renderer_config,
self.model_config,
tokenizer,
mm_registry,
mm_processor_cache=self.mm_processor_cache,
@ -253,7 +252,7 @@ class InputProcessor:
if not params.structured_outputs or not self.structured_outputs_config:
return
if self.renderer_config.skip_tokenizer_init and params.structured_outputs:
if self.model_config.skip_tokenizer_init and params.structured_outputs:
raise ValueError(
"Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501
)
@ -583,7 +582,7 @@ class InputProcessor:
if prompt_type == "encoder" and model_config.is_multimodal_model:
mm_registry = self.input_preprocessor.mm_registry
mm_processor = mm_registry.create_processor(
self.renderer_config,
model_config,
tokenizer=tokenizer,
)
assert isinstance(mm_processor, EncDecMultiModalProcessor)

View File

@ -60,7 +60,6 @@ class LLMEngine:
) -> None:
self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
@ -84,15 +83,15 @@ class LLMEngine:
self.dp_group = None
self.should_execute_dummy_batch = False
if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.renderer_config)
tokenizer = init_tokenizer_from_config(self.model_config)
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.renderer_config.io_processor_plugin,
self.model_config.io_processor_plugin,
)
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).

Some files were not shown because too many files have changed in this diff.