mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-01-25 07:34:28 +08:00
This commit is contained in:
parent
27f4c2fd46
commit
e83b7e379c
@ -22,7 +22,7 @@ Declare supported languages and capabilities:
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import RendererConfig, SpeechToTextConfig
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.model_executor.models.interfaces import SupportsTranscription
|
||||
|
||||
@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
) -> SpeechToTextConfig:
|
||||
return SpeechToTextConfig(
|
||||
@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
|
||||
cls,
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> int | None:
|
||||
# Return None if unknown; otherwise return an estimate.
|
||||
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
||||
@ -216,7 +216,7 @@ Relevant server logic:
|
||||
prompt = self.model_cls.get_generation_prompt(
|
||||
audio=chunk,
|
||||
stt_config=self.asr_config,
|
||||
renderer_config=self.renderer_config,
|
||||
model_config=self.model_config,
|
||||
language=language,
|
||||
task_type=self.task_type,
|
||||
request_prompt=request.prompt,
|
||||
|
||||
@ -17,7 +17,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
get_current_vllm_config,
|
||||
set_current_vllm_config,
|
||||
@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model(
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
device_config=device_config,
|
||||
compilation_config=compilation_config,
|
||||
)
|
||||
|
||||
@ -15,7 +15,6 @@ from vllm.config import (
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
@ -220,11 +219,8 @@ def test_fix_functionalization(
|
||||
torch.set_default_device("cuda")
|
||||
torch.set_default_dtype(dtype)
|
||||
|
||||
model_config = ModelConfig(dtype=dtype)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
model_config=ModelConfig(dtype=dtype),
|
||||
compilation_config=CompilationConfig(
|
||||
custom_ops=["all"],
|
||||
pass_config=PassConfig(
|
||||
|
||||
@ -15,7 +15,6 @@ from vllm.config import (
|
||||
CompilationMode,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||
@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant(
|
||||
custom_ops.append("+rms_norm")
|
||||
if enable_quant_fp8_custom_op:
|
||||
custom_ops.append("+quant_fp8")
|
||||
|
||||
model_config = ModelConfig(dtype=dtype)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
model_config=ModelConfig(dtype=dtype),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=custom_ops,
|
||||
|
||||
@ -24,7 +24,6 @@ from vllm.config import (
|
||||
CompilationMode,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
@ -326,7 +325,6 @@ def test_attention_quant_pattern(
|
||||
)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
scheduler_config=SchedulerConfig(
|
||||
max_num_seqs=1024,
|
||||
max_model_len=model_config.max_model_len,
|
||||
|
||||
@ -7,7 +7,7 @@ import torch
|
||||
|
||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||
from vllm.compilation.pass_manager import PostGradPassManager
|
||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
||||
|
||||
# dummy custom pass that doesn't inherit
|
||||
@ -43,11 +43,7 @@ class ProperPass(InductorPass):
|
||||
)
|
||||
def test_pass_manager_uuid(callable):
|
||||
# Some passes need dtype to be set
|
||||
model_config = ModelConfig(dtype=torch.bfloat16)
|
||||
config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
)
|
||||
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||
|
||||
pass_manager = PostGradPassManager()
|
||||
pass_manager.configure(config)
|
||||
|
||||
@ -19,7 +19,6 @@ from vllm.config import (
|
||||
CompilationMode,
|
||||
ModelConfig,
|
||||
PassConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion(
|
||||
if enable_rope_custom_op:
|
||||
custom_ops.append("+rotary_embedding")
|
||||
|
||||
model_config = ModelConfig(dtype=dtype)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
model_config=ModelConfig(dtype=dtype),
|
||||
compilation_config=CompilationConfig(
|
||||
mode=CompilationMode.VLLM_COMPILE,
|
||||
custom_ops=custom_ops,
|
||||
|
||||
@ -5,7 +5,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
)
|
||||
@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
|
||||
vllm_config = VllmConfig(
|
||||
device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
|
||||
vllm_config = VllmConfig(
|
||||
device_config=DeviceConfig("cpu"),
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
)
|
||||
with set_current_vllm_config(vllm_config):
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
|
||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
@ -106,11 +107,24 @@ def test_get_gen_prompt(
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
renderer_config = model_info.build_renderer_config(model)
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
revision=model_info.revision,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
# Initialize the tokenizer
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
tokenizer_name=model_config.tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
template_content = load_chat_template(chat_template=template)
|
||||
|
||||
@ -129,7 +143,7 @@ def test_get_gen_prompt(
|
||||
tokenizer=tokenizer,
|
||||
conversation=mock_request.messages,
|
||||
chat_template=mock_request.chat_template or template_content,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
tools=None,
|
||||
add_generation_prompt=mock_request.add_generation_prompt,
|
||||
continue_final_message=mock_request.continue_final_message,
|
||||
|
||||
@ -33,34 +33,26 @@ class MockModelConfig:
|
||||
"""Minimal mock ModelConfig for testing."""
|
||||
|
||||
model: str = MODEL_NAME
|
||||
tokenizer: str = MODEL_NAME
|
||||
trust_remote_code: bool = False
|
||||
tokenizer_mode: str = "auto"
|
||||
max_model_len: int = 100
|
||||
tokenizer_revision: str | None = None
|
||||
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
|
||||
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
|
||||
logits_processors: list[str] | None = None
|
||||
logits_processor_pattern: str | None = None
|
||||
diff_sampling_param: dict | None = None
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
skip_tokenizer_init: bool = False
|
||||
|
||||
def get_diff_sampling_param(self):
|
||||
return self.diff_sampling_param or {}
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockRendererConfig:
|
||||
"""Minimal mock RendererConfig for testing."""
|
||||
|
||||
model_config: MockModelConfig
|
||||
|
||||
tokenizer: str = MODEL_NAME
|
||||
tokenizer_mode: str = "auto"
|
||||
tokenizer_revision: str | None = None
|
||||
skip_tokenizer_init: bool = False
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
|
||||
|
||||
class MockLoRAResolver(LoRAResolver):
|
||||
async def resolve_lora(
|
||||
self, base_model_name: str, lora_name: str
|
||||
@ -122,7 +114,6 @@ def mock_serving_setup():
|
||||
mock_engine.add_lora.reset_mock()
|
||||
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
|
||||
@ -346,33 +346,27 @@ class MockHFConfig:
|
||||
class MockModelConfig:
|
||||
task = "generate"
|
||||
runner_type = "generate"
|
||||
tokenizer = MODEL_NAME
|
||||
trust_remote_code = False
|
||||
tokenizer_mode = "auto"
|
||||
max_model_len = 100
|
||||
tokenizer_revision = None
|
||||
multimodal_config = MultiModalConfig()
|
||||
hf_config = MockHFConfig()
|
||||
logits_processors: list[str] | None = None
|
||||
logits_processor_pattern = None
|
||||
diff_sampling_param: dict | None = None
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
encoder_config = None
|
||||
generation_config: str = "auto"
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
skip_tokenizer_init = False
|
||||
|
||||
def get_diff_sampling_param(self):
|
||||
return self.diff_sampling_param or {}
|
||||
|
||||
|
||||
@dataclass
|
||||
class MockRendererConfig:
|
||||
model_config: MockModelConfig = field(default_factory=MockModelConfig)
|
||||
|
||||
tokenizer = MODEL_NAME
|
||||
tokenizer_mode = "auto"
|
||||
tokenizer_revision = None
|
||||
skip_tokenizer_init = False
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||
allowed_local_media_path: str = ""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
|
||||
|
||||
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
models = OpenAIServingModels(
|
||||
engine_client=engine,
|
||||
@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||
@dataclass
|
||||
class MockEngine:
|
||||
model_config: MockModelConfig = field(default_factory=MockModelConfig)
|
||||
renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
|
||||
input_processor: MagicMock = field(default_factory=MagicMock)
|
||||
io_processor: MagicMock = field(default_factory=MagicMock)
|
||||
|
||||
@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = mock_model_config
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = mock_model_config
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = mock_model_config
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = mock_model_config
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
|
||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||
mock_engine.errored = False
|
||||
mock_engine.model_config = MockModelConfig()
|
||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
||||
mock_engine.input_processor = MagicMock()
|
||||
mock_engine.io_processor = MagicMock()
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||
from vllm.tokenizers import MistralTokenizer
|
||||
@ -19,16 +19,10 @@ def serving() -> OpenAIServing:
|
||||
|
||||
# Create minimal mocks
|
||||
engine_client = Mock()
|
||||
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.max_model_len = 32768
|
||||
|
||||
renderer_config = Mock(spec=RendererConfig)
|
||||
renderer_config.model_config = model_config
|
||||
|
||||
models = Mock(spec=OpenAIServingModels)
|
||||
models.model_config = model_config
|
||||
models.renderer_config = renderer_config
|
||||
models.input_processor = Mock()
|
||||
models.io_processor = Mock()
|
||||
|
||||
|
||||
@ -6,7 +6,7 @@ from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.openai.protocol import (
|
||||
ErrorResponse,
|
||||
@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
|
||||
async def _async_serving_models_init() -> OpenAIServingModels:
|
||||
mock_engine_client = MagicMock(spec=EngineClient)
|
||||
# Set the max_model_len attribute to avoid missing attribute
|
||||
|
||||
mock_model_config = MagicMock(spec=ModelConfig)
|
||||
mock_model_config.max_model_len = 2048
|
||||
|
||||
mock_renderer_config = MagicMock(spec=RendererConfig)
|
||||
mock_renderer_config.model_config = mock_model_config
|
||||
|
||||
mock_engine_client.model_config = mock_model_config
|
||||
mock_engine_client.renderer_config = mock_renderer_config
|
||||
mock_engine_client.input_processor = MagicMock()
|
||||
mock_engine_client.io_processor = MagicMock()
|
||||
|
||||
|
||||
@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
_try_extract_ast,
|
||||
apply_mistral_chat_template,
|
||||
@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
|
||||
"content": [{"type": "text", "text": "Who are you?"}],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=mistral_model_config),
|
||||
mistral_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
|
||||
"content": [{"type": "text", "text": "Who are you?"}],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=mistral_model_config),
|
||||
mistral_model_config,
|
||||
content_format="openai",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_image_embeds),
|
||||
phi3v_model_config_image_embeds,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=audio_embeds_model_config),
|
||||
audio_embeds_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=audio_embeds_model_config),
|
||||
audio_embeds_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=audio_embeds_model_config),
|
||||
audio_embeds_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_image_embeds),
|
||||
phi3v_model_config_image_embeds,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
assert conversation == [
|
||||
@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
|
||||
{"role": "assistant", "content": "Some stuff."},
|
||||
{"role": "user", "content": "What about this one?"},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config),
|
||||
phi3v_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
|
||||
],
|
||||
},
|
||||
],
|
||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
||||
qwen25omni_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
||||
phi3v_model_config_mm_interleaved,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
renderer_config = model_info.build_renderer_config(model)
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
# Build the tokenizer
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
tools = (
|
||||
@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=tools,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
||||
"enable_thinking": True,
|
||||
}
|
||||
|
||||
renderer_config = model_info.build_renderer_config(model)
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
# Build the tokenizer
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=tools,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
with pytest.raises(
|
||||
ValueError, match="Found unexpected chat template kwargs from request"
|
||||
@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
renderer_config = model_info.build_renderer_config(model)
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
||||
None,
|
||||
"auto",
|
||||
tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
|
||||
renderer_config = model_info.build_renderer_config(model)
|
||||
model_config = ModelConfig(
|
||||
model,
|
||||
tokenizer=model_info.tokenizer or model,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model_config.tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
# Test detecting the tokenizer's chat_template
|
||||
@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
tokenizer,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
assert isinstance(chat_template, str)
|
||||
|
||||
@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
None,
|
||||
"auto",
|
||||
tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
],
|
||||
)
|
||||
def test_resolve_content_format_examples(template_path, expected_format):
|
||||
model = PHI3V_MODEL_ID # Dummy
|
||||
model_config = ModelConfig(model, trust_remote_code=True)
|
||||
renderer_config = RendererConfig(model_config=model_config, tokenizer=model)
|
||||
model_config = ModelConfig(
|
||||
PHI3V_MODEL_ID, # Dummy
|
||||
tokenizer=PHI3V_MODEL_ID, # Dummy
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
dummy_tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
PHI3V_MODEL_ID, # Dummy
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
dummy_tokenizer.chat_template = None
|
||||
|
||||
@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
||||
None,
|
||||
"auto",
|
||||
dummy_tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
assert resolved_format == expected_format
|
||||
@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
|
||||
|
||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||
messages,
|
||||
RendererConfig(model_config=mistral_model_config),
|
||||
mistral_model_config,
|
||||
content_format="openai",
|
||||
)
|
||||
|
||||
@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=qwen2_audio_model_config),
|
||||
qwen2_audio_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
||||
],
|
||||
}
|
||||
],
|
||||
RendererConfig(model_config=qwen2_audio_model_config),
|
||||
qwen2_audio_model_config,
|
||||
content_format="string",
|
||||
)
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ import torch
|
||||
from safetensors.torch import load_file
|
||||
from torch import nn
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.config.lora import LoRAConfig
|
||||
from vllm.lora.layers import (
|
||||
ColumnParallelLinearWithLoRA,
|
||||
@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
lora_config=lora_config,
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
|
||||
)
|
||||
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
lora_config=lora_config,
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||
|
||||
vllm_config.scheduler_config.max_num_seqs = 4
|
||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||
|
||||
@ -11,7 +11,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files):
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
load_config=LoadConfig(
|
||||
download_dir=None,
|
||||
load_format="dummy",
|
||||
|
||||
@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"Write a short story about a robot that dreams for the first time.\n"
|
||||
)
|
||||
|
||||
llm_engine = vllm_model.llm.llm_engine
|
||||
model_config = llm_engine.model_config
|
||||
renderer_config = llm_engine.renderer_config
|
||||
tokenizer = llm_engine.tokenizer
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||
|
||||
# asserts on the bert model config file
|
||||
assert model_config.encoder_config["max_seq_length"] == 512
|
||||
@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
assert model_config.pooler_config.normalize
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5"
|
||||
assert tokenizer.model_max_length == 512
|
||||
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
|
||||
assert model_tokenizer.model_max_length == 512
|
||||
|
||||
def check_model(model):
|
||||
assert isinstance(model, BertEmbeddingModel)
|
||||
@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"Write a short story about a robot that dreams for the first time.\n"
|
||||
)
|
||||
|
||||
llm_engine = vllm_model.llm.llm_engine
|
||||
model_config = llm_engine.model_config
|
||||
renderer_config = llm_engine.renderer_config
|
||||
tokenizer = llm_engine.tokenizer
|
||||
model_config = vllm_model.llm.llm_engine.model_config
|
||||
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||
|
||||
# asserts on the bert model config file
|
||||
assert model_config.encoder_config["max_seq_length"] == 512
|
||||
@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
assert model_config.pooler_config.normalize
|
||||
|
||||
# asserts on the tokenizer loaded
|
||||
assert renderer_config.tokenizer == "intfloat/multilingual-e5-base"
|
||||
assert tokenizer.model_max_length == 512
|
||||
assert model_config.tokenizer == "intfloat/multilingual-e5-base"
|
||||
assert model_tokenizer.model_max_length == 512
|
||||
|
||||
def check_model(model):
|
||||
assert isinstance(model, RobertaEmbeddingModel)
|
||||
@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
||||
"Write a short story about a robot that dreams for the first time.\n"
|
||||
)
|
||||
|
||||
assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name
|
||||
assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
|
||||
|
||||
def check_model(model):
|
||||
assert isinstance(model, RobertaEmbeddingModel)
|
||||
|
||||
@ -6,7 +6,7 @@ import pytest
|
||||
from scipy.spatial.distance import cosine
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
|
||||
from ....utils import RemoteOpenAIServer
|
||||
|
||||
@ -31,8 +31,7 @@ def test_find_array():
|
||||
dtype="bfloat16",
|
||||
seed=0,
|
||||
)
|
||||
renderer_config = RendererConfig(model_config=model_config)
|
||||
pooling = GritLMMeanPool(renderer_config=renderer_config)
|
||||
pooling = GritLMMeanPool(model_config=model_config)
|
||||
|
||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
|
||||
|
||||
@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
|
||||
from vllm.tokenizers import (
|
||||
MistralTokenizer,
|
||||
TokenizerLike,
|
||||
cached_tokenizer_from_config,
|
||||
)
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
@ -211,20 +212,31 @@ def _test_processing_correctness(
|
||||
else:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
|
||||
model_id = model_id_or_arch
|
||||
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
renderer_config = model_info.build_renderer_config(
|
||||
model=model_id,
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
# Ensure that the cache can fit all of the data
|
||||
mm_processor_cache_gb=2048,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
model_config = renderer_config.model_config
|
||||
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
factories = model_cls._processor_factory
|
||||
ctx = InputProcessingContext.from_config(renderer_config)
|
||||
ctx = InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
cache = MultiModalProcessorOnlyCache(model_config)
|
||||
|
||||
processing_info = factories.info(ctx)
|
||||
|
||||
@ -40,7 +40,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
hf_processor_mm_kwargs = {"fps": fps}
|
||||
|
||||
@ -79,7 +79,7 @@ def test_video_loader_consistency(
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {"fps": fps}
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
|
||||
@ -162,7 +162,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
@ -38,7 +38,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
|
||||
@ -116,7 +116,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
@ -30,7 +30,7 @@ def test_processor_override(
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
config = processor.info.get_hf_config()
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
hf_processor = processor.info.get_hf_processor()
|
||||
|
||||
@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
info = processor.info
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
image_sizes = list[ImageSize]()
|
||||
|
||||
@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
info = processor.info
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
image_sizes = list[ImageSize]()
|
||||
|
||||
@ -24,7 +24,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
prompt = "<image>" * num_imgs
|
||||
image = Image.new("RGB", size=(364, 364))
|
||||
mm_data = {"image": [image] * num_imgs}
|
||||
@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
|
||||
@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
limit_mm_per_prompt=mm_counts,
|
||||
)
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
profiler = MultiModalProfiler(processor)
|
||||
|
||||
decoder_dummy_data = profiler.get_decoder_dummy_data(
|
||||
|
||||
@ -118,7 +118,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": len(size_factors)},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||
|
||||
@ -39,7 +39,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
|
||||
@ -39,7 +39,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
|
||||
@ -34,7 +34,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
tokenizer = processor.info.get_tokenizer()
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
|
||||
@ -38,7 +38,7 @@ def test_processor_override(
|
||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
|
||||
@ -11,7 +11,7 @@ import pytest
|
||||
import torch.nn as nn
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config.multimodal import (
|
||||
AudioDummyOptions,
|
||||
BaseDummyOptions,
|
||||
@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||
|
||||
@ -149,10 +150,7 @@ def initialize_dummy_model(
|
||||
backend="nccl",
|
||||
)
|
||||
initialize_model_parallel(tensor_model_parallel_size=1)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
with set_current_vllm_config(vllm_config=vllm_config):
|
||||
with set_default_torch_dtype(model_config.dtype):
|
||||
model = model_cls(vllm_config=vllm_config)
|
||||
@ -184,12 +182,19 @@ def test_model_tensor_schema(model_id: str):
|
||||
else:
|
||||
dtype = model_info.dtype
|
||||
|
||||
renderer_config = model_info.build_renderer_config(
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=hf_overrides_fn,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=dtype,
|
||||
)
|
||||
model_config = renderer_config.model_config
|
||||
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
assert supports_multimodal(model_cls)
|
||||
@ -207,7 +212,10 @@ def test_model_tensor_schema(model_id: str):
|
||||
if not any(inputs_parse_methods):
|
||||
pytest.skip(f"{model_arch} does not support tensor schema validation.")
|
||||
|
||||
ctx = InputProcessingContext.from_config(renderer_config)
|
||||
ctx = InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
processing_info = factories.info(ctx)
|
||||
supported_mm_limits = processing_info.get_supported_mm_limits()
|
||||
limit_mm_per_prompt = {
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -13,9 +13,8 @@ def test_multimodal_processor(model_id):
|
||||
model=model_id,
|
||||
model_impl="transformers",
|
||||
)
|
||||
renderer_config = RendererConfig(model_config=model_config)
|
||||
|
||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
|
||||
image_pil = ImageAsset("cherry_blossom").pil_image
|
||||
mm_data = {"image": image_pil}
|
||||
|
||||
@@ -7,6 +7,7 @@ import torch
|
||||
import transformers
|
||||
from transformers import AutoConfig, PreTrainedModel
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.model_executor.models.utils import WeightsMapper
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.transformers_utils.config import try_get_safetensors_metadata
|
||||
@@ -49,11 +50,37 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
model_config = model_info.build_model_config(config_format="hf")
|
||||
is_mistral_model = model_arch in [
|
||||
"Mistral3ForConditionalGeneration",
|
||||
"PixtralForConditionalGeneration",
|
||||
"VoxtralForConditionalGeneration",
|
||||
]
|
||||
|
||||
if not is_mistral_model or model_info.tokenizer_mode == "mistral":
|
||||
tokenizer_mode = model_info.tokenizer_mode
|
||||
else:
|
||||
tokenizer_mode = "hf"
|
||||
|
||||
model_id = model_info.default
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=tokenizer_mode,
|
||||
config_format="hf",
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
original_weights = create_repo_dummy_weights(model_config.model)
|
||||
hf_dummy_model = create_dummy_model(model_config.model, model_arch)
|
||||
original_weights = create_repo_dummy_weights(model_id)
|
||||
hf_dummy_model = create_dummy_model(model_id, model_arch)
|
||||
hf_converted_weights = hf_dummy_model.named_parameters()
|
||||
hf_converted_buffers = hf_dummy_model.named_buffers()
|
||||
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
|
||||
|
||||
@@ -9,8 +9,7 @@ import pytest
|
||||
from packaging.version import Version
|
||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||
|
||||
from vllm.config.model import ModelConfig, ModelDType
|
||||
from vllm.config.renderer import RendererConfig, TokenizerMode
|
||||
from vllm.config.model import ModelDType, TokenizerMode
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -171,36 +170,6 @@ class _HfExamplesInfo:
|
||||
else:
|
||||
pytest.skip(msg)
|
||||
|
||||
def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig:
|
||||
if model is None:
|
||||
model = self.default
|
||||
|
||||
return ModelConfig(
|
||||
**{
|
||||
"model": model,
|
||||
"revision": self.revision,
|
||||
"trust_remote_code": self.trust_remote_code,
|
||||
"hf_overrides": self.hf_overrides,
|
||||
"enable_prompt_embeds": self.require_embed_inputs,
|
||||
"enable_mm_embeds": self.require_embed_inputs,
|
||||
"enforce_eager": self.enforce_eager,
|
||||
"dtype": self.dtype,
|
||||
**kwargs,
|
||||
}
|
||||
)
|
||||
|
||||
def build_renderer_config(
|
||||
self, model: str | None = None, **kwargs
|
||||
) -> RendererConfig:
|
||||
model_config = self.build_model_config(model, **kwargs)
|
||||
|
||||
return RendererConfig(
|
||||
model_config=model_config,
|
||||
tokenizer=self.tokenizer or model_config.model,
|
||||
tokenizer_mode=self.tokenizer_mode,
|
||||
skip_tokenizer_init=self.require_embed_inputs,
|
||||
)
|
||||
|
||||
|
||||
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||
# [Decoder-only]
|
||||
|
||||
@@ -13,6 +13,7 @@ from transformers import PretrainedConfig
|
||||
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||
from vllm.multimodal.processing import InputProcessingContext
|
||||
from vllm.tokenizers import cached_tokenizer_from_config
|
||||
|
||||
from .. import ci_envs
|
||||
from .registry import HF_EXAMPLE_MODELS
|
||||
@@ -295,18 +296,30 @@ def build_model_context(
|
||||
|
||||
model_config_kwargs = model_config_kwargs or {}
|
||||
limit_mm_per_prompt = limit_mm_per_prompt or {}
|
||||
renderer_config = model_info.build_renderer_config(
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
runner=runner,
|
||||
tokenizer=model_info.tokenizer or model_id,
|
||||
tokenizer_mode=model_info.tokenizer_mode,
|
||||
revision=model_info.revision,
|
||||
trust_remote_code=model_info.trust_remote_code,
|
||||
dtype=dtype,
|
||||
seed=0,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||
enable_mm_embeds=model_info.require_embed_inputs,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
**model_config_kwargs,
|
||||
)
|
||||
|
||||
return InputProcessingContext.from_config(renderer_config)
|
||||
return InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_tokenizer_from_config(model_config),
|
||||
)
|
||||
|
||||
|
||||
def check_embeddings_close(
|
||||
|
||||
@@ -6,7 +6,7 @@ import numpy as np
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.cache import (
|
||||
BaseMultiModalProcessorCache,
|
||||
@@ -110,14 +110,11 @@ def _create_vllm_config(
|
||||
mm_processor_cache_gb: float,
|
||||
enable_ipc: bool,
|
||||
):
|
||||
model_config = ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
)
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
model_config=ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
),
|
||||
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
|
||||
)
|
||||
|
||||
@@ -509,15 +506,13 @@ def _run_test_cache_eviction_shm(
|
||||
|
||||
|
||||
def test_cache_eviction_shm_cache():
|
||||
model_config = ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_type="shm",
|
||||
mm_shm_cache_max_object_size_mb=6,
|
||||
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
|
||||
)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
model_config=ModelConfig(
|
||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||
mm_processor_cache_type="shm",
|
||||
mm_shm_cache_max_object_size_mb=6,
|
||||
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
|
||||
),
|
||||
)
|
||||
sender_cache = ShmObjectStoreSenderCache(vllm_config)
|
||||
receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
|
||||
|
||||
@@ -7,7 +7,7 @@ from contextlib import nullcontext
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.processing import (
|
||||
InputProcessingContext,
|
||||
@@ -920,9 +920,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
||||
model=model_id,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
renderer_config = RendererConfig(model_config=model_config)
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
processor._supported_mm_limits = {"image": num_supported}
|
||||
|
||||
profiler = MultiModalProfiler(processor)
|
||||
@@ -956,9 +955,8 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
||||
model=model_id,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
renderer_config = RendererConfig(model_config=model_config)
|
||||
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
image = random_image(rng, min_wh=128, max_wh=256)
|
||||
@@ -1014,13 +1012,11 @@ def test_hf_processor_init_kwargs(
|
||||
inference_kwargs,
|
||||
expected_kwargs,
|
||||
):
|
||||
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
|
||||
renderer_config = RendererConfig(
|
||||
model_config=model_config,
|
||||
tokenizer=model_id,
|
||||
ctx = InputProcessingContext(
|
||||
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||
tokenizer=None,
|
||||
)
|
||||
|
||||
ctx = InputProcessingContext.from_config(renderer_config)
|
||||
processor = ctx.get_hf_processor(
|
||||
DummyProcessor, # type: ignore[arg-type]
|
||||
**inference_kwargs,
|
||||
@@ -1049,13 +1045,11 @@ def test_hf_processor_call_kwargs(
|
||||
inference_kwargs,
|
||||
expected_kwargs,
|
||||
):
|
||||
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
|
||||
renderer_config = RendererConfig(
|
||||
model_config=model_config,
|
||||
tokenizer=model_id,
|
||||
ctx = InputProcessingContext(
|
||||
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||
tokenizer=None,
|
||||
)
|
||||
|
||||
ctx = InputProcessingContext.from_config(renderer_config)
|
||||
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
|
||||
|
||||
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
|
||||
|
||||
@@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
|
||||
model_id,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
assert (
|
||||
MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
|
||||
)
|
||||
assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
|
||||
|
||||
@@ -13,7 +13,6 @@ from vllm.config import (
|
||||
CompilationConfig,
|
||||
ModelConfig,
|
||||
PoolerConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
update_config,
|
||||
@@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
|
||||
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
|
||||
],
|
||||
)
|
||||
def test_recalculate_max_model_len(
|
||||
def test_get_and_verify_max_len(
|
||||
model_id, max_model_len, expected_max_len, should_raise
|
||||
):
|
||||
"""Test recalculate_max_model_len with different configurations."""
|
||||
"""Test get_and_verify_max_len with different configurations."""
|
||||
model_config = ModelConfig(model_id)
|
||||
|
||||
if should_raise:
|
||||
with pytest.raises(ValueError):
|
||||
model_config.recalculate_max_model_len(
|
||||
max_model_len,
|
||||
tokenizer=model_id,
|
||||
tokenizer_revision=None,
|
||||
)
|
||||
model_config.get_and_verify_max_len(max_model_len)
|
||||
else:
|
||||
model_config.recalculate_max_model_len(
|
||||
max_model_len,
|
||||
tokenizer=model_id,
|
||||
tokenizer_revision=None,
|
||||
)
|
||||
assert model_config.max_model_len == expected_max_len
|
||||
actual_max_len = model_config.get_and_verify_max_len(max_model_len)
|
||||
assert actual_max_len == expected_max_len
|
||||
|
||||
|
||||
class MockModelConfig:
|
||||
"""Simple mock object for testing maybe_pull_model_for_runai"""
|
||||
class MockConfig:
|
||||
"""Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
|
||||
|
||||
def __init__(self, model: str):
|
||||
def __init__(self, model: str, tokenizer: str):
|
||||
self.model = model
|
||||
|
||||
|
||||
class MockRendererConfig:
|
||||
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
|
||||
|
||||
def __init__(self, model_config: MockModelConfig):
|
||||
self.model_config = model_config
|
||||
self.tokenizer = model_config.model
|
||||
self.tokenizer = tokenizer
|
||||
self.model_weights = None
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
@@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
|
||||
mock_pull_files.return_value = None
|
||||
|
||||
# Create first mock and run the method
|
||||
model_config1 = MockModelConfig(model=s3_url)
|
||||
renderer_config1 = MockRendererConfig(model_config=model_config1)
|
||||
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
|
||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
|
||||
config1 = MockConfig(model=s3_url, tokenizer=s3_url)
|
||||
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
|
||||
|
||||
# Check that model and tokenizer point to existing directories
|
||||
assert os.path.exists(model_config1.model), (
|
||||
f"Model directory does not exist: {model_config1.model}"
|
||||
assert os.path.exists(config1.model), (
|
||||
f"Model directory does not exist: {config1.model}"
|
||||
)
|
||||
assert os.path.isdir(model_config1.model), (
|
||||
f"Model path is not a directory: {model_config1.model}"
|
||||
assert os.path.isdir(config1.model), (
|
||||
f"Model path is not a directory: {config1.model}"
|
||||
)
|
||||
assert os.path.exists(renderer_config1.tokenizer), (
|
||||
f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
|
||||
assert os.path.exists(config1.tokenizer), (
|
||||
f"Tokenizer directory does not exist: {config1.tokenizer}"
|
||||
)
|
||||
assert os.path.isdir(renderer_config1.tokenizer), (
|
||||
f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
|
||||
assert os.path.isdir(config1.tokenizer), (
|
||||
f"Tokenizer path is not a directory: {config1.tokenizer}"
|
||||
)
|
||||
|
||||
# Verify that the paths are different from the original S3 URL
|
||||
assert model_config1.model != s3_url, (
|
||||
"Model path should be converted to local directory"
|
||||
)
|
||||
assert renderer_config1.tokenizer != s3_url, (
|
||||
assert config1.model != s3_url, "Model path should be converted to local directory"
|
||||
assert config1.tokenizer != s3_url, (
|
||||
"Tokenizer path should be converted to local directory"
|
||||
)
|
||||
|
||||
# Store the original paths
|
||||
created_model_dir = model_config1.model
|
||||
create_tokenizer_dir = renderer_config1.tokenizer
|
||||
created_model_dir = config1.model
|
||||
create_tokenizer_dir = config1.tokenizer
|
||||
|
||||
# Create a new mock and run the method with the same S3 URL
|
||||
model_config2 = MockModelConfig(model=s3_url)
|
||||
renderer_config2 = MockRendererConfig(model_config=model_config2)
|
||||
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
|
||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
|
||||
config2 = MockConfig(model=s3_url, tokenizer=s3_url)
|
||||
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
|
||||
|
||||
# Check that the new directories exist
|
||||
assert os.path.exists(model_config2.model), (
|
||||
f"Model directory does not exist: {model_config2.model}"
|
||||
assert os.path.exists(config2.model), (
|
||||
f"Model directory does not exist: {config2.model}"
|
||||
)
|
||||
assert os.path.isdir(model_config2.model), (
|
||||
f"Model path is not a directory: {model_config2.model}"
|
||||
assert os.path.isdir(config2.model), (
|
||||
f"Model path is not a directory: {config2.model}"
|
||||
)
|
||||
assert os.path.exists(renderer_config2.tokenizer), (
|
||||
f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
|
||||
assert os.path.exists(config2.tokenizer), (
|
||||
f"Tokenizer directory does not exist: {config2.tokenizer}"
|
||||
)
|
||||
assert os.path.isdir(renderer_config2.tokenizer), (
|
||||
f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
|
||||
assert os.path.isdir(config2.tokenizer), (
|
||||
f"Tokenizer path is not a directory: {config2.tokenizer}"
|
||||
)
|
||||
|
||||
# Verify that the paths are deterministic (same as before)
|
||||
assert model_config2.model == created_model_dir, (
|
||||
assert config2.model == created_model_dir, (
|
||||
f"Model paths are not deterministic. "
|
||||
f"Original: {created_model_dir}, New: {model_config2.model}"
|
||||
f"Original: {created_model_dir}, New: {config2.model}"
|
||||
)
|
||||
assert renderer_config2.tokenizer == create_tokenizer_dir, (
|
||||
assert config2.tokenizer == create_tokenizer_dir, (
|
||||
f"Tokenizer paths are not deterministic. "
|
||||
f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
|
||||
f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
|
||||
)
@@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
|
||||
s3_url2 = "s3://example-bucket-2/model/"
|
||||
|
||||
# Create mocks with different S3 URLs and run the method
|
||||
model_config1 = MockModelConfig(model=s3_url1)
|
||||
renderer_config1 = MockRendererConfig(model_config=model_config1)
|
||||
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
|
||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
|
||||
config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
|
||||
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
|
||||
|
||||
model_config2 = MockModelConfig(model=s3_url2)
|
||||
renderer_config2 = MockRendererConfig(model_config=model_config2)
|
||||
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
|
||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
|
||||
config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
|
||||
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
|
||||
|
||||
# Verify that different URLs produce different directories
|
||||
assert model_config1.model != model_config2.model, (
|
||||
assert config1.model != config2.model, (
|
||||
f"Different S3 URLs should create different model directories. "
|
||||
f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
|
||||
f"URL1 model: {config1.model}, URL2 model: {config2.model}"
|
||||
)
|
||||
assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
|
||||
assert config1.tokenizer != config2.tokenizer, (
|
||||
f"Different S3 URLs should create different tokenizer directories. "
|
||||
f"URL1 tokenizer: {renderer_config1.tokenizer}, "
|
||||
f"URL2 tokenizer: {renderer_config2.tokenizer}"
|
||||
f"URL1 tokenizer: {config1.tokenizer}, "
|
||||
f"URL2 tokenizer: {config2.tokenizer}"
|
||||
)
|
||||
|
||||
# Verify that both sets of directories exist
|
||||
assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
|
||||
assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
|
||||
renderer_config1.tokenizer
|
||||
)
|
||||
assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
|
||||
assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
|
||||
renderer_config2.tokenizer
|
||||
)
|
||||
assert os.path.exists(config1.model) and os.path.isdir(config1.model)
|
||||
assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
|
||||
assert os.path.exists(config2.model) and os.path.isdir(config2.model)
|
||||
assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import zip_enc_dec_prompts
|
||||
from vllm.inputs.parse import parse_raw_prompts
|
||||
from vllm.inputs.preprocess import InputPreprocessor
|
||||
@@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
||||
)
|
||||
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
||||
model_config = ModelConfig(model=model_id)
|
||||
renderer_config = RendererConfig(model_config=model_config)
|
||||
tokenizer = init_tokenizer_from_config(renderer_config)
|
||||
input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
|
||||
tokenizer = init_tokenizer_from_config(model_config)
|
||||
input_preprocessor = InputPreprocessor(model_config, tokenizer)
|
||||
|
||||
# HF processor adds sep token
|
||||
sep_token_id = tokenizer.vocab[tokenizer.sep_token]
|
||||
|
||||
@@ -16,7 +16,6 @@ from vllm.config import (
|
||||
LoadConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
@@ -217,7 +216,6 @@ def create_vllm_config(
|
||||
|
||||
return VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
|
||||
@@ -8,7 +8,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
import vllm.v1.core.kv_cache_utils as kv_cache_utils
|
||||
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.inputs import (
|
||||
MultiModalFeatureSpec,
|
||||
@@ -667,10 +667,7 @@ def test_metrics_empty_stats():
|
||||
|
||||
def test_get_kv_cache_configs_multiple_workers():
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
ref_kv_cache_spec = new_kv_cache_spec()
|
||||
same_kv_cache_specs = [
|
||||
@@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
scheduler_config=scheduler_config,
|
||||
)
|
||||
|
||||
@@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
scheduler_config=scheduler_config,
|
||||
)
|
||||
|
||||
@@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
|
||||
def test_get_kv_cache_config_one_worker():
|
||||
# pass max_model_len to pass check_enough_kv_cache_memory
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=model_config)
|
||||
|
||||
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
||||
# all layers are full attention -> single group
|
||||
@@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
|
||||
|
||||
def test_get_kv_cache_configs_attention_free():
|
||||
kv_cache_specs: dict[str, KVCacheSpec] = {}
|
||||
model_config = ModelConfig(max_model_len=16)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
)
|
||||
vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
|
||||
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
|
||||
assert kv_cache_configs == [
|
||||
KVCacheConfig(
|
||||
|
||||
@@ -11,7 +11,6 @@ from vllm.config import (
|
||||
ECTransferConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
@@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
|
||||
vllm_config = VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
speculative_config=speculative_config,
|
||||
|
||||
@@ -9,7 +9,6 @@ from vllm.config import (
|
||||
ECTransferConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
@@ -133,7 +132,6 @@ def create_scheduler(
|
||||
vllm_config = VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
speculative_config=speculative_config,
|
||||
|
||||
@@ -15,7 +15,6 @@ from vllm.config import (
|
||||
ECTransferConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
@@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
scheduler_config=scheduler_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
|
||||
@@ -5,14 +5,7 @@ import pytest
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.assets.video import VideoAsset
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
MultiModalConfig,
|
||||
RendererConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.v1.engine import input_processor as input_processor_mod
|
||||
from vllm.v1.engine.input_processor import InputProcessor
|
||||
@@ -51,21 +44,22 @@ def _mock_input_processor(
|
||||
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
|
||||
|
||||
model_config = ModelConfig(
|
||||
skip_tokenizer_init=True,
|
||||
max_model_len=128,
|
||||
mm_processor_cache_gb=mm_cache_gb,
|
||||
generation_config="vllm",
|
||||
)
|
||||
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
|
||||
|
||||
renderer_config = RendererConfig(
|
||||
model_config=model_config,
|
||||
tokenizer="dummy",
|
||||
skip_tokenizer_init=True,
|
||||
)
|
||||
|
||||
# Minimal multimodal_config to satisfy references in
|
||||
# Processor.process_inputs.
|
||||
class _MockMMConfig:
|
||||
def __init__(self, gb: float):
|
||||
self.mm_processor_cache_gb = gb
|
||||
|
||||
model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined]
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=renderer_config,
|
||||
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
|
||||
device_config=DeviceConfig(device="cpu"),
|
||||
)
|
||||
|
||||
@@ -15,7 +15,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
KVTransferConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
@@ -128,7 +127,6 @@ def create_vllm_config(
|
||||
return VllmConfig(
|
||||
scheduler_config=scheduler_config,
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
kv_transfer_config=kv_transfer_config,
|
||||
device_config=DeviceConfig("cpu"),
|
||||
|
||||
@@ -19,7 +19,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
@@ -62,7 +61,6 @@ def _create_proposer(
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=CacheConfig(),
|
||||
speculative_config=speculative_config,
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
|
||||
@@ -18,7 +18,6 @@ from vllm.config import (
|
||||
DeviceConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
@@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
|
||||
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=CacheConfig(),
|
||||
speculative_config=speculative_config,
|
||||
device_config=DeviceConfig(device=current_platform.device_type),
|
||||
|
||||
@@ -4,7 +4,6 @@ import numpy as np
|
||||
|
||||
from vllm.config import (
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SpeculativeConfig,
|
||||
VllmConfig,
|
||||
)
|
||||
@@ -70,7 +69,6 @@ def test_ngram_proposer():
|
||||
return NgramProposer(
|
||||
vllm_config=VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
speculative_config=SpeculativeConfig(
|
||||
prompt_lookup_min=min_n,
|
||||
prompt_lookup_max=max_n,
|
||||
|
||||
@@ -6,7 +6,7 @@ from concurrent.futures import Future
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
|
||||
from vllm.config import StructuredOutputsConfig, VllmConfig
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.parallel import ParallelConfig
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
@@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
|
||||
def test_grammar_bitmask_with_specdec():
|
||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
|
||||
prompt = tokenizer.encode('{"a": "b"}')
|
||||
|
||||
model_config = ModelConfig(tokenizer=TOKENIZER)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
|
||||
model_config=ModelConfig(tokenizer=TOKENIZER),
|
||||
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
||||
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
|
||||
)
|
||||
@@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
|
||||
|
||||
# Use "external_launcher" for sync mode, None for async mode
|
||||
executor_backend = None if async_grammar else "external_launcher"
|
||||
|
||||
model_config = ModelConfig(tokenizer=TOKENIZER)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
|
||||
model_config=ModelConfig(tokenizer=TOKENIZER),
|
||||
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
||||
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
|
||||
)
|
||||
|
||||
@@ -7,7 +7,7 @@ from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||
from vllm.reasoning import ReasoningParser
|
||||
from vllm.v1.request import Request
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
@@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
|
||||
"""Test reasoning-aware structured output functionality."""
|
||||
|
||||
@pytest.fixture
|
||||
def mock_renderer_config(self):
|
||||
"""Create a mock RendererConfig."""
|
||||
renderer_config = Mock(spec=RendererConfig)
|
||||
renderer_config.skip_tokenizer_init = (
|
||||
True # Skip tokenizer init to avoid network calls
|
||||
)
|
||||
|
||||
model_config = Mock(spec=ModelConfig)
|
||||
model_config.get_vocab_size = Mock(return_value=50000)
|
||||
model_config.trust_remote_code = False
|
||||
def mock_model_config(self):
|
||||
"""Create a mock ModelConfig."""
|
||||
config = Mock(spec=ModelConfig)
|
||||
config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls
|
||||
config.get_vocab_size = Mock(return_value=50000)
|
||||
# Add missing runner_type attribute that tokenizer initialization expects
|
||||
model_config.runner_type = "generate"
|
||||
renderer_config.model_config = model_config
|
||||
|
||||
config.runner_type = "generate"
|
||||
# Add other attributes that tokenizer initialization might need
|
||||
renderer_config.tokenizer = "test-tokenizer"
|
||||
renderer_config.tokenizer_mode = "auto"
|
||||
renderer_config.tokenizer_revision = None
|
||||
|
||||
return renderer_config
|
||||
config.tokenizer = "test-tokenizer"
|
||||
config.tokenizer_mode = "auto"
|
||||
config.trust_remote_code = False
|
||||
config.tokenizer_revision = None
|
||||
return config
|
||||
|
||||
@pytest.fixture
|
||||
def mock_scheduler_config(self):
|
||||
@@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
|
||||
return config
|
||||
|
||||
@pytest.fixture
|
||||
def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
|
||||
def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
|
||||
"""Create a mock VllmConfig."""
|
||||
config = Mock(spec=VllmConfig)
|
||||
config.renderer_config = mock_renderer_config
|
||||
config.model_config = mock_model_config
|
||||
config.scheduler_config = mock_scheduler_config
|
||||
config.structured_outputs_config = Mock()
|
||||
config.structured_outputs_config.reasoning_parser = None
|
||||
|
||||
@@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
|
||||
from vllm.config import (
|
||||
CacheConfig,
|
||||
ModelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
@@ -46,7 +45,6 @@ def get_vllm_config():
|
||||
)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
scheduler_config=scheduler_config,
|
||||
)
|
||||
|
||||
@@ -13,7 +13,6 @@ from vllm.config import (
|
||||
CacheConfig,
|
||||
ModelConfig,
|
||||
ParallelConfig,
|
||||
RendererConfig,
|
||||
SchedulerConfig,
|
||||
VllmConfig,
|
||||
set_current_vllm_config,
|
||||
@@ -102,7 +101,6 @@ def get_vllm_config():
|
||||
parallel_config = ParallelConfig()
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
scheduler_config=scheduler_config,
|
||||
parallel_config=parallel_config,
|
||||
@@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
|
||||
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
|
||||
vllm_config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=RendererConfig(model_config=model_config),
|
||||
cache_config=cache_config,
|
||||
scheduler_config=scheduler_config,
|
||||
parallel_config=parallel_config,
|
||||
|
||||
@@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
|
||||
from vllm.config.observability import ObservabilityConfig
|
||||
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
||||
from vllm.config.pooler import PoolerConfig
|
||||
from vllm.config.renderer import RendererConfig
|
||||
from vllm.config.scheduler import SchedulerConfig
|
||||
from vllm.config.speculative import SpeculativeConfig
|
||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||
@@ -82,8 +81,6 @@ __all__ = [
|
||||
"ParallelConfig",
|
||||
# From vllm.config.pooler
|
||||
"PoolerConfig",
|
||||
# From vllm.config.renderer
|
||||
"RendererConfig",
|
||||
# From vllm.config.scheduler
|
||||
"SchedulerConfig",
|
||||
# From vllm.config.speculative
|
||||
|
||||
@@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
|
||||
uses_xdrope_dim,
|
||||
)
|
||||
from vllm.transformers_utils.gguf_utils import (
|
||||
is_gguf,
|
||||
is_remote_gguf,
|
||||
maybe_patch_hf_config_from_gguf,
|
||||
split_remote_gguf,
|
||||
@@ -82,6 +83,7 @@ TaskOption = Literal[
|
||||
"transcription",
|
||||
"draft",
|
||||
]
|
||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
|
||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||
LogprobsMode = Literal[
|
||||
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
||||
@@ -129,6 +131,18 @@ class ModelConfig:
|
||||
|
||||
Note that the model may support other tasks using the same model runner.
|
||||
"""
|
||||
tokenizer: SkipValidation[str] = None # type: ignore
|
||||
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
tokenizer_mode: TokenizerMode | str = "auto"
|
||||
"""Tokenizer mode:\n
|
||||
- "auto" will use the tokenizer from `mistral_common` for Mistral models
|
||||
if available, otherwise it will use the "hf" tokenizer.\n
|
||||
- "hf" will use the fast tokenizer if available.\n
|
||||
- "slow" will always use the slow tokenizer.\n
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||
- Other custom values can be supported via plugins."""
|
||||
trust_remote_code: bool = False
|
||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||
and tokenizer."""
|
||||
@@ -154,6 +168,13 @@ class ModelConfig:
|
||||
hf_config_path: str | None = None
|
||||
"""Name or path of the Hugging Face config to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
allowed_local_media_path: str = ""
|
||||
"""Allowing API requests to read local images or videos from directories
|
||||
specified by the server file system. This is a security risk. Should only
|
||||
be enabled in trusted environments."""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
"""If set, only media URLs that belong to this domain can be used for
|
||||
multi-modal inputs. """
|
||||
revision: str | None = None
|
||||
"""The specific model version to use. It can be a branch name, a tag name,
|
||||
or a commit id. If unspecified, will use the default version."""
|
||||
@@ -161,6 +182,10 @@ class ModelConfig:
|
||||
"""The specific revision to use for the model code on the Hugging Face Hub.
|
||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||
use the default version."""
|
||||
tokenizer_revision: str | None = None
|
||||
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||
use the default version."""
|
||||
max_model_len: SkipValidation[int] = None # type: ignore
|
||||
"""Model context length (prompt and output). If unspecified, will be
|
||||
automatically derived from the model config.
|
||||
@@ -205,6 +230,10 @@ class ModelConfig:
|
||||
preventing potential numerical issues. Note that even if this is set to
|
||||
False, cascade attention will be only used when the heuristic tells that
|
||||
it's beneficial."""
|
||||
skip_tokenizer_init: bool = False
|
||||
"""Skip initialization of tokenizer and detokenizer. Expects valid
|
||||
`prompt_token_ids` and `None` for prompt from the input. The generated
|
||||
output will contain token ids."""
|
||||
enable_prompt_embeds: bool = False
|
||||
"""If `True`, enables passing text embeddings as inputs via the
|
||||
`prompt_embeds` key.
|
||||
@@ -265,6 +294,8 @@ class ModelConfig:
|
||||
logits_processors: list[str | type[LogitsProcessor]] | None = None
|
||||
"""One or more logits processors' fully-qualified class names or class
|
||||
definitions"""
|
||||
io_processor_plugin: str | None = None
|
||||
"""IOProcessor plugin name to load at model startup"""
|
||||
|
||||
# Pooler config
|
||||
pooler_config: PoolerConfig | None = None
|
||||
@@ -277,6 +308,7 @@ class ModelConfig:
|
||||
from the architecture of `self.model`."""
|
||||
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
|
||||
enable_mm_embeds: InitVar[bool | None] = None
|
||||
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
|
||||
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
|
||||
mm_processor_cache_gb: InitVar[float | None] = None
|
||||
mm_processor_cache_type: InitVar[MMCacheType | None] = None
|
||||
@@ -303,12 +335,18 @@ class ModelConfig:
|
||||
"runner",
|
||||
"convert",
|
||||
"task",
|
||||
"tokenizer",
|
||||
"tokenizer_mode",
|
||||
"seed",
|
||||
"hf_config_path",
|
||||
"allowed_local_media_path",
|
||||
"allowed_media_domains",
|
||||
"tokenizer_revision",
|
||||
"spec_target_max_model_len",
|
||||
"enforce_eager",
|
||||
"logprobs_mode",
|
||||
"disable_cascade_attn",
|
||||
"skip_tokenizer_init",
|
||||
"served_model_name",
|
||||
"config_format",
|
||||
"hf_token",
|
||||
@@ -316,9 +354,11 @@ class ModelConfig:
|
||||
"logits_processor_pattern",
|
||||
"override_attention_dtype",
|
||||
"logits_processors",
|
||||
"io_processor_plugin",
|
||||
"pooler_config",
|
||||
"multimodal_config",
|
||||
"limit_mm_per_prompt",
|
||||
"media_io_kwargs",
|
||||
"mm_processor_kwargs",
|
||||
"mm_processor_cache_gb",
|
||||
"mm_processor_cache_type",
|
||||
@@ -383,6 +423,7 @@ class ModelConfig:
|
||||
# Multimodal config init vars
|
||||
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
||||
enable_mm_embeds: bool | None,
|
||||
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||
mm_processor_kwargs: dict[str, Any] | None,
|
||||
mm_processor_cache_gb: float | None,
|
||||
mm_processor_cache_type: MMCacheType | None,
|
||||
@@ -397,8 +438,13 @@ class ModelConfig:
|
||||
self.served_model_name = get_served_model_name(
|
||||
self.model, self.served_model_name
|
||||
)
|
||||
self.original_model = self.model
|
||||
self.model = maybe_model_redirect(self.original_model)
|
||||
self.model = maybe_model_redirect(self.model)
|
||||
# The tokenizer is consistent with the model by default.
|
||||
if self.tokenizer is None:
|
||||
self.tokenizer = self.model
|
||||
if self.tokenizer_revision is None:
|
||||
self.tokenizer_revision = self.revision
|
||||
self.tokenizer = maybe_model_redirect(self.tokenizer)
|
||||
|
||||
if isinstance(self.hf_config_path, str):
|
||||
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
|
||||
@@ -419,7 +465,7 @@ class ModelConfig:
|
||||
hf_overrides_kw[key] = value
|
||||
hf_overrides_fn = None
|
||||
|
||||
self.maybe_pull_model_for_runai(self.model)
|
||||
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
@@ -602,8 +648,7 @@ class ModelConfig:
|
||||
)
|
||||
|
||||
self.original_max_model_len = self.max_model_len
|
||||
self.recalculate_max_model_len(self.original_max_model_len)
|
||||
|
||||
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
||||
# Init multimodal config if needed
|
||||
if self._model_info.supports_multimodal:
|
||||
if (
|
||||
@@ -619,6 +664,7 @@ class ModelConfig:
|
||||
mm_config_kwargs = dict(
|
||||
limit_per_prompt=limit_mm_per_prompt,
|
||||
enable_mm_embeds=enable_mm_embeds,
|
||||
media_io_kwargs=media_io_kwargs,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||
mm_processor_cache_type=mm_processor_cache_type,
|
||||
@@ -636,8 +682,16 @@ class ModelConfig:
|
||||
|
||||
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
|
||||
|
||||
# Multimodal GGUF models must use original repo for mm processing
|
||||
if is_gguf(self.tokenizer) and self.is_multimodal_model:
|
||||
raise ValueError(
|
||||
"Loading a multimodal GGUF model needs to use original "
|
||||
"tokenizer. Please specify the unquantized hf model's "
|
||||
"repo name or path using the --tokenizer argument."
|
||||
)
|
||||
|
||||
if self.disable_sliding_window:
|
||||
# Set after recalculate_max_model_len to ensure that max_model_len
|
||||
# Set after get_and_verify_max_len to ensure that max_model_len
|
||||
# can be correctly capped to sliding window size
|
||||
self.hf_text_config.sliding_window = None
|
||||
|
||||
@@ -661,9 +715,10 @@ class ModelConfig:
|
||||
|
||||
@model_validator(mode="after")
|
||||
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
|
||||
if not isinstance(self.tokenizer, str):
|
||||
raise ValueError("tokenizer must be a string after __post_init__.")
|
||||
if not isinstance(self.max_model_len, int):
|
||||
raise ValueError("max_model_len must be an integer after __post_init__.")
|
||||
|
||||
return self
|
||||
|
||||
def _get_transformers_backend_cls(self) -> str:
|
||||
@@ -712,17 +767,49 @@ class ModelConfig:
|
||||
"""The architecture vllm actually used."""
|
||||
return self._architecture
|
||||
|
||||
def maybe_pull_model_for_runai(self, model: str) -> None:
|
||||
"""Pull model from Object Storage to temporary directory when needed."""
|
||||
if not is_runai_obj_uri(model):
|
||||
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
|
||||
"""Pull model/tokenizer from Object Storage to temporary
|
||||
directory when needed.
|
||||
|
||||
Args:
|
||||
model: Model name or path
|
||||
tokenizer: Tokenizer name or path
|
||||
"""
|
||||
|
||||
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
|
||||
return
|
||||
|
||||
object_storage_model = ObjectStorageModel(url=model)
|
||||
object_storage_model.pull_files(
|
||||
model, allow_pattern=["*.model", "*.py", "*.json"]
|
||||
)
|
||||
self.model_weights = model
|
||||
self.model = object_storage_model.dir
|
||||
if is_runai_obj_uri(model):
|
||||
object_storage_model = ObjectStorageModel(url=model)
|
||||
object_storage_model.pull_files(
|
||||
model, allow_pattern=["*.model", "*.py", "*.json"]
|
||||
)
|
||||
self.model_weights = model
|
||||
self.model = object_storage_model.dir
|
||||
|
||||
# If tokenizer is same as model, download to same directory
|
||||
if model == tokenizer:
|
||||
object_storage_model.pull_files(
|
||||
model,
|
||||
ignore_pattern=[
|
||||
"*.pt",
|
||||
"*.safetensors",
|
||||
"*.bin",
|
||||
"*.tensors",
|
||||
"*.pth",
|
||||
],
|
||||
)
|
||||
self.tokenizer = object_storage_model.dir
|
||||
return
|
||||
|
||||
# Only download tokenizer if needed and not already handled
|
||||
if is_runai_obj_uri(tokenizer):
|
||||
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
|
||||
object_storage_tokenizer.pull_files(
|
||||
model,
|
||||
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
|
||||
)
|
||||
self.tokenizer = object_storage_tokenizer.dir
|
||||
|
||||
def _get_encoder_config(self):
|
||||
model = self.model
|
||||
@@ -1625,38 +1712,30 @@ class ModelConfig:
|
||||
return dense_modules[-1]["out_features"]
|
||||
return self.get_hidden_size()
|
||||
|
||||
def recalculate_max_model_len(
|
||||
self,
|
||||
original_max_model_len: int | None,
|
||||
*,
|
||||
tokenizer: str | None = None,
|
||||
tokenizer_revision: str | None = None,
|
||||
) -> None:
|
||||
def get_and_verify_max_len(self, max_model_len: int):
|
||||
# Consider max_model_len in tokenizer_config only when
|
||||
# pooling models use absolute position_embedding.
|
||||
# NOTE: For simplicity we assume `args.model == args.tokenizer`
|
||||
# since this is
|
||||
tokenizer_config = None
|
||||
if (
|
||||
self.runner_type == "pooling"
|
||||
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
|
||||
):
|
||||
tokenizer_config = try_get_tokenizer_config(
|
||||
tokenizer or self.model,
|
||||
self.tokenizer,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
revision=tokenizer_revision or self.revision,
|
||||
revision=self.tokenizer_revision,
|
||||
)
|
||||
|
||||
self.max_model_len = _get_and_verify_max_len(
|
||||
max_model_len = _get_and_verify_max_len(
|
||||
hf_config=self.hf_text_config,
|
||||
tokenizer_config=tokenizer_config,
|
||||
max_model_len=original_max_model_len,
|
||||
max_model_len=max_model_len,
|
||||
disable_sliding_window=self.disable_sliding_window,
|
||||
sliding_window=self.get_sliding_window(),
|
||||
spec_target_max_model_len=self.spec_target_max_model_len,
|
||||
encoder_config=self.encoder_config,
|
||||
)
|
||||
logger.info("Using max model len %s", self.max_model_len)
|
||||
logger.info("Using max model len %s", max_model_len)
|
||||
return max_model_len
|
||||
|
||||
@property
|
||||
def attn_type(self) -> AttnTypeStr:
|
||||
|
||||
@@ -79,6 +79,10 @@ class MultiModalConfig:
|
||||
|
||||
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
|
||||
Only enable this flag for trusted users!"""
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||
mm_processor_kwargs: dict[str, object] | None = None
|
||||
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
||||
e.g., image processor. Overrides for the multi-modal processor obtained
|
||||
|
||||
@@ -1,109 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from typing import Any, Literal
|
||||
|
||||
from pydantic import Field, SkipValidation
|
||||
from pydantic.dataclasses import dataclass
|
||||
|
||||
from vllm.config.model import ModelConfig
|
||||
from vllm.config.utils import config
|
||||
from vllm.transformers_utils.gguf_utils import is_gguf
|
||||
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
|
||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
|
||||
|
||||
|
||||
@config
|
||||
@dataclass
|
||||
class RendererConfig:
|
||||
"""Configuration for the renderer."""
|
||||
|
||||
# NOTE: In reality, this is a required argument.
|
||||
# We provide a dummy default value here to generate the CLI args.
|
||||
model_config: SkipValidation[ModelConfig] = None # type: ignore
|
||||
"""Provides model context to the renderer."""
|
||||
|
||||
tokenizer: str = ""
|
||||
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
||||
name or path will be used."""
|
||||
tokenizer_mode: TokenizerMode | str = "auto"
|
||||
"""Tokenizer mode:\n
|
||||
- "auto" will use the tokenizer from `mistral_common` for Mistral models
|
||||
if available, otherwise it will use the "hf" tokenizer.\n
|
||||
- "hf" will use the fast tokenizer if available.\n
|
||||
- "slow" will always use the slow tokenizer.\n
|
||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||
- Other custom values can be supported via plugins."""
|
||||
tokenizer_revision: str | None = None
|
||||
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||
use the default version."""
|
||||
skip_tokenizer_init: bool = False
|
||||
"""Skip initialization of tokenizer and detokenizer. Expects valid
|
||||
`prompt_token_ids` and `None` for prompt from the input. The generated
|
||||
output will contain token ids."""
|
||||
|
||||
io_processor_plugin: str | None = None
|
||||
"""IOProcessor plugin name to load at model startup."""
|
||||
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
|
||||
"""Additional args passed to process media inputs, keyed by modalities.
|
||||
For example, to set num_frames for video, set
|
||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||
allowed_local_media_path: str = ""
|
||||
"""Allowing API requests to read local images or videos from directories
|
||||
specified by the server file system. This is a security risk. Should only
|
||||
be enabled in trusted environments."""
|
||||
allowed_media_domains: list[str] | None = None
|
||||
"""If set, only media URLs that belong to this domain can be used for
|
||||
multi-modal inputs. """
|
||||
|
||||
@property
|
||||
def trust_remote_code(self) -> bool:
|
||||
return self.model_config.trust_remote_code
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
model_config = self.model_config
|
||||
|
||||
# The tokenizer is consistent with the model by default.
|
||||
if not self.tokenizer:
|
||||
self.tokenizer = (
|
||||
ModelConfig.model
|
||||
if model_config is None
|
||||
else model_config.original_model
|
||||
)
|
||||
if not self.tokenizer_revision:
|
||||
self.tokenizer_revision = (
|
||||
ModelConfig.revision if model_config is None else model_config.revision
|
||||
)
|
||||
|
||||
self.original_tokenizer = self.tokenizer
|
||||
self.tokenizer = maybe_model_redirect(self.original_tokenizer)
|
||||
self.maybe_pull_tokenizer_for_runai(self.tokenizer)
|
||||
|
||||
# Multimodal GGUF models must use original repo for mm processing
|
||||
is_multimodal_model = (
|
||||
ModelConfig.is_multimodal_model
|
||||
if model_config is None
|
||||
else model_config.is_multimodal_model
|
||||
)
|
||||
if is_gguf(self.tokenizer) and is_multimodal_model:
|
||||
raise ValueError(
|
||||
"Loading a multimodal GGUF model needs to use original "
|
||||
"tokenizer. Please specify the unquantized hf model's "
|
||||
"repo name or path using the --tokenizer argument."
|
||||
)
|
||||
|
||||
def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
|
||||
"""Pull tokenizer from Object Storage to temporary directory when needed."""
|
||||
if not is_runai_obj_uri(tokenizer):
|
||||
return
|
||||
|
||||
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
|
||||
object_storage_tokenizer.pull_files(
|
||||
tokenizer,
|
||||
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
|
||||
)
|
||||
self.tokenizer = object_storage_tokenizer.dir
|
||||
@@ -322,11 +322,16 @@ class SpeculativeConfig:
|
||||
self.draft_model_config = ModelConfig(
|
||||
model=self.model,
|
||||
runner="draft",
|
||||
tokenizer=self.target_model_config.tokenizer,
|
||||
tokenizer_mode=self.target_model_config.tokenizer_mode,
|
||||
trust_remote_code=self.target_model_config.trust_remote_code,
|
||||
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
|
||||
allowed_media_domains=self.target_model_config.allowed_media_domains,
|
||||
dtype=self.target_model_config.dtype,
|
||||
seed=self.target_model_config.seed,
|
||||
revision=self.revision,
|
||||
code_revision=self.code_revision,
|
||||
tokenizer_revision=self.target_model_config.tokenizer_revision,
|
||||
spec_target_max_model_len=self.target_model_config.max_model_len,
|
||||
quantization=self.quantization,
|
||||
enforce_eager=self.target_model_config.enforce_eager,
|
||||
|
||||
@@ -39,7 +39,6 @@ from .lora import LoRAConfig
|
||||
from .model import ModelConfig
|
||||
from .observability import ObservabilityConfig
|
||||
from .parallel import ParallelConfig
|
||||
from .renderer import RendererConfig
|
||||
from .scheduler import SchedulerConfig
|
||||
from .speculative import SpeculativeConfig
|
||||
from .structured_outputs import StructuredOutputsConfig
|
||||
@@ -182,8 +181,6 @@ class VllmConfig:
|
||||
# try to download a model
|
||||
model_config: ModelConfig = Field(default=None)
|
||||
"""Model configuration."""
|
||||
renderer_config: RendererConfig = Field(default_factory=RendererConfig)
|
||||
"""Renderer configuration."""
|
||||
cache_config: CacheConfig = Field(default_factory=CacheConfig)
|
||||
"""Cache configuration."""
|
||||
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
|
||||
@@ -744,7 +741,7 @@ class VllmConfig:
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
|
||||
self.scheduler_config.max_num_encoder_input_tokens = (
|
||||
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
|
||||
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
||||
)
|
||||
logger.debug(
|
||||
"Encoder-decoder model detected: setting "
|
||||
@ -1189,13 +1186,11 @@ class VllmConfig:
|
||||
computed_compile_ranges_split_points
|
||||
)
|
||||
|
||||
def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
|
||||
# Can only be called during try_verify_and_update_config
|
||||
self.model_config.recalculate_max_model_len(
|
||||
original_max_model_len,
|
||||
tokenizer=self.renderer_config.tokenizer,
|
||||
tokenizer_revision=self.renderer_config.tokenizer_revision,
|
||||
)
|
||||
def recalculate_max_model_len(self, max_model_len: int):
|
||||
# Can only be called in try_verify_and_update_config
|
||||
model_config = self.model_config
|
||||
max_model_len = model_config.get_and_verify_max_len(max_model_len)
|
||||
self.model_config.max_model_len = max_model_len
|
||||
|
||||
def try_verify_and_update_config(self):
|
||||
if self.model_config is None:
|
||||
@ -1269,11 +1264,11 @@ class VllmConfig:
|
||||
return (
|
||||
f"model={self.model_config.model!r}, "
|
||||
f"speculative_config={self.speculative_config!r}, "
|
||||
f"tokenizer={self.renderer_config.tokenizer!r}, "
|
||||
f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
|
||||
f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
|
||||
f"tokenizer={self.model_config.tokenizer!r}, "
|
||||
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
|
||||
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||
f"revision={self.model_config.revision}, "
|
||||
f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
|
||||
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
||||
f"dtype={self.model_config.dtype}, "
|
||||
f"max_seq_len={self.model_config.max_model_len}, "
|
||||
|
||||
@ -71,11 +71,11 @@ from vllm.config.model import (
|
||||
ModelDType,
|
||||
RunnerOption,
|
||||
TaskOption,
|
||||
TokenizerMode,
|
||||
)
|
||||
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
|
||||
from vllm.config.observability import DetailedTraceModules
|
||||
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
|
||||
from vllm.config.renderer import RendererConfig, TokenizerMode
|
||||
from vllm.config.scheduler import SchedulerPolicy
|
||||
from vllm.config.utils import get_field
|
||||
from vllm.config.vllm import OptimizationLevel
|
||||
@ -355,12 +355,17 @@ class EngineArgs:
|
||||
|
||||
model: str = ModelConfig.model
|
||||
served_model_name: str | list[str] | None = ModelConfig.served_model_name
|
||||
tokenizer: str | None = ModelConfig.tokenizer
|
||||
hf_config_path: str | None = ModelConfig.hf_config_path
|
||||
runner: RunnerOption = ModelConfig.runner
|
||||
convert: ConvertOption = ModelConfig.convert
|
||||
task: TaskOption | None = ModelConfig.task
|
||||
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
|
||||
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
|
||||
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
|
||||
trust_remote_code: bool = ModelConfig.trust_remote_code
|
||||
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
|
||||
allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
|
||||
download_dir: str | None = LoadConfig.download_dir
|
||||
safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
|
||||
load_format: str | LoadFormats = LoadConfig.load_format
|
||||
@ -444,6 +449,7 @@ class EngineArgs:
|
||||
code_revision: str | None = ModelConfig.code_revision
|
||||
hf_token: bool | str | None = ModelConfig.hf_token
|
||||
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
|
||||
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
|
||||
quantization: QuantizationMethods | None = ModelConfig.quantization
|
||||
enforce_eager: bool = ModelConfig.enforce_eager
|
||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||
@ -452,6 +458,9 @@ class EngineArgs:
|
||||
)
|
||||
enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
|
||||
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
|
||||
MultiModalConfig, "media_io_kwargs"
|
||||
)
|
||||
mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
|
||||
disable_mm_preprocessor_cache: bool = False # DEPRECATED
|
||||
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
|
||||
@ -465,19 +474,9 @@ class EngineArgs:
|
||||
mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
|
||||
MultiModalConfig.mm_encoder_attn_backend
|
||||
)
|
||||
io_processor_plugin: str | None = None
|
||||
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
|
||||
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
|
||||
# Renderer fields
|
||||
tokenizer: str | None = None
|
||||
tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode
|
||||
tokenizer_revision: str | None = RendererConfig.tokenizer_revision
|
||||
skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init
|
||||
io_processor_plugin: str | None = None
|
||||
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
|
||||
RendererConfig, "media_io_kwargs"
|
||||
)
|
||||
allowed_local_media_path: str = RendererConfig.allowed_local_media_path
|
||||
allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains
|
||||
# LoRA fields
|
||||
enable_lora: bool = False
|
||||
max_loras: int = LoRAConfig.max_loras
|
||||
@ -628,14 +627,25 @@ class EngineArgs:
|
||||
model_group.add_argument("--runner", **model_kwargs["runner"])
|
||||
model_group.add_argument("--convert", **model_kwargs["convert"])
|
||||
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
|
||||
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
|
||||
model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
|
||||
model_group.add_argument(
|
||||
"--trust-remote-code", **model_kwargs["trust_remote_code"]
|
||||
)
|
||||
model_group.add_argument("--dtype", **model_kwargs["dtype"])
|
||||
model_group.add_argument("--seed", **model_kwargs["seed"])
|
||||
model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
|
||||
model_group.add_argument(
|
||||
"--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--allowed-media-domains", **model_kwargs["allowed_media_domains"]
|
||||
)
|
||||
model_group.add_argument("--revision", **model_kwargs["revision"])
|
||||
model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
|
||||
model_group.add_argument(
|
||||
"--tokenizer-revision", **model_kwargs["tokenizer_revision"]
|
||||
)
|
||||
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
|
||||
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
|
||||
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
|
||||
@ -647,6 +657,9 @@ class EngineArgs:
|
||||
model_group.add_argument(
|
||||
"--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
|
||||
)
|
||||
model_group.add_argument(
|
||||
"--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
|
||||
)
|
||||
@ -685,34 +698,8 @@ class EngineArgs:
|
||||
model_group.add_argument(
|
||||
"--logits-processors", **model_kwargs["logits_processors"]
|
||||
)
|
||||
|
||||
# Renderer arguments
|
||||
renderer_kwargs = get_kwargs(RendererConfig)
|
||||
renderer_group = parser.add_argument_group(
|
||||
title="RendererConfig",
|
||||
description=RendererConfig.__doc__,
|
||||
)
|
||||
renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"])
|
||||
renderer_group.add_argument(
|
||||
"--tokenizer-mode", **renderer_kwargs["tokenizer_mode"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--tokenizer-revision", **renderer_kwargs["tokenizer_revision"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--media-io-kwargs", **renderer_kwargs["media_io_kwargs"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--allowed-media-domains", **renderer_kwargs["allowed_media_domains"]
|
||||
)
|
||||
renderer_group.add_argument(
|
||||
"--io-processor-plugin", **renderer_kwargs["io_processor_plugin"]
|
||||
model_group.add_argument(
|
||||
"--io-processor-plugin", **model_kwargs["io_processor_plugin"]
|
||||
)
|
||||
|
||||
# Model loading arguments
|
||||
@ -962,6 +949,9 @@ class EngineArgs:
|
||||
multimodal_group.add_argument(
|
||||
"--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
|
||||
)
|
||||
multimodal_group.add_argument(
|
||||
"--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
|
||||
)
|
||||
multimodal_group.add_argument(
|
||||
"--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
|
||||
)
|
||||
@ -1265,13 +1255,18 @@ class EngineArgs:
|
||||
runner=self.runner,
|
||||
convert=self.convert,
|
||||
task=self.task,
|
||||
tokenizer=self.tokenizer,
|
||||
tokenizer_mode=self.tokenizer_mode,
|
||||
trust_remote_code=self.trust_remote_code,
|
||||
allowed_local_media_path=self.allowed_local_media_path,
|
||||
allowed_media_domains=self.allowed_media_domains,
|
||||
dtype=self.dtype,
|
||||
seed=self.seed,
|
||||
revision=self.revision,
|
||||
code_revision=self.code_revision,
|
||||
hf_token=self.hf_token,
|
||||
hf_overrides=self.hf_overrides,
|
||||
tokenizer_revision=self.tokenizer_revision,
|
||||
max_model_len=self.max_model_len,
|
||||
quantization=self.quantization,
|
||||
enforce_eager=self.enforce_eager,
|
||||
@ -1279,11 +1274,13 @@ class EngineArgs:
|
||||
logprobs_mode=self.logprobs_mode,
|
||||
disable_sliding_window=self.disable_sliding_window,
|
||||
disable_cascade_attn=self.disable_cascade_attn,
|
||||
skip_tokenizer_init=self.skip_tokenizer_init,
|
||||
enable_prompt_embeds=self.enable_prompt_embeds,
|
||||
served_model_name=self.served_model_name,
|
||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||
enable_mm_embeds=self.enable_mm_embeds,
|
||||
interleave_mm_strings=self.interleave_mm_strings,
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
skip_mm_profiling=self.skip_mm_profiling,
|
||||
config_format=self.config_format,
|
||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||
@ -1301,6 +1298,7 @@ class EngineArgs:
|
||||
override_attention_dtype=self.override_attention_dtype,
|
||||
logits_processors=self.logits_processors,
|
||||
video_pruning_rate=self.video_pruning_rate,
|
||||
io_processor_plugin=self.io_processor_plugin,
|
||||
)
|
||||
|
||||
def validate_tensorizer_args(self):
|
||||
@ -1396,25 +1394,9 @@ class EngineArgs:
|
||||
)
|
||||
|
||||
model_config = self.create_model_config()
|
||||
renderer_config = RendererConfig(
|
||||
model_config=model_config,
|
||||
tokenizer=self.tokenizer or "",
|
||||
tokenizer_mode=self.tokenizer_mode,
|
||||
tokenizer_revision=self.tokenizer_revision,
|
||||
skip_tokenizer_init=self.skip_tokenizer_init,
|
||||
io_processor_plugin=self.io_processor_plugin,
|
||||
media_io_kwargs=self.media_io_kwargs,
|
||||
allowed_local_media_path=self.allowed_local_media_path,
|
||||
allowed_media_domains=self.allowed_media_domains,
|
||||
)
|
||||
|
||||
model_config.recalculate_max_model_len(
|
||||
model_config.original_max_model_len,
|
||||
tokenizer=renderer_config.tokenizer,
|
||||
tokenizer_revision=renderer_config.tokenizer_revision,
|
||||
)
|
||||
|
||||
self.model = model_config.model
|
||||
self.tokenizer = model_config.tokenizer
|
||||
|
||||
self._check_feature_supported(model_config)
|
||||
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
|
||||
self._set_default_max_num_seqs_and_batched_tokens_args(
|
||||
@ -1786,7 +1768,6 @@ class EngineArgs:
|
||||
)
|
||||
config = VllmConfig(
|
||||
model_config=model_config,
|
||||
renderer_config=renderer_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
|
||||
@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
|
||||
from collections.abc import AsyncGenerator, Iterable, Mapping
|
||||
from typing import Any
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import PoolingRequestOutput, RequestOutput
|
||||
@ -22,7 +22,6 @@ class EngineClient(ABC):
|
||||
"""Protocol class for Clients to Engine"""
|
||||
|
||||
vllm_config: VllmConfig
|
||||
renderer_config: RendererConfig
|
||||
model_config: ModelConfig
|
||||
input_processor: InputProcessor
|
||||
io_processor: IOProcessor | None
|
||||
|
||||
@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
|
||||
from typing_extensions import Required, TypedDict
|
||||
|
||||
from vllm import envs
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.models import SupportsMultiModal
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
||||
@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
|
||||
|
||||
def _try_get_processor_chat_template(
|
||||
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
|
||||
*,
|
||||
trust_remote_code: bool,
|
||||
model_config: ModelConfig,
|
||||
) -> str | None:
|
||||
cache_key = (tokenizer.name_or_path, trust_remote_code)
|
||||
cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
|
||||
if cache_key in _PROCESSOR_CHAT_TEMPLATES:
|
||||
return _PROCESSOR_CHAT_TEMPLATES[cache_key]
|
||||
|
||||
@ -467,7 +466,7 @@ def _try_get_processor_chat_template(
|
||||
PreTrainedTokenizerFast,
|
||||
ProcessorMixin,
|
||||
),
|
||||
trust_remote_code=trust_remote_code,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
if (
|
||||
isinstance(processor, ProcessorMixin)
|
||||
@ -500,10 +499,7 @@ def resolve_hf_chat_template(
|
||||
|
||||
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
||||
if tools is None:
|
||||
chat_template = _try_get_processor_chat_template(
|
||||
tokenizer,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
chat_template = _try_get_processor_chat_template(tokenizer, model_config)
|
||||
if chat_template is not None:
|
||||
return chat_template
|
||||
|
||||
@ -517,10 +513,10 @@ def resolve_hf_chat_template(
|
||||
exc_info=True,
|
||||
)
|
||||
|
||||
# 4th priority: Predefined fallbacks]
|
||||
# 4th priority: Predefined fallbacks
|
||||
path = get_chat_template_fallback_path(
|
||||
model_type=model_config.hf_config.model_type,
|
||||
tokenizer_name_or_path=tokenizer.name_or_path,
|
||||
tokenizer_name_or_path=model_config.tokenizer,
|
||||
)
|
||||
if path is not None:
|
||||
logger.info_once(
|
||||
@ -542,14 +538,14 @@ def _resolve_chat_template_content_format(
|
||||
tools: list[dict[str, Any]] | None,
|
||||
tokenizer: TokenizerLike | None,
|
||||
*,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> _ChatTemplateContentFormat:
|
||||
if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
|
||||
hf_chat_template = resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=chat_template,
|
||||
tools=tools,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
else:
|
||||
hf_chat_template = None
|
||||
@ -599,7 +595,7 @@ def resolve_chat_template_content_format(
|
||||
given_format: ChatTemplateContentFormatOption,
|
||||
tokenizer: TokenizerLike | None,
|
||||
*,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> _ChatTemplateContentFormat:
|
||||
if given_format != "auto":
|
||||
return given_format
|
||||
@ -608,7 +604,7 @@ def resolve_chat_template_content_format(
|
||||
chat_template,
|
||||
tools,
|
||||
tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
_log_chat_template_content_format(
|
||||
@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
maximum per prompt.
|
||||
"""
|
||||
|
||||
def __init__(self, renderer_config: RendererConfig):
|
||||
def __init__(self, model_config: ModelConfig):
|
||||
super().__init__()
|
||||
|
||||
self._renderer_config = renderer_config
|
||||
self._model_config = model_config
|
||||
|
||||
self._items_by_modality = defaultdict[str, list[_T | None]](list)
|
||||
self._uuids_by_modality = defaultdict[str, list[str | None]](list)
|
||||
|
||||
@property
|
||||
def renderer_config(self) -> RendererConfig:
|
||||
return self._renderer_config
|
||||
def model_config(self) -> ModelConfig:
|
||||
return self._model_config
|
||||
|
||||
@cached_property
|
||||
def model_cls(self) -> type[SupportsMultiModal]:
|
||||
from vllm.model_executor.model_loader import get_model_cls
|
||||
|
||||
model_cls = get_model_cls(self.renderer_config.model_config)
|
||||
model_cls = get_model_cls(self.model_config)
|
||||
return cast(type[SupportsMultiModal], model_cls)
|
||||
|
||||
@property
|
||||
def allowed_local_media_path(self):
|
||||
return self._renderer_config.allowed_local_media_path
|
||||
return self._model_config.allowed_local_media_path
|
||||
|
||||
@property
|
||||
def allowed_media_domains(self):
|
||||
return self._renderer_config.allowed_media_domains
|
||||
return self._model_config.allowed_media_domains
|
||||
|
||||
@property
|
||||
def mm_registry(self):
|
||||
@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
||||
|
||||
@cached_property
|
||||
def mm_processor(self):
|
||||
return self.mm_registry.create_processor(self.renderer_config)
|
||||
return self.mm_registry.create_processor(self.model_config)
|
||||
|
||||
def add(
|
||||
self,
|
||||
@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
||||
super().__init__()
|
||||
|
||||
self._tracker = tracker
|
||||
multimodal_config = self._tracker.model_config.multimodal_config
|
||||
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||
|
||||
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
||||
envs.VLLM_MEDIA_CONNECTOR,
|
||||
media_io_kwargs=self.renderer_config.media_io_kwargs,
|
||||
media_io_kwargs=media_io_kwargs,
|
||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||
allowed_media_domains=tracker.allowed_media_domains,
|
||||
)
|
||||
|
||||
@property
|
||||
def renderer_config(self) -> RendererConfig:
|
||||
return self._tracker.renderer_config
|
||||
|
||||
@property
|
||||
def model_config(self) -> ModelConfig:
|
||||
return self.renderer_config.model_config
|
||||
return self._tracker.model_config
|
||||
|
||||
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
||||
image = self._connector.fetch_image(image_url) if image_url else None
|
||||
@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
||||
super().__init__()
|
||||
|
||||
self._tracker = tracker
|
||||
multimodal_config = self._tracker.model_config.multimodal_config
|
||||
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
||||
envs.VLLM_MEDIA_CONNECTOR,
|
||||
media_io_kwargs=self.renderer_config.media_io_kwargs,
|
||||
media_io_kwargs=media_io_kwargs,
|
||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||
allowed_media_domains=tracker.allowed_media_domains,
|
||||
)
|
||||
|
||||
@property
|
||||
def renderer_config(self) -> RendererConfig:
|
||||
return self._tracker.renderer_config
|
||||
|
||||
@property
|
||||
def model_config(self) -> ModelConfig:
|
||||
return self.renderer_config.model_config
|
||||
return self._tracker.model_config
|
||||
|
||||
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
||||
image_coro = self._connector.fetch_image_async(image_url) if image_url else None
|
||||
@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
|
||||
|
||||
def parse_chat_messages(
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
content_format: _ChatTemplateContentFormat,
|
||||
) -> tuple[
|
||||
list[ConversationMessage],
|
||||
MultiModalDataDict | None,
|
||||
MultiModalUUIDDict | None,
|
||||
]:
|
||||
model_config = renderer_config.model_config
|
||||
|
||||
conversation: list[ConversationMessage] = []
|
||||
mm_tracker = MultiModalItemTracker(renderer_config)
|
||||
mm_tracker = MultiModalItemTracker(model_config)
|
||||
|
||||
for msg in messages:
|
||||
sub_messages = _parse_chat_message_content(
|
||||
@ -1644,17 +1635,15 @@ def parse_chat_messages(
|
||||
|
||||
def parse_chat_messages_futures(
|
||||
messages: list[ChatCompletionMessageParam],
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
content_format: _ChatTemplateContentFormat,
|
||||
) -> tuple[
|
||||
list[ConversationMessage],
|
||||
Awaitable[MultiModalDataDict | None],
|
||||
MultiModalUUIDDict | None,
|
||||
]:
|
||||
model_config = renderer_config.model_config
|
||||
|
||||
conversation: list[ConversationMessage] = []
|
||||
mm_tracker = AsyncMultiModalItemTracker(renderer_config)
|
||||
mm_tracker = AsyncMultiModalItemTracker(model_config)
|
||||
|
||||
for msg in messages:
|
||||
sub_messages = _parse_chat_message_content(
|
||||
@ -1759,14 +1748,14 @@ def apply_hf_chat_template(
|
||||
chat_template: str | None,
|
||||
tools: list[dict[str, Any]] | None,
|
||||
*,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
**kwargs: Any,
|
||||
) -> str:
|
||||
hf_chat_template = resolve_hf_chat_template(
|
||||
tokenizer,
|
||||
chat_template=chat_template,
|
||||
tools=tools,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
if hf_chat_template is None:
|
||||
|
||||
@ -29,8 +29,8 @@ from vllm.config.model import (
|
||||
HfOverrides,
|
||||
ModelDType,
|
||||
RunnerOption,
|
||||
TokenizerMode,
|
||||
)
|
||||
from vllm.config.renderer import TokenizerMode
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionMessageParam,
|
||||
@ -343,7 +343,6 @@ class LLM:
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
self.supported_tasks = supported_tasks
|
||||
|
||||
self.renderer_config = self.llm_engine.renderer_config
|
||||
self.model_config = self.llm_engine.model_config
|
||||
self.input_processor = self.llm_engine.input_processor
|
||||
self.io_processor = self.llm_engine.io_processor
|
||||
@ -809,13 +808,13 @@ class LLM:
|
||||
list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
|
||||
|
||||
tokenizer = self.get_tokenizer()
|
||||
renderer_config = self.renderer_config
|
||||
model_config = self.model_config
|
||||
resolved_content_format = resolve_chat_template_content_format(
|
||||
chat_template,
|
||||
tools,
|
||||
chat_template_content_format,
|
||||
tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
_chat_template_kwargs: dict[str, Any] = dict(
|
||||
@ -834,7 +833,7 @@ class LLM:
|
||||
# the chat message parsing for it.
|
||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||
msgs,
|
||||
renderer_config,
|
||||
model_config,
|
||||
content_format=resolved_content_format,
|
||||
)
|
||||
|
||||
@ -848,7 +847,7 @@ class LLM:
|
||||
prompt_str = apply_hf_chat_template(
|
||||
tokenizer=tokenizer,
|
||||
conversation=conversation,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
**_chat_template_kwargs,
|
||||
)
|
||||
# Special tokens are already included in chat templates so
|
||||
@ -1291,7 +1290,6 @@ class LLM:
|
||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||
tokenization_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[ScoringRequestOutput]:
|
||||
renderer_config = self.renderer_config
|
||||
model_config = self.model_config
|
||||
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
@ -1319,7 +1317,7 @@ class LLM:
|
||||
|
||||
for q, d in input_pairs:
|
||||
_, engine_prompt = get_score_prompt(
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
data_1=q,
|
||||
data_2=d,
|
||||
tokenizer=tokenizer,
|
||||
|
||||
@ -1099,7 +1099,7 @@ async def init_app_state(
|
||||
logger.info("Supported tasks: %s", supported_tasks)
|
||||
|
||||
resolved_chat_template = await process_chat_template(
|
||||
args.chat_template, engine_client, vllm_config.renderer_config
|
||||
args.chat_template, engine_client, vllm_config.model_config
|
||||
)
|
||||
|
||||
if args.tool_server == "demo":
|
||||
|
||||
@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
||||
try:
|
||||
lora_request = self._maybe_get_adapters(request)
|
||||
|
||||
if self.renderer_config.skip_tokenizer_init:
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = await self.engine_client.get_tokenizer()
|
||||
|
||||
@ -291,7 +291,6 @@ class OpenAIServing:
|
||||
|
||||
self.input_processor = self.models.input_processor
|
||||
self.io_processor = self.models.io_processor
|
||||
self.renderer_config = self.models.renderer_config
|
||||
self.model_config = self.models.model_config
|
||||
self.max_model_len = self.model_config.max_model_len
|
||||
|
||||
@ -1101,18 +1100,18 @@ class OpenAIServing:
|
||||
Sequence[RequestPrompt],
|
||||
list[EngineTokensPrompt],
|
||||
]:
|
||||
renderer_config = self.renderer_config
|
||||
model_config = self.model_config
|
||||
|
||||
resolved_content_format = resolve_chat_template_content_format(
|
||||
chat_template,
|
||||
tool_dicts,
|
||||
chat_template_content_format,
|
||||
tokenizer,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
|
||||
messages,
|
||||
renderer_config,
|
||||
model_config,
|
||||
content_format=resolved_content_format,
|
||||
)
|
||||
|
||||
@ -1139,14 +1138,14 @@ class OpenAIServing:
|
||||
request_prompt = tokenizer.apply_chat_template(
|
||||
conversation=conversation,
|
||||
messages=messages,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
**_chat_template_kwargs,
|
||||
)
|
||||
else:
|
||||
request_prompt = apply_hf_chat_template(
|
||||
tokenizer=tokenizer,
|
||||
conversation=conversation,
|
||||
renderer_config=renderer_config,
|
||||
model_config=model_config,
|
||||
**_chat_template_kwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -71,7 +71,6 @@ class OpenAIServingModels:
|
||||
|
||||
self.input_processor = self.engine_client.input_processor
|
||||
self.io_processor = self.engine_client.io_processor
|
||||
self.renderer_config = self.engine_client.renderer_config
|
||||
self.model_config = self.engine_client.model_config
|
||||
self.max_model_len = self.model_config.max_model_len
|
||||
|
||||
|
||||
@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
self.task_type = task_type
|
||||
|
||||
self.asr_config = self.model_cls.get_speech_to_text_config(
|
||||
self.renderer_config, task_type
|
||||
self.model_config, task_type
|
||||
)
|
||||
|
||||
self.enable_force_include_usage = enable_force_include_usage
|
||||
@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
self.tokenizer = cast(
|
||||
PreTrainedTokenizerBase,
|
||||
get_tokenizer(
|
||||
tokenizer_name=self.renderer_config.tokenizer,
|
||||
tokenizer_mode=self.renderer_config.tokenizer_mode,
|
||||
tokenizer_name=self.model_config.tokenizer,
|
||||
tokenizer_mode=self.model_config.tokenizer_mode,
|
||||
),
|
||||
)
|
||||
|
||||
@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
prompt = self.model_cls.get_generation_prompt(
|
||||
audio=chunk,
|
||||
stt_config=self.asr_config,
|
||||
renderer_config=self.renderer_config,
|
||||
model_config=self.model_config,
|
||||
language=language,
|
||||
task_type=self.task_type,
|
||||
request_prompt=request.prompt,
|
||||
@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
|
||||
if res.prompt_token_ids is not None:
|
||||
num_prompt_tokens = len(res.prompt_token_ids)
|
||||
if audio_tokens := self.model_cls.get_num_audio_tokens(
|
||||
audio_duration_s, self.asr_config, self.renderer_config
|
||||
audio_duration_s, self.asr_config, self.model_config
|
||||
):
|
||||
num_prompt_tokens += audio_tokens
|
||||
|
||||
|
||||
@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
|
||||
try:
|
||||
lora_request = self._maybe_get_adapters(request)
|
||||
|
||||
if self.renderer_config.skip_tokenizer_init:
|
||||
if self.model_config.skip_tokenizer_init:
|
||||
tokenizer = None
|
||||
else:
|
||||
tokenizer = await self.engine_client.get_tokenizer()
|
||||
|
||||
@ -160,8 +160,10 @@ class ServingScores(OpenAIServing):
|
||||
data_1: str | ScoreContentPartParam,
|
||||
data_2: str | ScoreContentPartParam,
|
||||
) -> tuple[str, TokensPrompt]:
|
||||
model_config = self.model_config
|
||||
|
||||
full_prompt, engine_prompt = get_score_prompt(
|
||||
renderer_config=self.renderer_config,
|
||||
model_config=model_config,
|
||||
data_1=data_1,
|
||||
data_2=data_2,
|
||||
tokenizer=tokenizer,
|
||||
|
||||
@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
|
||||
from torch.nn import CosineSimilarity
|
||||
from typing_extensions import Required, TypedDict
|
||||
|
||||
from vllm.config import ModelConfig, RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
BaseMultiModalItemTracker,
|
||||
ChatCompletionContentPartImageEmbedsParam,
|
||||
@ -88,9 +88,9 @@ def _validate_score_input_lens(
|
||||
def parse_score_data(
|
||||
data_1: str | ScoreContentPartParam,
|
||||
data_2: str | ScoreContentPartParam,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> tuple[str, str, MultiModalDataDict | None]:
|
||||
mm_tracker = MultiModalItemTracker(renderer_config)
|
||||
mm_tracker = MultiModalItemTracker(model_config)
|
||||
|
||||
content_1 = _parse_score_content(data_1, mm_tracker)
|
||||
content_2 = _parse_score_content(data_2, mm_tracker)
|
||||
@ -176,7 +176,7 @@ def post_process_tokens(
|
||||
|
||||
|
||||
def get_score_prompt(
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
tokenizer: TokenizerLike,
|
||||
tokenization_kwargs: dict[str, Any],
|
||||
data_1: str | ScoreContentPartParam,
|
||||
@ -185,14 +185,11 @@ def get_score_prompt(
|
||||
prompt_1, prompt_2, mm_data = parse_score_data(
|
||||
data_1,
|
||||
data_2,
|
||||
renderer_config,
|
||||
model_config,
|
||||
)
|
||||
|
||||
from vllm.model_executor.model_loader import get_model_cls
|
||||
|
||||
model_config = renderer_config.model_config
|
||||
model = get_model_cls(model_config)
|
||||
|
||||
if supports_score_template(model):
|
||||
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
|
||||
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
|
||||
|
||||
@ -13,7 +13,7 @@ from fastapi import Request
|
||||
from fastapi.responses import JSONResponse, StreamingResponse
|
||||
from starlette.background import BackgroundTask, BackgroundTasks
|
||||
|
||||
from vllm.config import RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.engine.arg_utils import EngineArgs
|
||||
from vllm.engine.protocol import EngineClient
|
||||
from vllm.entrypoints.chat_utils import (
|
||||
@ -288,7 +288,7 @@ def process_lora_modules(
|
||||
async def process_chat_template(
|
||||
args_chat_template: Path | str | None,
|
||||
engine_client: EngineClient,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> str | None:
|
||||
resolved_chat_template = load_chat_template(args_chat_template)
|
||||
if resolved_chat_template is not None:
|
||||
@ -305,7 +305,7 @@ async def process_chat_template(
|
||||
tokenizer=tokenizer,
|
||||
chat_template=None,
|
||||
tools=None,
|
||||
model_config=renderer_config.model_config,
|
||||
model_config=model_config,
|
||||
)
|
||||
|
||||
if hf_chat_template != resolved_chat_template:
|
||||
@ -314,6 +314,6 @@ async def process_chat_template(
|
||||
"It is different from official chat template '%s'. "
|
||||
"This discrepancy may lead to performance degradation.",
|
||||
resolved_chat_template,
|
||||
renderer_config.model_config.model,
|
||||
model_config.model,
|
||||
)
|
||||
return resolved_chat_template
|
||||
|
||||
@ -6,7 +6,7 @@ from typing import Any, cast
|
||||
|
||||
from typing_extensions import assert_never
|
||||
|
||||
from vllm.config import RendererConfig
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||
@ -45,15 +45,14 @@ logger = init_logger(__name__)
|
||||
class InputPreprocessor:
|
||||
def __init__(
|
||||
self,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
tokenizer: TokenizerLike | None,
|
||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.renderer_config = renderer_config
|
||||
self.model_config = renderer_config.model_config
|
||||
self.model_config = model_config
|
||||
self.tokenizer = tokenizer
|
||||
self.mm_registry = mm_registry
|
||||
self.mm_processor_cache = mm_processor_cache
|
||||
@ -232,7 +231,7 @@ class InputPreprocessor:
|
||||
def _get_mm_processor(self) -> BaseMultiModalProcessor:
|
||||
if not hasattr(self, "_mm_processor"):
|
||||
self._mm_processor = self.mm_registry.create_processor(
|
||||
self.renderer_config,
|
||||
self.model_config,
|
||||
tokenizer=self.tokenizer,
|
||||
cache=self.mm_processor_cache,
|
||||
)
|
||||
|
||||
@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
renderer_config = model.vllm_config.renderer_config
|
||||
model_config = model.vllm_config.model_config
|
||||
quant_config = model.vllm_config.quant_config
|
||||
text_config = model.config.get_text_config()
|
||||
|
||||
@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
revision=renderer_config.tokenizer_revision,
|
||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model_config.tokenizer,
|
||||
revision=model_config.tokenizer_revision,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
false_id = tokenizer.convert_tokens_to_ids(tokens[0])
|
||||
@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
renderer_config = model.vllm_config.renderer_config
|
||||
model_config = model.vllm_config.model_config
|
||||
quant_config = model.vllm_config.quant_config
|
||||
text_config = model.config.get_text_config()
|
||||
|
||||
@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
|
||||
from vllm.tokenizers import get_tokenizer
|
||||
|
||||
tokenizer = get_tokenizer(
|
||||
renderer_config.tokenizer,
|
||||
revision=renderer_config.tokenizer_revision,
|
||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
||||
trust_remote_code=renderer_config.trust_remote_code,
|
||||
model_config.tokenizer,
|
||||
revision=model_config.tokenizer_revision,
|
||||
tokenizer_mode=model_config.tokenizer_mode,
|
||||
trust_remote_code=model_config.trust_remote_code,
|
||||
)
|
||||
|
||||
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
|
||||
|
||||
@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.projector_config = config.projector_config
|
||||
self.text_config = config.text_config
|
||||
|
||||
renderer_config = vllm_config.renderer_config
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
model_config = vllm_config.model_config
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
|
||||
|
||||
self.sam_model = build_sam_vit_b()
|
||||
|
||||
@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
||||
self.projector_config = config.projector_config
|
||||
self.text_config = config.text_config
|
||||
|
||||
renderer_config = vllm_config.renderer_config
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
model_config = vllm_config.model_config
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
|
||||
|
||||
self.vision = self._init_vision_module(
|
||||
|
||||
@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
|
||||
)
|
||||
from transformers.models.siglip import SiglipImageProcessorFast
|
||||
|
||||
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
language: Optional[str],
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
@ -798,9 +798,7 @@ class Gemma3nForConditionalGeneration(
|
||||
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
task_type: str,
|
||||
cls, model_config: ModelConfig, task_type: str
|
||||
) -> SpeechToTextConfig:
|
||||
return SpeechToTextConfig(
|
||||
# Let's set this to 30 as suggested in the docs for now, although
|
||||
|
||||
@ -34,7 +34,7 @@ import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from transformers import BatchFeature, PretrainedConfig
|
||||
|
||||
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||
@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
|
||||
def get_generation_prompt(
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
stt_config: SpeechToTextConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
|
||||
else:
|
||||
raise ValueError(f"Unsupported task type {task_type}")
|
||||
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
chat = [dict(role="user", content=user_prompt)]
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
chat,
|
||||
@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
|
||||
cls,
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> int | None:
|
||||
"""Get the number of audio tokens for an audio duration in sec."""
|
||||
processor = cached_processor_from_config(renderer_config)
|
||||
processor = cached_processor_from_config(model_config)
|
||||
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
|
||||
proj_win_size = processor.audio_processor.projector_window_size
|
||||
ds_rate = processor.audio_processor.projector_downsample_rate
|
||||
@ -903,9 +903,7 @@ class GraniteSpeechForConditionalGeneration(
|
||||
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
task_type: str,
|
||||
cls, model_config: ModelConfig, task_type: str
|
||||
) -> SpeechToTextConfig:
|
||||
"""Get the stt config for this model."""
|
||||
# Default settings are reasonable for this model and we don't currently
|
||||
|
||||
@ -6,7 +6,7 @@ import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.pooler import (
|
||||
DispatchPooler,
|
||||
@ -29,12 +29,12 @@ logger = init_logger(__name__)
|
||||
class GritLMMeanPool(nn.Module):
|
||||
"""As `MeanPool`, but only includes non-instruction tokens."""
|
||||
|
||||
def __init__(self, renderer_config: RendererConfig):
|
||||
def __init__(self, model_config: ModelConfig):
|
||||
super().__init__()
|
||||
|
||||
self.renderer_config = renderer_config
|
||||
self.model_config = model_config
|
||||
|
||||
tokenizer = cached_tokenizer_from_config(self.renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(self.model_config)
|
||||
|
||||
# Collect the tokens needed for pattern matching.
|
||||
# "▁<" is different from "_<". The former uses "▁" to indicate that
|
||||
@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
|
||||
|
||||
|
||||
class GritLMPooler(Pooler):
|
||||
def __init__(self, renderer_config: RendererConfig):
|
||||
def __init__(self, model_config: ModelConfig):
|
||||
super().__init__()
|
||||
|
||||
self.pooling = GritLMMeanPool(renderer_config)
|
||||
self.pooling = GritLMMeanPool(model_config)
|
||||
self.head = PoolerHead(PoolerNormalize())
|
||||
|
||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||
@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
|
||||
self.pooler = DispatchPooler(
|
||||
{
|
||||
"token_embed": Pooler.for_token_embed(pooler_config),
|
||||
"embed": GritLMPooler(vllm_config.renderer_config),
|
||||
"embed": GritLMPooler(vllm_config.model_config),
|
||||
}
|
||||
)
|
||||
|
||||
@ -19,7 +19,7 @@ from torch import Tensor
|
||||
from transformers.models.whisper.tokenization_whisper import LANGUAGES
|
||||
from typing_extensions import Self, TypeIs
|
||||
|
||||
from vllm.config import RendererConfig, SpeechToTextConfig
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig
|
||||
from vllm.inputs import TokensPrompt
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
@ -930,9 +930,7 @@ class SupportsTranscription(Protocol):
|
||||
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
|
||||
) -> SpeechToTextConfig:
|
||||
"""Get the speech to text config for the ASR model."""
|
||||
...
|
||||
@ -942,7 +940,7 @@ class SupportsTranscription(Protocol):
|
||||
cls,
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> int | None:
|
||||
"""
|
||||
Map from audio duration to number of audio tokens produced by the ASR
|
||||
|
||||
@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
|
||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||
hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
|
||||
hf_processor.video_processor = cached_video_processor_from_config(
|
||||
self.ctx.renderer_config,
|
||||
self.ctx.model_config,
|
||||
processor_cls=InternVLVideoProcessor,
|
||||
size=hf_processor.image_processor.size,
|
||||
**kwargs,
|
||||
|
||||
@ -1169,17 +1169,16 @@ class NemotronH_Nano_VL_V2(
|
||||
self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
|
||||
|
||||
self.config = config
|
||||
self.model_config = vllm_config.model_config
|
||||
|
||||
# Pre-tokenize special tokens for video processing
|
||||
# to avoid repeated tokenization
|
||||
self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
|
||||
self._img_start_token_ids = self._tokenizer.encode(
|
||||
tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
|
||||
self._img_start_token_ids = tokenizer.encode(
|
||||
IMG_START, add_special_tokens=False
|
||||
)
|
||||
self._img_end_token_ids = self._tokenizer.encode(
|
||||
IMG_END, add_special_tokens=False
|
||||
)
|
||||
self._img_context_token_ids = self._tokenizer.encode(
|
||||
self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
|
||||
self._img_context_token_ids = tokenizer.encode(
|
||||
IMG_CONTEXT, add_special_tokens=False
|
||||
)
|
||||
|
||||
@ -1365,7 +1364,7 @@ class NemotronH_Nano_VL_V2(
|
||||
input_embeds for the LLM.
|
||||
"""
|
||||
device = video_embeddings.device
|
||||
tokenizer = self._tokenizer
|
||||
tokenizer = cached_tokenizer_from_config(self.model_config)
|
||||
|
||||
# Generate video replacement token IDs using get_video_repl
|
||||
# This tokenizes each frame separator independently, then uses pre-tokenized
|
||||
|
||||
@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
|
||||
|
||||
def get_image_processor(self, **kwargs: object):
|
||||
return cached_image_processor_from_config(
|
||||
self.ctx.renderer_config,
|
||||
self.ctx.model_config,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
|
||||
@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
|
||||
|
||||
class PixtralProcessingInfo(BaseProcessingInfo):
|
||||
def get_tokenizer(self) -> MistralTokenizer:
|
||||
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
|
||||
if not isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
||||
|
||||
|
||||
@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
|
||||
from transformers import BatchFeature, TensorType, WhisperConfig
|
||||
from transformers.tokenization_utils_base import TextInput
|
||||
|
||||
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.inputs.data import PromptType
|
||||
from vllm.logger import init_logger
|
||||
@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
|
||||
|
||||
class VoxtralProcessingInfo(BaseProcessingInfo):
|
||||
def get_tokenizer(self) -> MistralTokenizer:
|
||||
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
|
||||
if not isinstance(tokenizer, MistralTokenizer):
|
||||
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
||||
|
||||
@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
|
||||
|
||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||
super().__init__()
|
||||
self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
|
||||
self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
|
||||
|
||||
# update quant config to so that ignored module and target module names
|
||||
# match the vLLM model names
|
||||
@ -450,11 +450,9 @@ class VoxtralForConditionalGeneration(
|
||||
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
task_type: str,
|
||||
cls, model_config: ModelConfig, task_type: str
|
||||
) -> SpeechToTextConfig:
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
audio_config = tokenizer.instruct.audio_encoder.audio_config
|
||||
max_audio_clip_s = audio_config.chunk_length_s
|
||||
sample_rate = audio_config.sampling_rate
|
||||
@ -470,17 +468,17 @@ class VoxtralForConditionalGeneration(
|
||||
def get_generation_prompt(
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
renderer_config: RendererConfig, # not needed here
|
||||
model_config: ModelConfig,
|
||||
stt_config: SpeechToTextConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
request_prompt: str,
|
||||
to_language: str | None,
|
||||
) -> PromptType:
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
|
||||
req = TranscriptionRequest(
|
||||
model=renderer_config.model_config.model,
|
||||
model=model_config.model,
|
||||
audio=RawAudio.from_audio(audio),
|
||||
language=language,
|
||||
)
|
||||
@ -496,14 +494,14 @@ class VoxtralForConditionalGeneration(
|
||||
cls,
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> int | None:
|
||||
"""
|
||||
Map from audio duration to number of audio tokens produced by the ASR
|
||||
model, without running a forward pass.
|
||||
This is used for estimating the amount of processing for this audio.
|
||||
"""
|
||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
||||
tokenizer = cached_tokenizer_from_config(model_config)
|
||||
adapter = VoxtralProcessorAdapter(tokenizer)
|
||||
return adapter.get_num_audio_tokens(
|
||||
int(audio_duration_s * stt_config.sample_rate)
|
||||
|
||||
@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
|
||||
from vllm.attention.backends.abstract import AttentionType
|
||||
from vllm.attention.layer import Attention, MultiHeadAttention
|
||||
from vllm.attention.layers.cross_attention import CrossAttention
|
||||
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||
from vllm.config.multimodal import BaseDummyOptions
|
||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||
from vllm.inputs.data import PromptType
|
||||
@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
|
||||
def get_generation_prompt(
|
||||
cls,
|
||||
audio: np.ndarray,
|
||||
renderer_config: RendererConfig, # not needed here
|
||||
model_config: ModelConfig, # not needed here
|
||||
stt_config: SpeechToTextConfig,
|
||||
language: str | None,
|
||||
task_type: Literal["transcribe", "translate"],
|
||||
@ -847,11 +847,9 @@ class WhisperForConditionalGeneration(
|
||||
|
||||
@classmethod
|
||||
def get_speech_to_text_config(
|
||||
cls,
|
||||
renderer_config: RendererConfig,
|
||||
task_type: str,
|
||||
cls, model_config: ModelConfig, task_type: str
|
||||
) -> SpeechToTextConfig:
|
||||
processor = cached_processor_from_config(renderer_config)
|
||||
processor = cached_processor_from_config(model_config)
|
||||
|
||||
return SpeechToTextConfig(
|
||||
max_audio_clip_s=processor.feature_extractor.chunk_length,
|
||||
@ -863,9 +861,9 @@ class WhisperForConditionalGeneration(
|
||||
cls,
|
||||
audio_duration_s: float,
|
||||
stt_config: SpeechToTextConfig,
|
||||
renderer_config: RendererConfig,
|
||||
model_config: ModelConfig,
|
||||
) -> int | None:
|
||||
processor = cached_processor_from_config(renderer_config)
|
||||
processor = cached_processor_from_config(model_config)
|
||||
hop_length = processor.feature_extractor.hop_length
|
||||
assert hop_length is not None
|
||||
# NOTE(NickLucche) user can't pass encoder
|
||||
|
||||
@ -31,7 +31,7 @@ from .inputs import (
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
||||
from vllm.config import ModelConfig, VllmConfig
|
||||
|
||||
from .processing import ResolvedPromptUpdate
|
||||
from .registry import MultiModalRegistry
|
||||
@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
|
||||
|
||||
|
||||
def _enable_processor_cache(
|
||||
renderer_config: "RendererConfig",
|
||||
model_config: "ModelConfig",
|
||||
mm_registry: "MultiModalRegistry",
|
||||
) -> bool:
|
||||
if not mm_registry.supports_multimodal_inputs(renderer_config):
|
||||
if not mm_registry.supports_multimodal_inputs(model_config):
|
||||
return False
|
||||
|
||||
mm_config = renderer_config.model_config.get_multimodal_config()
|
||||
mm_config = model_config.get_multimodal_config()
|
||||
return mm_config.mm_processor_cache_gb > 0
|
||||
|
||||
|
||||
@ -599,7 +599,7 @@ def processor_cache_from_config(
|
||||
"""Return a `BaseMultiModalProcessorCache`, if enabled."""
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
||||
if not _enable_processor_cache(model_config, mm_registry):
|
||||
return None
|
||||
|
||||
if not _enable_ipc_cache(vllm_config):
|
||||
@ -611,14 +611,14 @@ def processor_cache_from_config(
|
||||
|
||||
|
||||
def processor_only_cache_from_config(
|
||||
renderer_config: "RendererConfig",
|
||||
model_config: "ModelConfig",
|
||||
mm_registry: "MultiModalRegistry",
|
||||
):
|
||||
"""Return a `MultiModalProcessorOnlyCache`, if enabled."""
|
||||
if not _enable_processor_cache(renderer_config, mm_registry):
|
||||
if not _enable_processor_cache(model_config, mm_registry):
|
||||
return None
|
||||
|
||||
return MultiModalProcessorOnlyCache(renderer_config.model_config)
|
||||
return MultiModalProcessorOnlyCache(model_config)
|
||||
|
||||
|
||||
class BaseMultiModalReceiverCache(
|
||||
@ -787,7 +787,7 @@ def engine_receiver_cache_from_config(
|
||||
"""
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
||||
if not _enable_processor_cache(model_config, mm_registry):
|
||||
return None
|
||||
|
||||
if not _enable_ipc_cache(vllm_config):
|
||||
@ -809,7 +809,9 @@ def worker_receiver_cache_from_config(
|
||||
Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
|
||||
mm_processor_cache_type=="shm".
|
||||
"""
|
||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
||||
model_config = vllm_config.model_config
|
||||
|
||||
if not _enable_processor_cache(model_config, mm_registry):
|
||||
return None
|
||||
|
||||
if not _enable_ipc_cache(vllm_config):
|
||||
|
||||
@ -23,7 +23,7 @@ import torch
from typing_extensions import TypeVar, assert_never

from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.processor import cached_processor_from_config
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
@ -53,7 +53,7 @@ if TYPE_CHECKING:
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin

from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig

from .cache import BaseMultiModalProcessorCache
from .profiling import BaseDummyInputsBuilder
@ -63,7 +63,6 @@ else:
ProcessorMixin = object

ModelConfig = object
RendererConfig = object

BaseMultiModalProcessorCache = object

@ -946,29 +945,12 @@ class InputProcessingContext:
modify the inputs.
"""

renderer_config: RendererConfig
"""The configuration of the renderer."""
model_config: ModelConfig
"""The configuration of the model."""

tokenizer: TokenizerLike | None
"""The tokenizer used to tokenize the inputs."""

@classmethod
def from_config(
cls,
renderer_config: RendererConfig,
*,
tokenizer: TokenizerLike | None = None,
):
if tokenizer is None and not renderer_config.skip_tokenizer_init:
tokenizer = cached_tokenizer_from_config(renderer_config)

return cls(renderer_config, tokenizer)

@property
def model_config(self) -> ModelConfig:
"""The configuration of the model."""
return self.renderer_config.model_config

def get_tokenizer(self) -> TokenizerLike:
if self.tokenizer is None:
raise ValueError(
@ -1065,7 +1047,7 @@ class InputProcessingContext:
typ = ProcessorMixin

return cached_processor_from_config(
self.renderer_config,
self.model_config,
processor_cls=typ,
tokenizer=self.tokenizer,
**kwargs,

@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast

from vllm.config.multimodal import BaseDummyOptions
from vllm.logger import init_logger
from vllm.tokenizers import TokenizerLike
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config

from .cache import BaseMultiModalProcessorCache
from .processing import (
@ -22,7 +22,7 @@ from .profiling import (
)

if TYPE_CHECKING:
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig
from vllm.model_executor.models.interfaces import SupportsMultiModal

logger = init_logger(__name__)
@ -114,18 +114,17 @@ class MultiModalRegistry:

return mm_options if len(mm_options) > 0 else None

def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool:
def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
"""
Checks if the model supports multimodal inputs.
Returns True if the model is multimodal with any non-zero supported
modalities, otherwise returns False, effectively running in
text-only mode.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return False

info = self._create_processing_info(renderer_config, tokenizer=None)
info = self._create_processing_info(model_config, tokenizer=None)
supported_modalities = info.get_supported_mm_limits()

mm_config = model_config.get_multimodal_config()
@ -145,7 +144,7 @@ class MultiModalRegistry:

def get_max_tokens_per_item_by_modality(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
cache: BaseMultiModalProcessorCache | None = None,
profiler_limits: Mapping[str, int] | None = None,
@ -154,11 +153,10 @@ class MultiModalRegistry:
Get the maximum number of tokens per data item from each modality based
on underlying model configuration.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return {}

processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)

seq_len = model_config.max_model_len
@ -173,7 +171,7 @@ class MultiModalRegistry:

def get_mm_limits_per_prompt(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
cache: BaseMultiModalProcessorCache | None = None,
) -> Mapping[str, int]:
@ -181,11 +179,10 @@ class MultiModalRegistry:
Get the maximum number of multi-modal input instances for each modality
that are allowed per prompt for a model class.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
return {}

processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)
return profiler.get_mm_limits()

@ -231,21 +228,30 @@ class MultiModalRegistry:
assert hasattr(model_cls, "_processor_factory")
return cast("SupportsMultiModal", model_cls)

def _create_processing_ctx(
self,
model_config: "ModelConfig",
tokenizer: TokenizerLike | None = None,
) -> InputProcessingContext:
if tokenizer is None and not model_config.skip_tokenizer_init:
tokenizer = cached_tokenizer_from_config(model_config)

return InputProcessingContext(model_config, tokenizer)

def _create_processing_info(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
tokenizer: TokenizerLike | None = None,
) -> BaseProcessingInfo:
model_cls = self._get_model_cls(renderer_config.model_config)
model_cls = self._get_model_cls(model_config)
factories = model_cls._processor_factory

ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
ctx = self._create_processing_ctx(model_config, tokenizer)
return factories.info(ctx)

def create_processor(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
*,
tokenizer: TokenizerLike | None = None,
cache: BaseMultiModalProcessorCache | None = None,
@ -253,19 +259,19 @@ class MultiModalRegistry:
"""
Create a multi-modal processor for a specific model and tokenizer.
"""
model_config = renderer_config.model_config
if not model_config.is_multimodal_model:
raise ValueError(f"{model_config.model} is not a multimodal model")

model_cls = self._get_model_cls(model_config)
factories = model_cls._processor_factory

ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
ctx = self._create_processing_ctx(model_config, tokenizer)

return factories.build_processor(ctx, cache=cache)

def get_decoder_dummy_data(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
*,
@ -274,15 +280,15 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.

The model is identified by `renderer_config`.
The model is identified by `model_config`.
"""
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)

# Extract configurable options from multimodal config.
# Only include modalities that use advanced option types so legacy
# count-only behavior remains unchanged.
mm_options = self._extract_mm_options(renderer_config.model_config)
mm_options = self._extract_mm_options(model_config)

dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)

@ -298,7 +304,7 @@ class MultiModalRegistry:

def get_encoder_dummy_data(
self,
renderer_config: "RendererConfig",
model_config: "ModelConfig",
seq_len: int,
mm_counts: Mapping[str, int] | None = None,
*,
@ -307,15 +313,15 @@ class MultiModalRegistry:
"""
Create dummy data for profiling the memory usage of a model.

The model is identified by `renderer_config`.
The model is identified by `model_config`.
"""
processor = self.create_processor(renderer_config, cache=cache)
processor = self.create_processor(model_config, cache=cache)
profiler: MultiModalProfiler = MultiModalProfiler(processor)

# Extract configurable options from multimodal config.
# Only include modalities that use advanced option types so legacy
# count-only behavior remains unchanged.
mm_options = self._extract_mm_options(renderer_config.model_config)
mm_options = self._extract_mm_options(model_config)

dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)

@ -330,15 +336,13 @@ class MultiModalRegistry:

return dummy_data

def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int:
def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
"""
Get the maximum length of the encoder input for encoder-decoder models.
"""
model_config = renderer_config.model_config
if not model_config.is_encoder_decoder:
return 0

max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config)
max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
if not max_tokens:
# TODO - this function assumes encoder-decoder models are
# multimodal. This will need to change when adding support for more
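As an illustration of the `MultiModalRegistry` entry points after the rename, here is a short sketch; the registry import path and the example model name are assumptions, not taken from this diff:

# Hypothetical usage sketch -- registry import path and model name are assumptions.
from vllm.config import ModelConfig
from vllm.multimodal import MULTIMODAL_REGISTRY

model_config = ModelConfig(model="Qwen/Qwen2-VL-2B-Instruct")

# Every registry entry point now accepts the ModelConfig directly.
if MULTIMODAL_REGISTRY.supports_multimodal_inputs(model_config):
    limits = MULTIMODAL_REGISTRY.get_mm_limits_per_prompt(model_config)
    processor = MULTIMODAL_REGISTRY.create_processor(model_config, tokenizer=None)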
@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
from .protocol import TokenizerLike

if TYPE_CHECKING:
from vllm.config import RendererConfig
from vllm.config import ModelConfig

logger = init_logger(__name__)

@ -205,18 +205,18 @@ def get_tokenizer(
cached_get_tokenizer = lru_cache(get_tokenizer)


def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs):
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
return cached_get_tokenizer(
renderer_config.tokenizer,
tokenizer_mode=renderer_config.tokenizer_mode,
revision=renderer_config.tokenizer_revision,
trust_remote_code=renderer_config.trust_remote_code,
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
revision=model_config.tokenizer_revision,
trust_remote_code=model_config.trust_remote_code,
**kwargs,
)


def init_tokenizer_from_config(renderer_config: "RendererConfig"):
runner_type = renderer_config.model_config.runner_type
def init_tokenizer_from_config(model_config: "ModelConfig"):
runner_type = model_config.runner_type
if runner_type == "generate" or runner_type == "draft":
truncation_side = "left"
elif runner_type == "pooling":
@ -225,9 +225,9 @@ def init_tokenizer_from_config(renderer_config: "RendererConfig"):
assert_never(runner_type)

return get_tokenizer(
renderer_config.tokenizer,
tokenizer_mode=renderer_config.tokenizer_mode,
trust_remote_code=renderer_config.trust_remote_code,
revision=renderer_config.tokenizer_revision,
model_config.tokenizer,
tokenizer_mode=model_config.tokenizer_mode,
trust_remote_code=model_config.trust_remote_code,
revision=model_config.tokenizer_revision,
truncation_side=truncation_side,
)
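The tokenizer helpers follow the same pattern; a brief sketch (this diff only shows `cached_tokenizer_from_config` being imported from `vllm.tokenizers`, so the import location of `init_tokenizer_from_config` and the model name are assumptions):

# Hypothetical usage sketch -- the init_tokenizer_from_config import location is an assumption.
from vllm.config import ModelConfig
from vllm.tokenizers import cached_tokenizer_from_config, init_tokenizer_from_config

model_config = ModelConfig(model="meta-llama/Llama-3.1-8B-Instruct")

# Both helpers now read tokenizer, tokenizer_mode, revision, and
# trust_remote_code straight off the ModelConfig.
tokenizer = init_tokenizer_from_config(model_config)
cached = cached_tokenizer_from_config(model_config)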
@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides

if TYPE_CHECKING:
from vllm.config import ModelConfig, RendererConfig
from vllm.config import ModelConfig

_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
@ -233,18 +233,17 @@ def cached_get_processor_without_dynamic_kwargs(


def cached_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
**kwargs: Any,
) -> _P:
model_config = renderer_config.model_config
if is_gguf(model_config.model):
assert not is_gguf(renderer_config.tokenizer), (
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load processor."
)
model = renderer_config.tokenizer
revision = renderer_config.tokenizer_revision
model = model_config.tokenizer
revision = model_config.tokenizer_revision
else:
model = model_config.model
revision = model_config.revision
@ -298,11 +297,9 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor)


def cached_feature_extractor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
**kwargs: Any,
):
model_config = renderer_config.model_config

return cached_get_feature_extractor(
model_config.model,
revision=model_config.revision,
@ -351,17 +348,16 @@ cached_get_image_processor = lru_cache(get_image_processor)


def cached_image_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
**kwargs: Any,
):
model_config = renderer_config.model_config
if is_gguf(model_config.model):
assert not is_gguf(renderer_config.tokenizer), (
assert not is_gguf(model_config.tokenizer), (
"For multimodal GGUF models, the original tokenizer "
"should be used to correctly load image processor."
)
model = renderer_config.tokenizer
revision = renderer_config.tokenizer_revision
model = model_config.tokenizer
revision = model_config.tokenizer_revision
else:
model = model_config.model
revision = model_config.revision
@ -415,12 +411,10 @@ cached_get_video_processor = lru_cache(get_video_processor)


def cached_video_processor_from_config(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
processor_cls: type[_V] | None = None,
**kwargs: Any,
):
model_config = renderer_config.model_config

return cached_get_video_processor(
model_config.model,
revision=model_config.revision,
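A sketch of the HF-processor helper after this change; the import path matches the one shown earlier in this diff, while the processor class and model name are assumptions chosen for illustration:

# Hypothetical usage sketch -- processor class and model name are assumptions.
from transformers import WhisperProcessor

from vllm.config import ModelConfig
from vllm.transformers_utils.processor import cached_processor_from_config

model_config = ModelConfig(model="openai/whisper-large-v3")

# The GGUF special-casing now reads model_config.tokenizer and
# model_config.tokenizer_revision instead of going through RendererConfig.
processor = cached_processor_from_config(model_config, processor_cls=WhisperProcessor)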
@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry
from vllm.v1.request import Request

if TYPE_CHECKING:
from vllm.config import RendererConfig, SchedulerConfig
from vllm.config import ModelConfig, SchedulerConfig

logger = init_logger(__name__)

@ -250,7 +250,7 @@ class EncoderCacheManager:


def compute_encoder_budget(
renderer_config: "RendererConfig",
model_config: "ModelConfig",
scheduler_config: "SchedulerConfig",
mm_registry: MultiModalRegistry,
) -> tuple[int, int]:
@ -263,9 +263,9 @@ def compute_encoder_budget(
- Space budget for encoder cache size, measured in number of tokens
from the input sequence.
"""
if mm_registry.supports_multimodal_inputs(renderer_config):
if mm_registry.supports_multimodal_inputs(model_config):
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
renderer_config
model_config
)

return compute_mm_encoder_budget(
@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface):
# This can be changed when we make encoder cache for embedding caching
# across requests.
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
renderer_config=vllm_config.renderer_config,
model_config=vllm_config.model_config,
scheduler_config=vllm_config.scheduler_config,
mm_registry=mm_registry,
)

@ -91,7 +91,6 @@ class AsyncLLM(EngineClient):
# Ensure we can serialize custom transformer configs
maybe_register_config_serialize_by_value()

self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
@ -109,15 +108,15 @@ class AsyncLLM(EngineClient):
"enabling logging without default stat loggers."
)

if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.renderer_config)
tokenizer = init_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.renderer_config.io_processor_plugin,
self.model_config.io_processor_plugin,
)

# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).

@ -43,7 +43,6 @@ class InputProcessor:
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
) -> None:
self.vllm_config = vllm_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config
self.lora_config = vllm_config.lora_config
@ -55,7 +54,7 @@ class InputProcessor:
self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)

self.input_preprocessor = InputPreprocessor(
self.renderer_config,
self.model_config,
tokenizer,
mm_registry,
mm_processor_cache=self.mm_processor_cache,
@ -253,7 +252,7 @@ class InputProcessor:
if not params.structured_outputs or not self.structured_outputs_config:
return

if self.renderer_config.skip_tokenizer_init and params.structured_outputs:
if self.model_config.skip_tokenizer_init and params.structured_outputs:
raise ValueError(
"Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501
)
@ -583,7 +582,7 @@ class InputProcessor:
if prompt_type == "encoder" and model_config.is_multimodal_model:
mm_registry = self.input_preprocessor.mm_registry
mm_processor = mm_registry.create_processor(
self.renderer_config,
model_config,
tokenizer=tokenizer,
)
assert isinstance(mm_processor, EncDecMultiModalProcessor)

@ -60,7 +60,6 @@ class LLMEngine:
) -> None:
self.vllm_config = vllm_config
self.observability_config = vllm_config.observability_config
self.renderer_config = vllm_config.renderer_config
self.model_config = vllm_config.model_config
self.cache_config = vllm_config.cache_config

@ -84,15 +83,15 @@ class LLMEngine:
self.dp_group = None
self.should_execute_dummy_batch = False

if self.renderer_config.skip_tokenizer_init:
if self.model_config.skip_tokenizer_init:
tokenizer = None
else:
tokenizer = init_tokenizer_from_config(self.renderer_config)
tokenizer = init_tokenizer_from_config(self.model_config)

self.input_processor = InputProcessor(self.vllm_config, tokenizer)
self.io_processor = get_io_processor(
self.vllm_config,
self.renderer_config.io_processor_plugin,
self.model_config.io_processor_plugin,
)

# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
Some files were not shown because too many files have changed in this diff.