mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-07 23:01:20 +08:00
This commit is contained in:
parent
27f4c2fd46
commit
e83b7e379c
@ -22,7 +22,7 @@ Declare supported languages and capabilities:
|
|||||||
import torch
|
import torch
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from vllm.config import RendererConfig, SpeechToTextConfig
|
from vllm.config import ModelConfig, SpeechToTextConfig
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.model_executor.models.interfaces import SupportsTranscription
|
from vllm.model_executor.models.interfaces import SupportsTranscription
|
||||||
|
|
||||||
@ -52,7 +52,7 @@ This is for controlling general behavior of the API when serving your model:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
return SpeechToTextConfig(
|
return SpeechToTextConfig(
|
||||||
@ -83,7 +83,7 @@ Return a dict containing `multi_modal_data` with the audio, and either a `prompt
|
|||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
@ -120,7 +120,7 @@ Return a dict with separate `encoder_prompt` and `decoder_prompt` entries:
|
|||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
@ -183,7 +183,7 @@ Provide a fast duration→token estimate to improve streaming usage statistics:
|
|||||||
cls,
|
cls,
|
||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> int | None:
|
||||||
# Return None if unknown; otherwise return an estimate.
|
# Return None if unknown; otherwise return an estimate.
|
||||||
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
return int(audio_duration_s * stt_config.sample_rate // 320) # example
|
||||||
@ -216,7 +216,7 @@ Relevant server logic:
|
|||||||
prompt = self.model_cls.get_generation_prompt(
|
prompt = self.model_cls.get_generation_prompt(
|
||||||
audio=chunk,
|
audio=chunk,
|
||||||
stt_config=self.asr_config,
|
stt_config=self.asr_config,
|
||||||
renderer_config=self.renderer_config,
|
model_config=self.model_config,
|
||||||
language=language,
|
language=language,
|
||||||
task_type=self.task_type,
|
task_type=self.task_type,
|
||||||
request_prompt=request.prompt,
|
request_prompt=request.prompt,
|
||||||
|
|||||||
@ -17,7 +17,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
get_current_vllm_config,
|
get_current_vllm_config,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
@ -277,7 +276,6 @@ def sequence_parallelism_pass_on_test_model(
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
device_config=device_config,
|
device_config=device_config,
|
||||||
compilation_config=compilation_config,
|
compilation_config=compilation_config,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from vllm.config import (
|
|||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
@ -220,11 +219,8 @@ def test_fix_functionalization(
|
|||||||
torch.set_default_device("cuda")
|
torch.set_default_device("cuda")
|
||||||
torch.set_default_dtype(dtype)
|
torch.set_default_dtype(dtype)
|
||||||
|
|
||||||
model_config = ModelConfig(dtype=dtype)
|
|
||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(dtype=dtype),
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
custom_ops=["all"],
|
custom_ops=["all"],
|
||||||
pass_config=PassConfig(
|
pass_config=PassConfig(
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from vllm.config import (
|
|||||||
CompilationMode,
|
CompilationMode,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
from vllm.model_executor.layers.layernorm import RMSNorm
|
from vllm.model_executor.layers.layernorm import RMSNorm
|
||||||
@ -155,11 +154,8 @@ def test_fusion_rmsnorm_quant(
|
|||||||
custom_ops.append("+rms_norm")
|
custom_ops.append("+rms_norm")
|
||||||
if enable_quant_fp8_custom_op:
|
if enable_quant_fp8_custom_op:
|
||||||
custom_ops.append("+quant_fp8")
|
custom_ops.append("+quant_fp8")
|
||||||
|
|
||||||
model_config = ModelConfig(dtype=dtype)
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(dtype=dtype),
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
custom_ops=custom_ops,
|
custom_ops=custom_ops,
|
||||||
|
|||||||
@ -24,7 +24,6 @@ from vllm.config import (
|
|||||||
CompilationMode,
|
CompilationMode,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
@ -326,7 +325,6 @@ def test_attention_quant_pattern(
|
|||||||
)
|
)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
scheduler_config=SchedulerConfig(
|
scheduler_config=SchedulerConfig(
|
||||||
max_num_seqs=1024,
|
max_num_seqs=1024,
|
||||||
max_model_len=model_config.max_model_len,
|
max_model_len=model_config.max_model_len,
|
||||||
|
|||||||
@ -7,7 +7,7 @@ import torch
|
|||||||
|
|
||||||
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
|
||||||
from vllm.compilation.pass_manager import PostGradPassManager
|
from vllm.compilation.pass_manager import PostGradPassManager
|
||||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
|
|
||||||
|
|
||||||
# dummy custom pass that doesn't inherit
|
# dummy custom pass that doesn't inherit
|
||||||
@ -43,11 +43,7 @@ class ProperPass(InductorPass):
|
|||||||
)
|
)
|
||||||
def test_pass_manager_uuid(callable):
|
def test_pass_manager_uuid(callable):
|
||||||
# Some passes need dtype to be set
|
# Some passes need dtype to be set
|
||||||
model_config = ModelConfig(dtype=torch.bfloat16)
|
config = VllmConfig(model_config=ModelConfig(dtype=torch.bfloat16))
|
||||||
config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
pass_manager = PostGradPassManager()
|
pass_manager = PostGradPassManager()
|
||||||
pass_manager.configure(config)
|
pass_manager.configure(config)
|
||||||
|
|||||||
@ -19,7 +19,6 @@ from vllm.config import (
|
|||||||
CompilationMode,
|
CompilationMode,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PassConfig,
|
PassConfig,
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
@ -134,10 +133,8 @@ def test_qk_norm_rope_fusion(
|
|||||||
if enable_rope_custom_op:
|
if enable_rope_custom_op:
|
||||||
custom_ops.append("+rotary_embedding")
|
custom_ops.append("+rotary_embedding")
|
||||||
|
|
||||||
model_config = ModelConfig(dtype=dtype)
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(dtype=dtype),
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
compilation_config=CompilationConfig(
|
compilation_config=CompilationConfig(
|
||||||
mode=CompilationMode.VLLM_COMPILE,
|
mode=CompilationMode.VLLM_COMPILE,
|
||||||
custom_ops=custom_ops,
|
custom_ops=custom_ops,
|
||||||
|
|||||||
@ -5,7 +5,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
)
|
)
|
||||||
@ -48,7 +47,6 @@ def test_get_kv_connector_cache_layout_with_nixl_connector():
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
device_config=DeviceConfig("cpu"),
|
device_config=DeviceConfig("cpu"),
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
)
|
)
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
@ -72,7 +70,6 @@ def test_get_kv_connector_cache_layout_with_multi_connector():
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
device_config=DeviceConfig("cpu"),
|
device_config=DeviceConfig("cpu"),
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
)
|
)
|
||||||
with set_current_vllm_config(vllm_config):
|
with set_current_vllm_config(vllm_config):
|
||||||
|
|||||||
@ -3,6 +3,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from vllm.config import ModelConfig
|
||||||
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
|
from vllm.entrypoints.chat_utils import apply_hf_chat_template, load_chat_template
|
||||||
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
|
||||||
from vllm.tokenizers import get_tokenizer
|
from vllm.tokenizers import get_tokenizer
|
||||||
@ -106,11 +107,24 @@ def test_get_gen_prompt(
|
|||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(model)
|
model_config = ModelConfig(
|
||||||
|
model,
|
||||||
|
tokenizer=model_info.tokenizer or model,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
revision=model_info.revision,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Initialize the tokenizer
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
tokenizer_name=model_config.tokenizer,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
template_content = load_chat_template(chat_template=template)
|
template_content = load_chat_template(chat_template=template)
|
||||||
|
|
||||||
@ -129,7 +143,7 @@ def test_get_gen_prompt(
|
|||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
conversation=mock_request.messages,
|
conversation=mock_request.messages,
|
||||||
chat_template=mock_request.chat_template or template_content,
|
chat_template=mock_request.chat_template or template_content,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
tools=None,
|
tools=None,
|
||||||
add_generation_prompt=mock_request.add_generation_prompt,
|
add_generation_prompt=mock_request.add_generation_prompt,
|
||||||
continue_final_message=mock_request.continue_final_message,
|
continue_final_message=mock_request.continue_final_message,
|
||||||
|
|||||||
@ -33,34 +33,26 @@ class MockModelConfig:
|
|||||||
"""Minimal mock ModelConfig for testing."""
|
"""Minimal mock ModelConfig for testing."""
|
||||||
|
|
||||||
model: str = MODEL_NAME
|
model: str = MODEL_NAME
|
||||||
|
tokenizer: str = MODEL_NAME
|
||||||
trust_remote_code: bool = False
|
trust_remote_code: bool = False
|
||||||
|
tokenizer_mode: str = "auto"
|
||||||
max_model_len: int = 100
|
max_model_len: int = 100
|
||||||
|
tokenizer_revision: str | None = None
|
||||||
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
|
multimodal_config: MultiModalConfig = field(default_factory=MultiModalConfig)
|
||||||
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
|
hf_config: MockHFConfig = field(default_factory=MockHFConfig)
|
||||||
logits_processors: list[str] | None = None
|
logits_processors: list[str] | None = None
|
||||||
logits_processor_pattern: str | None = None
|
logits_processor_pattern: str | None = None
|
||||||
diff_sampling_param: dict | None = None
|
diff_sampling_param: dict | None = None
|
||||||
|
allowed_local_media_path: str = ""
|
||||||
|
allowed_media_domains: list[str] | None = None
|
||||||
encoder_config = None
|
encoder_config = None
|
||||||
generation_config: str = "auto"
|
generation_config: str = "auto"
|
||||||
|
skip_tokenizer_init: bool = False
|
||||||
|
|
||||||
def get_diff_sampling_param(self):
|
def get_diff_sampling_param(self):
|
||||||
return self.diff_sampling_param or {}
|
return self.diff_sampling_param or {}
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class MockRendererConfig:
|
|
||||||
"""Minimal mock RendererConfig for testing."""
|
|
||||||
|
|
||||||
model_config: MockModelConfig
|
|
||||||
|
|
||||||
tokenizer: str = MODEL_NAME
|
|
||||||
tokenizer_mode: str = "auto"
|
|
||||||
tokenizer_revision: str | None = None
|
|
||||||
skip_tokenizer_init: bool = False
|
|
||||||
allowed_local_media_path: str = ""
|
|
||||||
allowed_media_domains: list[str] | None = None
|
|
||||||
|
|
||||||
|
|
||||||
class MockLoRAResolver(LoRAResolver):
|
class MockLoRAResolver(LoRAResolver):
|
||||||
async def resolve_lora(
|
async def resolve_lora(
|
||||||
self, base_model_name: str, lora_name: str
|
self, base_model_name: str, lora_name: str
|
||||||
@ -122,7 +114,6 @@ def mock_serving_setup():
|
|||||||
mock_engine.add_lora.reset_mock()
|
mock_engine.add_lora.reset_mock()
|
||||||
|
|
||||||
mock_engine.model_config = MockModelConfig()
|
mock_engine.model_config = MockModelConfig()
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
|||||||
@ -346,33 +346,27 @@ class MockHFConfig:
|
|||||||
class MockModelConfig:
|
class MockModelConfig:
|
||||||
task = "generate"
|
task = "generate"
|
||||||
runner_type = "generate"
|
runner_type = "generate"
|
||||||
|
tokenizer = MODEL_NAME
|
||||||
trust_remote_code = False
|
trust_remote_code = False
|
||||||
|
tokenizer_mode = "auto"
|
||||||
max_model_len = 100
|
max_model_len = 100
|
||||||
|
tokenizer_revision = None
|
||||||
multimodal_config = MultiModalConfig()
|
multimodal_config = MultiModalConfig()
|
||||||
hf_config = MockHFConfig()
|
hf_config = MockHFConfig()
|
||||||
logits_processors: list[str] | None = None
|
logits_processors: list[str] | None = None
|
||||||
logits_processor_pattern = None
|
logits_processor_pattern = None
|
||||||
diff_sampling_param: dict | None = None
|
diff_sampling_param: dict | None = None
|
||||||
|
allowed_local_media_path: str = ""
|
||||||
|
allowed_media_domains: list[str] | None = None
|
||||||
encoder_config = None
|
encoder_config = None
|
||||||
generation_config: str = "auto"
|
generation_config: str = "auto"
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
||||||
|
skip_tokenizer_init = False
|
||||||
|
|
||||||
def get_diff_sampling_param(self):
|
def get_diff_sampling_param(self):
|
||||||
return self.diff_sampling_param or {}
|
return self.diff_sampling_param or {}
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class MockRendererConfig:
|
|
||||||
model_config: MockModelConfig = field(default_factory=MockModelConfig)
|
|
||||||
|
|
||||||
tokenizer = MODEL_NAME
|
|
||||||
tokenizer_mode = "auto"
|
|
||||||
tokenizer_revision = None
|
|
||||||
skip_tokenizer_init = False
|
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = field(default_factory=dict)
|
|
||||||
allowed_local_media_path: str = ""
|
|
||||||
allowed_media_domains: list[str] | None = None
|
|
||||||
|
|
||||||
|
|
||||||
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
||||||
models = OpenAIServingModels(
|
models = OpenAIServingModels(
|
||||||
engine_client=engine,
|
engine_client=engine,
|
||||||
@ -405,7 +399,6 @@ def _build_serving_chat(engine: AsyncLLM) -> OpenAIServingChat:
|
|||||||
@dataclass
|
@dataclass
|
||||||
class MockEngine:
|
class MockEngine:
|
||||||
model_config: MockModelConfig = field(default_factory=MockModelConfig)
|
model_config: MockModelConfig = field(default_factory=MockModelConfig)
|
||||||
renderer_config: MockRendererConfig = field(default_factory=MockRendererConfig)
|
|
||||||
input_processor: MagicMock = field(default_factory=MagicMock)
|
input_processor: MagicMock = field(default_factory=MagicMock)
|
||||||
io_processor: MagicMock = field(default_factory=MagicMock)
|
io_processor: MagicMock = field(default_factory=MagicMock)
|
||||||
|
|
||||||
@ -436,7 +429,6 @@ async def test_serving_chat_returns_correct_model_name():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = MockModelConfig()
|
mock_engine.model_config = MockModelConfig()
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -467,7 +459,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = MockModelConfig()
|
mock_engine.model_config = MockModelConfig()
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -501,7 +492,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = mock_model_config
|
mock_engine.model_config = mock_model_config
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -547,7 +537,6 @@ async def test_serving_chat_should_set_correct_max_tokens():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = mock_model_config
|
mock_engine.model_config = mock_model_config
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -594,7 +583,6 @@ async def test_serving_chat_could_load_correct_generation_config():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = mock_model_config
|
mock_engine.model_config = mock_model_config
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -641,7 +629,6 @@ async def test_serving_chat_did_set_correct_cache_salt(model_type):
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = mock_model_config
|
mock_engine.model_config = mock_model_config
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
@ -675,7 +662,6 @@ async def test_serving_chat_data_parallel_rank_extraction():
|
|||||||
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
|
||||||
mock_engine.errored = False
|
mock_engine.errored = False
|
||||||
mock_engine.model_config = MockModelConfig()
|
mock_engine.model_config = MockModelConfig()
|
||||||
mock_engine.renderer_config = MockRendererConfig(mock_engine.model_config)
|
|
||||||
mock_engine.input_processor = MagicMock()
|
mock_engine.input_processor = MagicMock()
|
||||||
mock_engine.io_processor = MagicMock()
|
mock_engine.io_processor = MagicMock()
|
||||||
|
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from unittest.mock import Mock
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
from vllm.entrypoints.openai.serving_engine import OpenAIServing
|
||||||
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
from vllm.entrypoints.openai.serving_models import OpenAIServingModels
|
||||||
from vllm.tokenizers import MistralTokenizer
|
from vllm.tokenizers import MistralTokenizer
|
||||||
@ -19,16 +19,10 @@ def serving() -> OpenAIServing:
|
|||||||
|
|
||||||
# Create minimal mocks
|
# Create minimal mocks
|
||||||
engine_client = Mock()
|
engine_client = Mock()
|
||||||
|
|
||||||
model_config = Mock(spec=ModelConfig)
|
model_config = Mock(spec=ModelConfig)
|
||||||
model_config.max_model_len = 32768
|
model_config.max_model_len = 32768
|
||||||
|
|
||||||
renderer_config = Mock(spec=RendererConfig)
|
|
||||||
renderer_config.model_config = model_config
|
|
||||||
|
|
||||||
models = Mock(spec=OpenAIServingModels)
|
models = Mock(spec=OpenAIServingModels)
|
||||||
models.model_config = model_config
|
models.model_config = model_config
|
||||||
models.renderer_config = renderer_config
|
|
||||||
models.input_processor = Mock()
|
models.input_processor = Mock()
|
||||||
models.io_processor = Mock()
|
models.io_processor = Mock()
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from unittest.mock import MagicMock
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.openai.protocol import (
|
from vllm.entrypoints.openai.protocol import (
|
||||||
ErrorResponse,
|
ErrorResponse,
|
||||||
@ -27,15 +27,9 @@ LORA_UNLOADING_SUCCESS_MESSAGE = (
|
|||||||
async def _async_serving_models_init() -> OpenAIServingModels:
|
async def _async_serving_models_init() -> OpenAIServingModels:
|
||||||
mock_engine_client = MagicMock(spec=EngineClient)
|
mock_engine_client = MagicMock(spec=EngineClient)
|
||||||
# Set the max_model_len attribute to avoid missing attribute
|
# Set the max_model_len attribute to avoid missing attribute
|
||||||
|
|
||||||
mock_model_config = MagicMock(spec=ModelConfig)
|
mock_model_config = MagicMock(spec=ModelConfig)
|
||||||
mock_model_config.max_model_len = 2048
|
mock_model_config.max_model_len = 2048
|
||||||
|
|
||||||
mock_renderer_config = MagicMock(spec=RendererConfig)
|
|
||||||
mock_renderer_config.model_config = mock_model_config
|
|
||||||
|
|
||||||
mock_engine_client.model_config = mock_model_config
|
mock_engine_client.model_config = mock_model_config
|
||||||
mock_engine_client.renderer_config = mock_renderer_config
|
|
||||||
mock_engine_client.input_processor = MagicMock()
|
mock_engine_client.input_processor = MagicMock()
|
||||||
mock_engine_client.io_processor = MagicMock()
|
mock_engine_client.io_processor = MagicMock()
|
||||||
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ from mistral_common.tokens.tokenizers.base import SpecialTokenPolicy
|
|||||||
from vllm.assets.audio import AudioAsset
|
from vllm.assets.audio import AudioAsset
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
_try_extract_ast,
|
_try_extract_ast,
|
||||||
apply_mistral_chat_template,
|
apply_mistral_chat_template,
|
||||||
@ -233,7 +233,7 @@ def test_parse_chat_messages_single_image(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -265,7 +265,7 @@ def test_parse_chat_messages_single_image_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -295,7 +295,7 @@ def test_parse_chat_messages_single_empty_image_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -328,7 +328,7 @@ def test_parse_chat_messages_single_image_with_bad_uuid_format(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -369,7 +369,7 @@ def test_parse_chat_messages_multiple_images_with_uuids(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -409,7 +409,7 @@ def test_parse_chat_messages_multiple_empty_images_with_uuids(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -451,7 +451,7 @@ def test_parse_chat_messages_mixed_empty_images_with_uuids(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -485,7 +485,7 @@ async def test_parse_chat_messages_single_image_with_uuid_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -516,7 +516,7 @@ async def test_parse_chat_messages_empty_image_with_uuid_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -554,7 +554,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -595,7 +595,7 @@ async def test_parse_chat_messages_multiple_empty_images_with_uuids_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -634,7 +634,7 @@ async def test_parse_chat_messages_multiple_images_with_partial_uuids_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -660,7 +660,7 @@ def test_parse_chat_messages_empty_system(
|
|||||||
"content": [{"type": "text", "text": "Who are you?"}],
|
"content": [{"type": "text", "text": "Who are you?"}],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=mistral_model_config),
|
mistral_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
assert conversation == [
|
assert conversation == [
|
||||||
@ -677,7 +677,7 @@ def test_parse_chat_messages_empty_system(
|
|||||||
"content": [{"type": "text", "text": "Who are you?"}],
|
"content": [{"type": "text", "text": "Who are you?"}],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=mistral_model_config),
|
mistral_model_config,
|
||||||
content_format="openai",
|
content_format="openai",
|
||||||
)
|
)
|
||||||
assert conversation == [
|
assert conversation == [
|
||||||
@ -701,7 +701,7 @@ async def test_parse_chat_messages_single_image_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -730,7 +730,7 @@ def test_parse_chat_messages_multiple_images(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -758,7 +758,7 @@ def test_parse_chat_messages_empty_pil_image_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -786,7 +786,7 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_image_embeds),
|
phi3v_model_config_image_embeds,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -818,7 +818,7 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=audio_embeds_model_config),
|
audio_embeds_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -858,7 +858,7 @@ def test_parse_chat_messages_audio_embeds_with_string(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=audio_embeds_model_config),
|
audio_embeds_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -900,7 +900,7 @@ async def test_parse_chat_messages_audio_embeds_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=audio_embeds_model_config),
|
audio_embeds_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1108,7 +1108,7 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_image_embeds),
|
phi3v_model_config_image_embeds,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1144,7 +1144,7 @@ async def test_parse_chat_messages_multiple_images_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1176,7 +1176,7 @@ def test_parse_chat_messages_placeholder_already_in_prompt(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
assert conversation == [
|
assert conversation == [
|
||||||
@ -1208,7 +1208,7 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1245,7 +1245,7 @@ def test_parse_chat_messages_multiple_images_across_messages(
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1289,7 +1289,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_across_messages(
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1314,7 +1314,7 @@ def test_parse_chat_messages_context_text_format(
|
|||||||
{"role": "assistant", "content": "Some stuff."},
|
{"role": "assistant", "content": "Some stuff."},
|
||||||
{"role": "user", "content": "What about this one?"},
|
{"role": "user", "content": "What about this one?"},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="openai",
|
content_format="openai",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1367,7 +1367,7 @@ def test_parse_chat_messages_rejects_too_many_images_in_one_message(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1410,7 +1410,7 @@ def test_parse_chat_messages_rejects_too_many_images_across_messages(
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1430,7 +1430,7 @@ def test_parse_chat_messages_multiple_images_uncommon_input(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config),
|
phi3v_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1464,7 +1464,7 @@ def test_parse_chat_messages_multiple_images_interleave(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1500,7 +1500,7 @@ async def test_parse_chat_messages_multiple_images_interleave_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1545,7 +1545,7 @@ async def test_parse_chat_messages_multiple_images_with_uuids_interleave_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1583,7 +1583,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1631,7 +1631,7 @@ def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interl
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1675,7 +1675,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
qwen25omni_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1743,7 +1743,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
qwen25omni_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1813,7 +1813,7 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
qwen25omni_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1879,7 +1879,7 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
|
|||||||
],
|
],
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen25omni_model_config_mm_interleaved),
|
qwen25omni_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1927,7 +1927,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=phi3v_model_config_mm_interleaved),
|
phi3v_model_config_mm_interleaved,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1945,11 +1945,24 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
|||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(model)
|
model_config = ModelConfig(
|
||||||
|
model,
|
||||||
|
tokenizer=model_info.tokenizer or model,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build the tokenizer
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
tools = (
|
tools = (
|
||||||
@ -1972,7 +1985,7 @@ def test_resolve_hf_chat_template(sample_json_schema, model, use_tools):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
assert isinstance(chat_template, str)
|
assert isinstance(chat_template, str)
|
||||||
|
|
||||||
@ -2034,11 +2047,24 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
|||||||
"enable_thinking": True,
|
"enable_thinking": True,
|
||||||
}
|
}
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(model)
|
model_config = ModelConfig(
|
||||||
|
model,
|
||||||
|
tokenizer=model_info.tokenizer or model,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Build the tokenizer
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test detecting the tokenizer's chat_template
|
# Test detecting the tokenizer's chat_template
|
||||||
@ -2046,7 +2072,7 @@ def test_resolve_hf_chat_template_kwargs(sample_json_schema, model, expected_kwa
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
with pytest.raises(
|
with pytest.raises(
|
||||||
ValueError, match="Found unexpected chat template kwargs from request"
|
ValueError, match="Found unexpected chat template kwargs from request"
|
||||||
@ -2117,11 +2143,23 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(model)
|
model_config = ModelConfig(
|
||||||
|
model,
|
||||||
|
tokenizer=model_info.tokenizer or model,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test detecting the tokenizer's chat_template
|
# Test detecting the tokenizer's chat_template
|
||||||
@ -2129,7 +2167,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
tools=None,
|
tools=None,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
assert isinstance(chat_template, str)
|
assert isinstance(chat_template, str)
|
||||||
|
|
||||||
@ -2143,7 +2181,7 @@ def test_resolve_content_format_hf_defined(model, expected_format):
|
|||||||
None,
|
None,
|
||||||
"auto",
|
"auto",
|
||||||
tokenizer,
|
tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
@ -2165,11 +2203,23 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(model)
|
model_config = ModelConfig(
|
||||||
|
model,
|
||||||
|
tokenizer=model_info.tokenizer or model,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model_config.tokenizer,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Test detecting the tokenizer's chat_template
|
# Test detecting the tokenizer's chat_template
|
||||||
@ -2177,7 +2227,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
tools=None,
|
tools=None,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
assert isinstance(chat_template, str)
|
assert isinstance(chat_template, str)
|
||||||
|
|
||||||
@ -2191,7 +2241,7 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
None,
|
None,
|
||||||
"auto",
|
"auto",
|
||||||
tokenizer,
|
tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
@ -2222,13 +2272,15 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
|||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_resolve_content_format_examples(template_path, expected_format):
|
def test_resolve_content_format_examples(template_path, expected_format):
|
||||||
model = PHI3V_MODEL_ID # Dummy
|
model_config = ModelConfig(
|
||||||
model_config = ModelConfig(model, trust_remote_code=True)
|
PHI3V_MODEL_ID, # Dummy
|
||||||
renderer_config = RendererConfig(model_config=model_config, tokenizer=model)
|
tokenizer=PHI3V_MODEL_ID, # Dummy
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
|
||||||
dummy_tokenizer = get_tokenizer(
|
dummy_tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
PHI3V_MODEL_ID, # Dummy
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
dummy_tokenizer.chat_template = None
|
dummy_tokenizer.chat_template = None
|
||||||
|
|
||||||
@ -2245,7 +2297,7 @@ def test_resolve_content_format_examples(template_path, expected_format):
|
|||||||
None,
|
None,
|
||||||
"auto",
|
"auto",
|
||||||
dummy_tokenizer,
|
dummy_tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert resolved_format == expected_format
|
assert resolved_format == expected_format
|
||||||
@ -2280,7 +2332,7 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
|
|||||||
|
|
||||||
conversation_with_thinking, _, _ = parse_chat_messages(
|
conversation_with_thinking, _, _ = parse_chat_messages(
|
||||||
messages,
|
messages,
|
||||||
RendererConfig(model_config=mistral_model_config),
|
mistral_model_config,
|
||||||
content_format="openai",
|
content_format="openai",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -2380,7 +2432,7 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen2_audio_model_config),
|
qwen2_audio_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -2414,7 +2466,7 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
|
|||||||
],
|
],
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
RendererConfig(model_config=qwen2_audio_model_config),
|
qwen2_audio_model_config,
|
||||||
content_format="string",
|
content_format="string",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import torch
|
|||||||
from safetensors.torch import load_file
|
from safetensors.torch import load_file
|
||||||
from torch import nn
|
from torch import nn
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
from vllm.config.lora import LoRAConfig
|
from vllm.config.lora import LoRAConfig
|
||||||
from vllm.lora.layers import (
|
from vllm.lora.layers import (
|
||||||
ColumnParallelLinearWithLoRA,
|
ColumnParallelLinearWithLoRA,
|
||||||
@ -422,11 +422,7 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
|
|||||||
)
|
)
|
||||||
|
|
||||||
model_config = ModelConfig(max_model_len=16)
|
model_config = ModelConfig(max_model_len=16)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
lora_config=lora_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config.scheduler_config.max_num_seqs = 4
|
vllm_config.scheduler_config.max_num_seqs = 4
|
||||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||||
@ -529,11 +525,7 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
|
|||||||
)
|
)
|
||||||
|
|
||||||
model_config = ModelConfig(max_model_len=16)
|
model_config = ModelConfig(max_model_len=16)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(model_config=model_config, lora_config=lora_config)
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
lora_config=lora_config,
|
|
||||||
)
|
|
||||||
|
|
||||||
vllm_config.scheduler_config.max_num_seqs = 4
|
vllm_config.scheduler_config.max_num_seqs = 4
|
||||||
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
vllm_config.scheduler_config.max_num_batched_tokens = 2
|
||||||
|
|||||||
@ -11,7 +11,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
@ -44,7 +43,6 @@ def test_worker_apply_lora(qwen3_lora_files):
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
load_config=LoadConfig(
|
load_config=LoadConfig(
|
||||||
download_dir=None,
|
download_dir=None,
|
||||||
load_format="dummy",
|
load_format="dummy",
|
||||||
|
|||||||
@ -42,10 +42,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
|||||||
"Write a short story about a robot that dreams for the first time.\n"
|
"Write a short story about a robot that dreams for the first time.\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm_engine = vllm_model.llm.llm_engine
|
model_config = vllm_model.llm.llm_engine.model_config
|
||||||
model_config = llm_engine.model_config
|
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||||
renderer_config = llm_engine.renderer_config
|
|
||||||
tokenizer = llm_engine.tokenizer
|
|
||||||
|
|
||||||
# asserts on the bert model config file
|
# asserts on the bert model config file
|
||||||
assert model_config.encoder_config["max_seq_length"] == 512
|
assert model_config.encoder_config["max_seq_length"] == 512
|
||||||
@ -56,8 +54,8 @@ def test_model_loading_with_params(vllm_runner, monkeypatch):
|
|||||||
assert model_config.pooler_config.normalize
|
assert model_config.pooler_config.normalize
|
||||||
|
|
||||||
# asserts on the tokenizer loaded
|
# asserts on the tokenizer loaded
|
||||||
assert renderer_config.tokenizer == "BAAI/bge-base-en-v1.5"
|
assert model_config.tokenizer == "BAAI/bge-base-en-v1.5"
|
||||||
assert tokenizer.model_max_length == 512
|
assert model_tokenizer.model_max_length == 512
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
assert isinstance(model, BertEmbeddingModel)
|
assert isinstance(model, BertEmbeddingModel)
|
||||||
@ -88,10 +86,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
|||||||
"Write a short story about a robot that dreams for the first time.\n"
|
"Write a short story about a robot that dreams for the first time.\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm_engine = vllm_model.llm.llm_engine
|
model_config = vllm_model.llm.llm_engine.model_config
|
||||||
model_config = llm_engine.model_config
|
model_tokenizer = vllm_model.llm.llm_engine.tokenizer
|
||||||
renderer_config = llm_engine.renderer_config
|
|
||||||
tokenizer = llm_engine.tokenizer
|
|
||||||
|
|
||||||
# asserts on the bert model config file
|
# asserts on the bert model config file
|
||||||
assert model_config.encoder_config["max_seq_length"] == 512
|
assert model_config.encoder_config["max_seq_length"] == 512
|
||||||
@ -102,8 +98,8 @@ def test_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
|||||||
assert model_config.pooler_config.normalize
|
assert model_config.pooler_config.normalize
|
||||||
|
|
||||||
# asserts on the tokenizer loaded
|
# asserts on the tokenizer loaded
|
||||||
assert renderer_config.tokenizer == "intfloat/multilingual-e5-base"
|
assert model_config.tokenizer == "intfloat/multilingual-e5-base"
|
||||||
assert tokenizer.model_max_length == 512
|
assert model_tokenizer.model_max_length == 512
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
assert isinstance(model, RobertaEmbeddingModel)
|
assert isinstance(model, RobertaEmbeddingModel)
|
||||||
@ -132,7 +128,7 @@ def test_facebook_roberta_model_loading_with_params(vllm_runner, monkeypatch):
|
|||||||
"Write a short story about a robot that dreams for the first time.\n"
|
"Write a short story about a robot that dreams for the first time.\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
assert vllm_model.llm.llm_engine.renderer_config.tokenizer == model_name
|
assert vllm_model.llm.llm_engine.model_config.tokenizer == model_name
|
||||||
|
|
||||||
def check_model(model):
|
def check_model(model):
|
||||||
assert isinstance(model, RobertaEmbeddingModel)
|
assert isinstance(model, RobertaEmbeddingModel)
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import pytest
|
|||||||
from scipy.spatial.distance import cosine
|
from scipy.spatial.distance import cosine
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
|
|
||||||
from ....utils import RemoteOpenAIServer
|
from ....utils import RemoteOpenAIServer
|
||||||
|
|
||||||
@ -31,8 +31,7 @@ def test_find_array():
|
|||||||
dtype="bfloat16",
|
dtype="bfloat16",
|
||||||
seed=0,
|
seed=0,
|
||||||
)
|
)
|
||||||
renderer_config = RendererConfig(model_config=model_config)
|
pooling = GritLMMeanPool(model_config=model_config)
|
||||||
pooling = GritLMMeanPool(renderer_config=renderer_config)
|
|
||||||
|
|
||||||
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
arr = _arr([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||||
|
|
||||||
|
|||||||
@ -25,6 +25,7 @@ from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingC
|
|||||||
from vllm.tokenizers import (
|
from vllm.tokenizers import (
|
||||||
MistralTokenizer,
|
MistralTokenizer,
|
||||||
TokenizerLike,
|
TokenizerLike,
|
||||||
|
cached_tokenizer_from_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
from ....multimodal.utils import random_audio, random_image, random_video
|
from ....multimodal.utils import random_audio, random_image, random_video
|
||||||
@ -211,20 +212,31 @@ def _test_processing_correctness(
|
|||||||
else:
|
else:
|
||||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
|
model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id_or_arch)
|
||||||
model_id = model_id_or_arch
|
model_id = model_id_or_arch
|
||||||
|
|
||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
model_info.check_transformers_version(on_fail="skip")
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(
|
model_config = ModelConfig(
|
||||||
model=model_id,
|
model_id,
|
||||||
|
tokenizer=model_info.tokenizer or model_id,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
# Ensure that the cache can fit all of the data
|
# Ensure that the cache can fit all of the data
|
||||||
mm_processor_cache_gb=2048,
|
mm_processor_cache_gb=2048,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
)
|
)
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||||
factories = model_cls._processor_factory
|
factories = model_cls._processor_factory
|
||||||
ctx = InputProcessingContext.from_config(renderer_config)
|
ctx = InputProcessingContext(
|
||||||
|
model_config,
|
||||||
|
tokenizer=cached_tokenizer_from_config(model_config),
|
||||||
|
)
|
||||||
cache = MultiModalProcessorOnlyCache(model_config)
|
cache = MultiModalProcessorOnlyCache(model_config)
|
||||||
|
|
||||||
processing_info = factories.info(ctx)
|
processing_info = factories.info(ctx)
|
||||||
|
|||||||
@ -40,7 +40,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"video": 1},
|
limit_mm_per_prompt={"video": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
tokenizer = processor.info.get_tokenizer()
|
tokenizer = processor.info.get_tokenizer()
|
||||||
hf_processor_mm_kwargs = {"fps": fps}
|
hf_processor_mm_kwargs = {"fps": fps}
|
||||||
|
|
||||||
@ -79,7 +79,7 @@ def test_video_loader_consistency(
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"video": 1},
|
limit_mm_per_prompt={"video": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {"fps": fps}
|
hf_processor_mm_kwargs = {"fps": fps}
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -162,7 +162,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": len(size_factors)},
|
limit_mm_per_prompt={"image": len(size_factors)},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -116,7 +116,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": len(size_factors)},
|
limit_mm_per_prompt={"image": len(size_factors)},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||||
|
|||||||
@ -30,7 +30,7 @@ def test_processor_override(
|
|||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
config = processor.info.get_hf_config()
|
config = processor.info.get_hf_config()
|
||||||
tokenizer = processor.info.get_tokenizer()
|
tokenizer = processor.info.get_tokenizer()
|
||||||
hf_processor = processor.info.get_hf_processor()
|
hf_processor = processor.info.get_hf_processor()
|
||||||
|
|||||||
@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": 1},
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
info = processor.info
|
info = processor.info
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
@ -140,7 +140,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
|
|
||||||
image_ratios = [
|
image_ratios = [
|
||||||
(171, 152),
|
(171, 152),
|
||||||
@ -173,7 +173,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
image_sizes = list[ImageSize]()
|
image_sizes = list[ImageSize]()
|
||||||
|
|||||||
@ -42,7 +42,7 @@ def test_processor_max_tokens(model_id):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": 1},
|
limit_mm_per_prompt={"image": 1},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
info = processor.info
|
info = processor.info
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
@ -138,7 +138,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
|
|
||||||
image_ratios = [
|
image_ratios = [
|
||||||
(171, 152),
|
(171, 152),
|
||||||
@ -171,7 +171,7 @@ def test_processor_prompt_replacements_all(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
|
|
||||||
seen_aspect_ratios = set[float]()
|
seen_aspect_ratios = set[float]()
|
||||||
image_sizes = list[ImageSize]()
|
image_sizes = list[ImageSize]()
|
||||||
|
|||||||
@ -24,7 +24,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
prompt = "<image>" * num_imgs
|
prompt = "<image>" * num_imgs
|
||||||
image = Image.new("RGB", size=(364, 364))
|
image = Image.new("RGB", size=(364, 364))
|
||||||
mm_data = {"image": [image] * num_imgs}
|
mm_data = {"image": [image] * num_imgs}
|
||||||
@ -83,7 +83,7 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
|||||||
mm_processor_kwargs=None,
|
mm_processor_kwargs=None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
|
|
||||||
image_ratios = [
|
image_ratios = [
|
||||||
(171, 152),
|
(171, 152),
|
||||||
|
|||||||
@ -25,7 +25,7 @@ def test_profiling(model_id: str, max_model_len: int):
|
|||||||
limit_mm_per_prompt=mm_counts,
|
limit_mm_per_prompt=mm_counts,
|
||||||
)
|
)
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
profiler = MultiModalProfiler(processor)
|
profiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
decoder_dummy_data = profiler.get_decoder_dummy_data(
|
decoder_dummy_data = profiler.get_decoder_dummy_data(
|
||||||
|
|||||||
@ -118,7 +118,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": len(size_factors)},
|
limit_mm_per_prompt={"image": len(size_factors)},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
min_num = min_dynamic_patch if dynamic_image_size else 1
|
min_num = min_dynamic_patch if dynamic_image_size else 1
|
||||||
|
|||||||
@ -39,7 +39,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -39,7 +39,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -34,7 +34,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
tokenizer = processor.info.get_tokenizer()
|
tokenizer = processor.info.get_tokenizer()
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
|
|||||||
@ -38,7 +38,7 @@ def test_processor_override(
|
|||||||
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
|
||||||
limit_mm_per_prompt={"image": num_imgs},
|
limit_mm_per_prompt={"image": num_imgs},
|
||||||
)
|
)
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||||
|
|
||||||
# Build the image str / prompt based on the number of images we pass
|
# Build the image str / prompt based on the number of images we pass
|
||||||
|
|||||||
@ -11,7 +11,7 @@ import pytest
|
|||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig, set_current_vllm_config
|
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||||
from vllm.config.multimodal import (
|
from vllm.config.multimodal import (
|
||||||
AudioDummyOptions,
|
AudioDummyOptions,
|
||||||
BaseDummyOptions,
|
BaseDummyOptions,
|
||||||
@ -31,6 +31,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
|||||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
from vllm.tokenizers import cached_tokenizer_from_config
|
||||||
from vllm.utils.collection_utils import is_list_of
|
from vllm.utils.collection_utils import is_list_of
|
||||||
from vllm.utils.torch_utils import set_default_torch_dtype
|
from vllm.utils.torch_utils import set_default_torch_dtype
|
||||||
|
|
||||||
@ -149,10 +150,7 @@ def initialize_dummy_model(
|
|||||||
backend="nccl",
|
backend="nccl",
|
||||||
)
|
)
|
||||||
initialize_model_parallel(tensor_model_parallel_size=1)
|
initialize_model_parallel(tensor_model_parallel_size=1)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
)
|
|
||||||
with set_current_vllm_config(vllm_config=vllm_config):
|
with set_current_vllm_config(vllm_config=vllm_config):
|
||||||
with set_default_torch_dtype(model_config.dtype):
|
with set_default_torch_dtype(model_config.dtype):
|
||||||
model = model_cls(vllm_config=vllm_config)
|
model = model_cls(vllm_config=vllm_config)
|
||||||
@ -184,12 +182,19 @@ def test_model_tensor_schema(model_id: str):
|
|||||||
else:
|
else:
|
||||||
dtype = model_info.dtype
|
dtype = model_info.dtype
|
||||||
|
|
||||||
renderer_config = model_info.build_renderer_config(
|
model_config = ModelConfig(
|
||||||
model_id,
|
model_id,
|
||||||
|
tokenizer=model_info.tokenizer or model_id,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
hf_overrides=hf_overrides_fn,
|
hf_overrides=hf_overrides_fn,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
)
|
)
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||||
assert supports_multimodal(model_cls)
|
assert supports_multimodal(model_cls)
|
||||||
@ -207,7 +212,10 @@ def test_model_tensor_schema(model_id: str):
|
|||||||
if not any(inputs_parse_methods):
|
if not any(inputs_parse_methods):
|
||||||
pytest.skip(f"{model_arch} does not support tensor schema validation.")
|
pytest.skip(f"{model_arch} does not support tensor schema validation.")
|
||||||
|
|
||||||
ctx = InputProcessingContext.from_config(renderer_config)
|
ctx = InputProcessingContext(
|
||||||
|
model_config,
|
||||||
|
tokenizer=cached_tokenizer_from_config(model_config),
|
||||||
|
)
|
||||||
processing_info = factories.info(ctx)
|
processing_info = factories.info(ctx)
|
||||||
supported_mm_limits = processing_info.get_supported_mm_limits()
|
supported_mm_limits = processing_info.get_supported_mm_limits()
|
||||||
limit_mm_per_prompt = {
|
limit_mm_per_prompt = {
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
|
|
||||||
@ -13,9 +13,8 @@ def test_multimodal_processor(model_id):
|
|||||||
model=model_id,
|
model=model_id,
|
||||||
model_impl="transformers",
|
model_impl="transformers",
|
||||||
)
|
)
|
||||||
renderer_config = RendererConfig(model_config=model_config)
|
|
||||||
|
|
||||||
mm_processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
mm_processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
|
|
||||||
image_pil = ImageAsset("cherry_blossom").pil_image
|
image_pil = ImageAsset("cherry_blossom").pil_image
|
||||||
mm_data = {"image": image_pil}
|
mm_data = {"image": image_pil}
|
||||||
|
|||||||
@ -7,6 +7,7 @@ import torch
|
|||||||
import transformers
|
import transformers
|
||||||
from transformers import AutoConfig, PreTrainedModel
|
from transformers import AutoConfig, PreTrainedModel
|
||||||
|
|
||||||
|
from vllm.config import ModelConfig
|
||||||
from vllm.model_executor.models.utils import WeightsMapper
|
from vllm.model_executor.models.utils import WeightsMapper
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.transformers_utils.config import try_get_safetensors_metadata
|
from vllm.transformers_utils.config import try_get_safetensors_metadata
|
||||||
@ -49,11 +50,37 @@ def test_hf_model_weights_mapper(model_arch: str):
|
|||||||
model_info.check_available_online(on_fail="skip")
|
model_info.check_available_online(on_fail="skip")
|
||||||
model_info.check_transformers_version(on_fail="skip")
|
model_info.check_transformers_version(on_fail="skip")
|
||||||
|
|
||||||
model_config = model_info.build_model_config(config_format="hf")
|
is_mistral_model = model_arch in [
|
||||||
|
"Mistral3ForConditionalGeneration",
|
||||||
|
"PixtralForConditionalGeneration",
|
||||||
|
"VoxtralForConditionalGeneration",
|
||||||
|
]
|
||||||
|
|
||||||
|
if not is_mistral_model or model_info.tokenizer_mode == "mistral":
|
||||||
|
tokenizer_mode = model_info.tokenizer_mode
|
||||||
|
else:
|
||||||
|
tokenizer_mode = "hf"
|
||||||
|
|
||||||
|
model_id = model_info.default
|
||||||
|
|
||||||
|
model_config = ModelConfig(
|
||||||
|
model_id,
|
||||||
|
tokenizer=model_info.tokenizer or model_id,
|
||||||
|
tokenizer_mode=tokenizer_mode,
|
||||||
|
config_format="hf",
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
|
dtype=model_info.dtype,
|
||||||
|
)
|
||||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||||
|
|
||||||
original_weights = create_repo_dummy_weights(model_config.model)
|
original_weights = create_repo_dummy_weights(model_id)
|
||||||
hf_dummy_model = create_dummy_model(model_config.model, model_arch)
|
hf_dummy_model = create_dummy_model(model_id, model_arch)
|
||||||
hf_converted_weights = hf_dummy_model.named_parameters()
|
hf_converted_weights = hf_dummy_model.named_parameters()
|
||||||
hf_converted_buffers = hf_dummy_model.named_buffers()
|
hf_converted_buffers = hf_dummy_model.named_buffers()
|
||||||
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
|
mapper: WeightsMapper = model_cls.hf_to_vllm_mapper
|
||||||
|
|||||||
@ -9,8 +9,7 @@ import pytest
|
|||||||
from packaging.version import Version
|
from packaging.version import Version
|
||||||
from transformers import __version__ as TRANSFORMERS_VERSION
|
from transformers import __version__ as TRANSFORMERS_VERSION
|
||||||
|
|
||||||
from vllm.config.model import ModelConfig, ModelDType
|
from vllm.config.model import ModelDType, TokenizerMode
|
||||||
from vllm.config.renderer import RendererConfig, TokenizerMode
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
@ -171,36 +170,6 @@ class _HfExamplesInfo:
|
|||||||
else:
|
else:
|
||||||
pytest.skip(msg)
|
pytest.skip(msg)
|
||||||
|
|
||||||
def build_model_config(self, model: str | None = None, **kwargs) -> ModelConfig:
|
|
||||||
if model is None:
|
|
||||||
model = self.default
|
|
||||||
|
|
||||||
return ModelConfig(
|
|
||||||
**{
|
|
||||||
"model": model,
|
|
||||||
"revision": self.revision,
|
|
||||||
"trust_remote_code": self.trust_remote_code,
|
|
||||||
"hf_overrides": self.hf_overrides,
|
|
||||||
"enable_prompt_embeds": self.require_embed_inputs,
|
|
||||||
"enable_mm_embeds": self.require_embed_inputs,
|
|
||||||
"enforce_eager": self.enforce_eager,
|
|
||||||
"dtype": self.dtype,
|
|
||||||
**kwargs,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
def build_renderer_config(
|
|
||||||
self, model: str | None = None, **kwargs
|
|
||||||
) -> RendererConfig:
|
|
||||||
model_config = self.build_model_config(model, **kwargs)
|
|
||||||
|
|
||||||
return RendererConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
tokenizer=self.tokenizer or model_config.model,
|
|
||||||
tokenizer_mode=self.tokenizer_mode,
|
|
||||||
skip_tokenizer_init=self.require_embed_inputs,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
_TEXT_GENERATION_EXAMPLE_MODELS = {
|
||||||
# [Decoder-only]
|
# [Decoder-only]
|
||||||
|
|||||||
@ -13,6 +13,7 @@ from transformers import PretrainedConfig
|
|||||||
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
from vllm.config.model import ModelConfig, ModelDType, RunnerOption
|
||||||
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
from vllm.logprobs import Logprob, PromptLogprobs, SampleLogprobs
|
||||||
from vllm.multimodal.processing import InputProcessingContext
|
from vllm.multimodal.processing import InputProcessingContext
|
||||||
|
from vllm.tokenizers import cached_tokenizer_from_config
|
||||||
|
|
||||||
from .. import ci_envs
|
from .. import ci_envs
|
||||||
from .registry import HF_EXAMPLE_MODELS
|
from .registry import HF_EXAMPLE_MODELS
|
||||||
@ -295,18 +296,30 @@ def build_model_context(
|
|||||||
|
|
||||||
model_config_kwargs = model_config_kwargs or {}
|
model_config_kwargs = model_config_kwargs or {}
|
||||||
limit_mm_per_prompt = limit_mm_per_prompt or {}
|
limit_mm_per_prompt = limit_mm_per_prompt or {}
|
||||||
renderer_config = model_info.build_renderer_config(
|
model_config = ModelConfig(
|
||||||
model_id,
|
model_id,
|
||||||
runner=runner,
|
runner=runner,
|
||||||
|
tokenizer=model_info.tokenizer or model_id,
|
||||||
|
tokenizer_mode=model_info.tokenizer_mode,
|
||||||
|
revision=model_info.revision,
|
||||||
|
trust_remote_code=model_info.trust_remote_code,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
seed=0,
|
seed=0,
|
||||||
mm_processor_kwargs=mm_processor_kwargs,
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
|
hf_overrides=model_info.hf_overrides,
|
||||||
|
skip_tokenizer_init=model_info.require_embed_inputs,
|
||||||
|
enable_prompt_embeds=model_info.require_embed_inputs,
|
||||||
|
enable_mm_embeds=model_info.require_embed_inputs,
|
||||||
|
enforce_eager=model_info.enforce_eager,
|
||||||
**model_config_kwargs,
|
**model_config_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
return InputProcessingContext.from_config(renderer_config)
|
return InputProcessingContext(
|
||||||
|
model_config,
|
||||||
|
tokenizer=cached_tokenizer_from_config(model_config),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_embeddings_close(
|
def check_embeddings_close(
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import numpy as np
|
|||||||
import pytest
|
import pytest
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm.config import ModelConfig, ParallelConfig, RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, ParallelConfig, VllmConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.cache import (
|
from vllm.multimodal.cache import (
|
||||||
BaseMultiModalProcessorCache,
|
BaseMultiModalProcessorCache,
|
||||||
@ -110,14 +110,11 @@ def _create_vllm_config(
|
|||||||
mm_processor_cache_gb: float,
|
mm_processor_cache_gb: float,
|
||||||
enable_ipc: bool,
|
enable_ipc: bool,
|
||||||
):
|
):
|
||||||
model_config = ModelConfig(
|
|
||||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
|
||||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
|
||||||
)
|
|
||||||
|
|
||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||||
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
|
),
|
||||||
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
|
parallel_config=ParallelConfig(data_parallel_size=1 if enable_ipc else 2),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -509,15 +506,13 @@ def _run_test_cache_eviction_shm(
|
|||||||
|
|
||||||
|
|
||||||
def test_cache_eviction_shm_cache():
|
def test_cache_eviction_shm_cache():
|
||||||
model_config = ModelConfig(
|
|
||||||
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
|
||||||
mm_processor_cache_type="shm",
|
|
||||||
mm_shm_cache_max_object_size_mb=6,
|
|
||||||
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
|
|
||||||
)
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
|
||||||
|
mm_processor_cache_type="shm",
|
||||||
|
mm_shm_cache_max_object_size_mb=6,
|
||||||
|
mm_processor_cache_gb=15.2 * MiB_bytes / GiB_bytes,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
sender_cache = ShmObjectStoreSenderCache(vllm_config)
|
sender_cache = ShmObjectStoreSenderCache(vllm_config)
|
||||||
receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
|
receiver_cache = ShmObjectStoreReceiverCache(vllm_config, mp.Lock())
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from contextlib import nullcontext
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.processing import (
|
from vllm.multimodal.processing import (
|
||||||
InputProcessingContext,
|
InputProcessingContext,
|
||||||
@ -920,9 +920,8 @@ def test_limit_mm_per_prompt_dummy(model_id, limit, num_supported, is_valid):
|
|||||||
model=model_id,
|
model=model_id,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
renderer_config = RendererConfig(model_config=model_config)
|
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
processor._supported_mm_limits = {"image": num_supported}
|
processor._supported_mm_limits = {"image": num_supported}
|
||||||
|
|
||||||
profiler = MultiModalProfiler(processor)
|
profiler = MultiModalProfiler(processor)
|
||||||
@ -956,9 +955,8 @@ def test_limit_mm_per_prompt_apply(model_id, num_images, limit, is_valid):
|
|||||||
model=model_id,
|
model=model_id,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
renderer_config = RendererConfig(model_config=model_config)
|
|
||||||
|
|
||||||
processor = MULTIMODAL_REGISTRY.create_processor(renderer_config)
|
processor = MULTIMODAL_REGISTRY.create_processor(model_config)
|
||||||
|
|
||||||
rng = np.random.RandomState(0)
|
rng = np.random.RandomState(0)
|
||||||
image = random_image(rng, min_wh=128, max_wh=256)
|
image = random_image(rng, min_wh=128, max_wh=256)
|
||||||
@ -1014,13 +1012,11 @@ def test_hf_processor_init_kwargs(
|
|||||||
inference_kwargs,
|
inference_kwargs,
|
||||||
expected_kwargs,
|
expected_kwargs,
|
||||||
):
|
):
|
||||||
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
|
ctx = InputProcessingContext(
|
||||||
renderer_config = RendererConfig(
|
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||||
model_config=model_config,
|
tokenizer=None,
|
||||||
tokenizer=model_id,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx = InputProcessingContext.from_config(renderer_config)
|
|
||||||
processor = ctx.get_hf_processor(
|
processor = ctx.get_hf_processor(
|
||||||
DummyProcessor, # type: ignore[arg-type]
|
DummyProcessor, # type: ignore[arg-type]
|
||||||
**inference_kwargs,
|
**inference_kwargs,
|
||||||
@ -1049,13 +1045,11 @@ def test_hf_processor_call_kwargs(
|
|||||||
inference_kwargs,
|
inference_kwargs,
|
||||||
expected_kwargs,
|
expected_kwargs,
|
||||||
):
|
):
|
||||||
model_config = ModelConfig(model_id, mm_processor_kwargs=config_kwargs)
|
ctx = InputProcessingContext(
|
||||||
renderer_config = RendererConfig(
|
model_config=ModelConfig(model_id, mm_processor_kwargs=config_kwargs),
|
||||||
model_config=model_config,
|
tokenizer=None,
|
||||||
tokenizer=model_id,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
ctx = InputProcessingContext.from_config(renderer_config)
|
|
||||||
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
|
processor = ctx.get_hf_processor(DummyProcessor) # type: ignore[arg-type]
|
||||||
|
|
||||||
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
|
result = ctx.call_hf_processor(processor, {}, inference_kwargs)
|
||||||
|
|||||||
@ -31,6 +31,4 @@ def test_supports_multimodal_inputs(model_id, limit_mm_per_prompt, expected):
|
|||||||
model_id,
|
model_id,
|
||||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||||
)
|
)
|
||||||
assert (
|
assert MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.model_config) is expected
|
||||||
MULTIMODAL_REGISTRY.supports_multimodal_inputs(ctx.renderer_config) is expected
|
|
||||||
)
|
|
||||||
|
|||||||
@ -13,7 +13,6 @@ from vllm.config import (
|
|||||||
CompilationConfig,
|
CompilationConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
PoolerConfig,
|
PoolerConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
update_config,
|
update_config,
|
||||||
@ -477,41 +476,27 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
|
|||||||
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
|
("deepseek-ai/DeepSeek-R1-Distill-Qwen-7B", 131073, 131072, True),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_recalculate_max_model_len(
|
def test_get_and_verify_max_len(
|
||||||
model_id, max_model_len, expected_max_len, should_raise
|
model_id, max_model_len, expected_max_len, should_raise
|
||||||
):
|
):
|
||||||
"""Test recalculate_max_model_len with different configurations."""
|
"""Test get_and_verify_max_len with different configurations."""
|
||||||
model_config = ModelConfig(model_id)
|
model_config = ModelConfig(model_id)
|
||||||
|
|
||||||
if should_raise:
|
if should_raise:
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
model_config.recalculate_max_model_len(
|
model_config.get_and_verify_max_len(max_model_len)
|
||||||
max_model_len,
|
|
||||||
tokenizer=model_id,
|
|
||||||
tokenizer_revision=None,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
model_config.recalculate_max_model_len(
|
actual_max_len = model_config.get_and_verify_max_len(max_model_len)
|
||||||
max_model_len,
|
assert actual_max_len == expected_max_len
|
||||||
tokenizer=model_id,
|
|
||||||
tokenizer_revision=None,
|
|
||||||
)
|
|
||||||
assert model_config.max_model_len == expected_max_len
|
|
||||||
|
|
||||||
|
|
||||||
class MockModelConfig:
|
class MockConfig:
|
||||||
"""Simple mock object for testing maybe_pull_model_for_runai"""
|
"""Simple mock object for testing maybe_pull_model_tokenizer_for_runai"""
|
||||||
|
|
||||||
def __init__(self, model: str):
|
def __init__(self, model: str, tokenizer: str):
|
||||||
self.model = model
|
self.model = model
|
||||||
|
self.tokenizer = tokenizer
|
||||||
|
self.model_weights = None
|
||||||
class MockRendererConfig:
|
|
||||||
"""Simple mock object for testing maybe_pull_tokenizer_for_runai"""
|
|
||||||
|
|
||||||
def __init__(self, model_config: MockModelConfig):
|
|
||||||
self.model_config = model_config
|
|
||||||
self.tokenizer = model_config.model
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@ -529,65 +514,59 @@ def test_s3_url_model_tokenizer_paths(mock_pull_files, s3_url):
|
|||||||
mock_pull_files.return_value = None
|
mock_pull_files.return_value = None
|
||||||
|
|
||||||
# Create first mock and run the method
|
# Create first mock and run the method
|
||||||
model_config1 = MockModelConfig(model=s3_url)
|
config1 = MockConfig(model=s3_url, tokenizer=s3_url)
|
||||||
renderer_config1 = MockRendererConfig(model_config=model_config1)
|
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url, s3_url)
|
||||||
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url)
|
|
||||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url)
|
|
||||||
|
|
||||||
# Check that model and tokenizer point to existing directories
|
# Check that model and tokenizer point to existing directories
|
||||||
assert os.path.exists(model_config1.model), (
|
assert os.path.exists(config1.model), (
|
||||||
f"Model directory does not exist: {model_config1.model}"
|
f"Model directory does not exist: {config1.model}"
|
||||||
)
|
)
|
||||||
assert os.path.isdir(model_config1.model), (
|
assert os.path.isdir(config1.model), (
|
||||||
f"Model path is not a directory: {model_config1.model}"
|
f"Model path is not a directory: {config1.model}"
|
||||||
)
|
)
|
||||||
assert os.path.exists(renderer_config1.tokenizer), (
|
assert os.path.exists(config1.tokenizer), (
|
||||||
f"Tokenizer directory does not exist: {renderer_config1.tokenizer}"
|
f"Tokenizer directory does not exist: {config1.tokenizer}"
|
||||||
)
|
)
|
||||||
assert os.path.isdir(renderer_config1.tokenizer), (
|
assert os.path.isdir(config1.tokenizer), (
|
||||||
f"Tokenizer path is not a directory: {renderer_config1.tokenizer}"
|
f"Tokenizer path is not a directory: {config1.tokenizer}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that the paths are different from the original S3 URL
|
# Verify that the paths are different from the original S3 URL
|
||||||
assert model_config1.model != s3_url, (
|
assert config1.model != s3_url, "Model path should be converted to local directory"
|
||||||
"Model path should be converted to local directory"
|
assert config1.tokenizer != s3_url, (
|
||||||
)
|
|
||||||
assert renderer_config1.tokenizer != s3_url, (
|
|
||||||
"Tokenizer path should be converted to local directory"
|
"Tokenizer path should be converted to local directory"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Store the original paths
|
# Store the original paths
|
||||||
created_model_dir = model_config1.model
|
created_model_dir = config1.model
|
||||||
create_tokenizer_dir = renderer_config1.tokenizer
|
create_tokenizer_dir = config1.tokenizer
|
||||||
|
|
||||||
# Create a new mock and run the method with the same S3 URL
|
# Create a new mock and run the method with the same S3 URL
|
||||||
model_config2 = MockModelConfig(model=s3_url)
|
config2 = MockConfig(model=s3_url, tokenizer=s3_url)
|
||||||
renderer_config2 = MockRendererConfig(model_config=model_config2)
|
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url, s3_url)
|
||||||
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url)
|
|
||||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url)
|
|
||||||
|
|
||||||
# Check that the new directories exist
|
# Check that the new directories exist
|
||||||
assert os.path.exists(model_config2.model), (
|
assert os.path.exists(config2.model), (
|
||||||
f"Model directory does not exist: {model_config2.model}"
|
f"Model directory does not exist: {config2.model}"
|
||||||
)
|
)
|
||||||
assert os.path.isdir(model_config2.model), (
|
assert os.path.isdir(config2.model), (
|
||||||
f"Model path is not a directory: {model_config2.model}"
|
f"Model path is not a directory: {config2.model}"
|
||||||
)
|
)
|
||||||
assert os.path.exists(renderer_config2.tokenizer), (
|
assert os.path.exists(config2.tokenizer), (
|
||||||
f"Tokenizer directory does not exist: {renderer_config2.tokenizer}"
|
f"Tokenizer directory does not exist: {config2.tokenizer}"
|
||||||
)
|
)
|
||||||
assert os.path.isdir(renderer_config2.tokenizer), (
|
assert os.path.isdir(config2.tokenizer), (
|
||||||
f"Tokenizer path is not a directory: {renderer_config2.tokenizer}"
|
f"Tokenizer path is not a directory: {config2.tokenizer}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that the paths are deterministic (same as before)
|
# Verify that the paths are deterministic (same as before)
|
||||||
assert model_config2.model == created_model_dir, (
|
assert config2.model == created_model_dir, (
|
||||||
f"Model paths are not deterministic. "
|
f"Model paths are not deterministic. "
|
||||||
f"Original: {created_model_dir}, New: {model_config2.model}"
|
f"Original: {created_model_dir}, New: {config2.model}"
|
||||||
)
|
)
|
||||||
assert renderer_config2.tokenizer == create_tokenizer_dir, (
|
assert config2.tokenizer == create_tokenizer_dir, (
|
||||||
f"Tokenizer paths are not deterministic. "
|
f"Tokenizer paths are not deterministic. "
|
||||||
f"Original: {create_tokenizer_dir}, New: {renderer_config2.tokenizer}"
|
f"Original: {create_tokenizer_dir}, New: {config2.tokenizer}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -601,36 +580,28 @@ def test_s3_url_different_models_create_different_directories(mock_pull_files):
|
|||||||
s3_url2 = "s3://example-bucket-2/model/"
|
s3_url2 = "s3://example-bucket-2/model/"
|
||||||
|
|
||||||
# Create mocks with different S3 URLs and run the method
|
# Create mocks with different S3 URLs and run the method
|
||||||
model_config1 = MockModelConfig(model=s3_url1)
|
config1 = MockConfig(model=s3_url1, tokenizer=s3_url1)
|
||||||
renderer_config1 = MockRendererConfig(model_config=model_config1)
|
ModelConfig.maybe_pull_model_tokenizer_for_runai(config1, s3_url1, s3_url1)
|
||||||
ModelConfig.maybe_pull_model_for_runai(model_config1, s3_url1)
|
|
||||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config1, s3_url1)
|
|
||||||
|
|
||||||
model_config2 = MockModelConfig(model=s3_url2)
|
config2 = MockConfig(model=s3_url2, tokenizer=s3_url2)
|
||||||
renderer_config2 = MockRendererConfig(model_config=model_config2)
|
ModelConfig.maybe_pull_model_tokenizer_for_runai(config2, s3_url2, s3_url2)
|
||||||
ModelConfig.maybe_pull_model_for_runai(model_config2, s3_url2)
|
|
||||||
RendererConfig.maybe_pull_tokenizer_for_runai(renderer_config2, s3_url2)
|
|
||||||
|
|
||||||
# Verify that different URLs produce different directories
|
# Verify that different URLs produce different directories
|
||||||
assert model_config1.model != model_config2.model, (
|
assert config1.model != config2.model, (
|
||||||
f"Different S3 URLs should create different model directories. "
|
f"Different S3 URLs should create different model directories. "
|
||||||
f"URL1 model: {model_config1.model}, URL2 model: {model_config2.model}"
|
f"URL1 model: {config1.model}, URL2 model: {config2.model}"
|
||||||
)
|
)
|
||||||
assert renderer_config1.tokenizer != renderer_config2.tokenizer, (
|
assert config1.tokenizer != config2.tokenizer, (
|
||||||
f"Different S3 URLs should create different tokenizer directories. "
|
f"Different S3 URLs should create different tokenizer directories. "
|
||||||
f"URL1 tokenizer: {renderer_config1.tokenizer}, "
|
f"URL1 tokenizer: {config1.tokenizer}, "
|
||||||
f"URL2 tokenizer: {renderer_config2.tokenizer}"
|
f"URL2 tokenizer: {config2.tokenizer}"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Verify that both sets of directories exist
|
# Verify that both sets of directories exist
|
||||||
assert os.path.exists(model_config1.model) and os.path.isdir(model_config1.model)
|
assert os.path.exists(config1.model) and os.path.isdir(config1.model)
|
||||||
assert os.path.exists(renderer_config1.tokenizer) and os.path.isdir(
|
assert os.path.exists(config1.tokenizer) and os.path.isdir(config1.tokenizer)
|
||||||
renderer_config1.tokenizer
|
assert os.path.exists(config2.model) and os.path.isdir(config2.model)
|
||||||
)
|
assert os.path.exists(config2.tokenizer) and os.path.isdir(config2.tokenizer)
|
||||||
assert os.path.exists(model_config2.model) and os.path.isdir(model_config2.model)
|
|
||||||
assert os.path.exists(renderer_config2.tokenizer) and os.path.isdir(
|
|
||||||
renderer_config2.tokenizer
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.inputs import zip_enc_dec_prompts
|
from vllm.inputs import zip_enc_dec_prompts
|
||||||
from vllm.inputs.parse import parse_raw_prompts
|
from vllm.inputs.parse import parse_raw_prompts
|
||||||
from vllm.inputs.preprocess import InputPreprocessor
|
from vllm.inputs.preprocess import InputPreprocessor
|
||||||
@ -108,9 +108,8 @@ def test_zip_enc_dec_prompts(mm_processor_kwargs, expected_mm_kwargs):
|
|||||||
)
|
)
|
||||||
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
def test_preprocessor_always_mm_code_path(model_id, prompt):
|
||||||
model_config = ModelConfig(model=model_id)
|
model_config = ModelConfig(model=model_id)
|
||||||
renderer_config = RendererConfig(model_config=model_config)
|
tokenizer = init_tokenizer_from_config(model_config)
|
||||||
tokenizer = init_tokenizer_from_config(renderer_config)
|
input_preprocessor = InputPreprocessor(model_config, tokenizer)
|
||||||
input_preprocessor = InputPreprocessor(renderer_config, tokenizer)
|
|
||||||
|
|
||||||
# HF processor adds sep token
|
# HF processor adds sep token
|
||||||
sep_token_id = tokenizer.vocab[tokenizer.sep_token]
|
sep_token_id = tokenizer.vocab[tokenizer.sep_token]
|
||||||
|
|||||||
@ -16,7 +16,6 @@ from vllm.config import (
|
|||||||
LoadConfig,
|
LoadConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
@ -217,7 +216,6 @@ def create_vllm_config(
|
|||||||
|
|
||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
|
|||||||
@ -8,7 +8,7 @@ import pytest
|
|||||||
import torch
|
import torch
|
||||||
|
|
||||||
import vllm.v1.core.kv_cache_utils as kv_cache_utils
|
import vllm.v1.core.kv_cache_utils as kv_cache_utils
|
||||||
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
|
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.multimodal.inputs import (
|
from vllm.multimodal.inputs import (
|
||||||
MultiModalFeatureSpec,
|
MultiModalFeatureSpec,
|
||||||
@ -667,10 +667,7 @@ def test_metrics_empty_stats():
|
|||||||
|
|
||||||
def test_get_kv_cache_configs_multiple_workers():
|
def test_get_kv_cache_configs_multiple_workers():
|
||||||
model_config = ModelConfig(max_model_len=16)
|
model_config = ModelConfig(max_model_len=16)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
ref_kv_cache_spec = new_kv_cache_spec()
|
ref_kv_cache_spec = new_kv_cache_spec()
|
||||||
same_kv_cache_specs = [
|
same_kv_cache_specs = [
|
||||||
@ -1139,7 +1136,6 @@ def test_estimate_max_model_len(model_id, max_model_len, want_estimated_max_len)
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1179,7 +1175,6 @@ def test_get_max_concurrency_for_kv_cache_config():
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1298,10 +1293,7 @@ def test_allocate_with_lookahead():
|
|||||||
def test_get_kv_cache_config_one_worker():
|
def test_get_kv_cache_config_one_worker():
|
||||||
# pass max_model_len to pass check_enough_kv_cache_memory
|
# pass max_model_len to pass check_enough_kv_cache_memory
|
||||||
model_config = ModelConfig(max_model_len=16)
|
model_config = ModelConfig(max_model_len=16)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(model_config=model_config)
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
)
|
|
||||||
|
|
||||||
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
mem_per_block_per_layer = 16 * 2 * 64 * 4 * 2
|
||||||
# all layers are full attention -> single group
|
# all layers are full attention -> single group
|
||||||
@ -1592,11 +1584,7 @@ def test_get_kv_cache_config_one_worker():
|
|||||||
|
|
||||||
def test_get_kv_cache_configs_attention_free():
|
def test_get_kv_cache_configs_attention_free():
|
||||||
kv_cache_specs: dict[str, KVCacheSpec] = {}
|
kv_cache_specs: dict[str, KVCacheSpec] = {}
|
||||||
model_config = ModelConfig(max_model_len=16)
|
vllm_config = VllmConfig(model_config=ModelConfig(max_model_len=16))
|
||||||
vllm_config = VllmConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
)
|
|
||||||
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
|
kv_cache_configs = get_kv_cache_configs(vllm_config, [kv_cache_specs], [0])
|
||||||
assert kv_cache_configs == [
|
assert kv_cache_configs == [
|
||||||
KVCacheConfig(
|
KVCacheConfig(
|
||||||
|
|||||||
@ -11,7 +11,6 @@ from vllm.config import (
|
|||||||
ECTransferConfig,
|
ECTransferConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
@ -1564,7 +1563,6 @@ def create_scheduler_with_priority(
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
|
|||||||
@ -9,7 +9,6 @@ from vllm.config import (
|
|||||||
ECTransferConfig,
|
ECTransferConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
@ -133,7 +132,6 @@ def create_scheduler(
|
|||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from vllm.config import (
|
|||||||
ECTransferConfig,
|
ECTransferConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
@ -523,7 +522,6 @@ def test_encoder_instance_zero_kv_cache(
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
|
|||||||
@ -5,14 +5,7 @@ import pytest
|
|||||||
|
|
||||||
from vllm.assets.image import ImageAsset
|
from vllm.assets.image import ImageAsset
|
||||||
from vllm.assets.video import VideoAsset
|
from vllm.assets.video import VideoAsset
|
||||||
from vllm.config import (
|
from vllm.config import CacheConfig, DeviceConfig, ModelConfig, VllmConfig
|
||||||
CacheConfig,
|
|
||||||
DeviceConfig,
|
|
||||||
ModelConfig,
|
|
||||||
MultiModalConfig,
|
|
||||||
RendererConfig,
|
|
||||||
VllmConfig,
|
|
||||||
)
|
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.v1.engine import input_processor as input_processor_mod
|
from vllm.v1.engine import input_processor as input_processor_mod
|
||||||
from vllm.v1.engine.input_processor import InputProcessor
|
from vllm.v1.engine.input_processor import InputProcessor
|
||||||
@ -51,21 +44,22 @@ def _mock_input_processor(
|
|||||||
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
|
monkeypatch.setattr(VllmConfig, "__post_init__", lambda self: None, raising=True)
|
||||||
|
|
||||||
model_config = ModelConfig(
|
model_config = ModelConfig(
|
||||||
|
skip_tokenizer_init=True,
|
||||||
max_model_len=128,
|
max_model_len=128,
|
||||||
mm_processor_cache_gb=mm_cache_gb,
|
mm_processor_cache_gb=mm_cache_gb,
|
||||||
generation_config="vllm",
|
generation_config="vllm",
|
||||||
)
|
|
||||||
model_config.multimodal_config = MultiModalConfig(mm_processor_cache_gb=mm_cache_gb)
|
|
||||||
|
|
||||||
renderer_config = RendererConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
tokenizer="dummy",
|
tokenizer="dummy",
|
||||||
skip_tokenizer_init=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Minimal multimodal_config to satisfy references in
|
||||||
|
# Processor.process_inputs.
|
||||||
|
class _MockMMConfig:
|
||||||
|
def __init__(self, gb: float):
|
||||||
|
self.mm_processor_cache_gb = gb
|
||||||
|
|
||||||
|
model_config.multimodal_config = _MockMMConfig(mm_cache_gb) # type: ignore[attr-defined]
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=renderer_config,
|
|
||||||
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
|
cache_config=CacheConfig(enable_prefix_caching=enable_prefix_caching),
|
||||||
device_config=DeviceConfig(device="cpu"),
|
device_config=DeviceConfig(device="cpu"),
|
||||||
)
|
)
|
||||||
|
|||||||
@ -15,7 +15,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
KVTransferConfig,
|
KVTransferConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
@ -128,7 +127,6 @@ def create_vllm_config(
|
|||||||
return VllmConfig(
|
return VllmConfig(
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
kv_transfer_config=kv_transfer_config,
|
kv_transfer_config=kv_transfer_config,
|
||||||
device_config=DeviceConfig("cpu"),
|
device_config=DeviceConfig("cpu"),
|
||||||
|
|||||||
@ -19,7 +19,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
@ -62,7 +61,6 @@ def _create_proposer(
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=CacheConfig(),
|
cache_config=CacheConfig(),
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
device_config=DeviceConfig(device=current_platform.device_type),
|
device_config=DeviceConfig(device=current_platform.device_type),
|
||||||
|
|||||||
@ -18,7 +18,6 @@ from vllm.config import (
|
|||||||
DeviceConfig,
|
DeviceConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
@ -47,7 +46,6 @@ def _create_mtp_proposer(num_speculative_tokens: int) -> EagleProposer:
|
|||||||
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=CacheConfig(),
|
cache_config=CacheConfig(),
|
||||||
speculative_config=speculative_config,
|
speculative_config=speculative_config,
|
||||||
device_config=DeviceConfig(device=current_platform.device_type),
|
device_config=DeviceConfig(device=current_platform.device_type),
|
||||||
|
|||||||
@ -4,7 +4,6 @@ import numpy as np
|
|||||||
|
|
||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SpeculativeConfig,
|
SpeculativeConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
)
|
)
|
||||||
@ -70,7 +69,6 @@ def test_ngram_proposer():
|
|||||||
return NgramProposer(
|
return NgramProposer(
|
||||||
vllm_config=VllmConfig(
|
vllm_config=VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
speculative_config=SpeculativeConfig(
|
speculative_config=SpeculativeConfig(
|
||||||
prompt_lookup_min=min_n,
|
prompt_lookup_min=min_n,
|
||||||
prompt_lookup_max=max_n,
|
prompt_lookup_max=max_n,
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from concurrent.futures import Future
|
|||||||
import pytest
|
import pytest
|
||||||
from transformers import AutoTokenizer
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
from vllm.config import RendererConfig, StructuredOutputsConfig, VllmConfig
|
from vllm.config import StructuredOutputsConfig, VllmConfig
|
||||||
from vllm.config.model import ModelConfig
|
from vllm.config.model import ModelConfig
|
||||||
from vllm.config.parallel import ParallelConfig
|
from vllm.config.parallel import ParallelConfig
|
||||||
from vllm.config.speculative import SpeculativeConfig
|
from vllm.config.speculative import SpeculativeConfig
|
||||||
@ -72,11 +72,8 @@ def test_backend_guidance_rollback_terminated():
|
|||||||
def test_grammar_bitmask_with_specdec():
|
def test_grammar_bitmask_with_specdec():
|
||||||
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
|
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)
|
||||||
prompt = tokenizer.encode('{"a": "b"}')
|
prompt = tokenizer.encode('{"a": "b"}')
|
||||||
|
|
||||||
model_config = ModelConfig(tokenizer=TOKENIZER)
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(tokenizer=TOKENIZER),
|
||||||
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
|
|
||||||
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
||||||
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
|
speculative_config=SpeculativeConfig(model="[ngram]", num_speculative_tokens=3),
|
||||||
)
|
)
|
||||||
@ -140,11 +137,8 @@ def test_grammar_init_async_and_sync(async_grammar):
|
|||||||
|
|
||||||
# Use "external_launcher" for sync mode, None for async mode
|
# Use "external_launcher" for sync mode, None for async mode
|
||||||
executor_backend = None if async_grammar else "external_launcher"
|
executor_backend = None if async_grammar else "external_launcher"
|
||||||
|
|
||||||
model_config = ModelConfig(tokenizer=TOKENIZER)
|
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=ModelConfig(tokenizer=TOKENIZER),
|
||||||
renderer_config=RendererConfig(model_config=model_config, tokenizer=TOKENIZER),
|
|
||||||
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
structured_outputs_config=StructuredOutputsConfig(backend="guidance"),
|
||||||
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
|
parallel_config=ParallelConfig(distributed_executor_backend=executor_backend),
|
||||||
)
|
)
|
||||||
|
|||||||
@ -7,7 +7,7 @@ from unittest.mock import Mock
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig, SchedulerConfig, VllmConfig
|
from vllm.config import ModelConfig, SchedulerConfig, VllmConfig
|
||||||
from vllm.reasoning import ReasoningParser
|
from vllm.reasoning import ReasoningParser
|
||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
from vllm.v1.structured_output import StructuredOutputManager
|
from vllm.v1.structured_output import StructuredOutputManager
|
||||||
@ -17,26 +17,19 @@ class TestReasoningStructuredOutput:
|
|||||||
"""Test reasoning-aware structured output functionality."""
|
"""Test reasoning-aware structured output functionality."""
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_renderer_config(self):
|
def mock_model_config(self):
|
||||||
"""Create a mock RendererConfig."""
|
"""Create a mock ModelConfig."""
|
||||||
renderer_config = Mock(spec=RendererConfig)
|
config = Mock(spec=ModelConfig)
|
||||||
renderer_config.skip_tokenizer_init = (
|
config.skip_tokenizer_init = True # Skip tokenizer init to avoid network calls
|
||||||
True # Skip tokenizer init to avoid network calls
|
config.get_vocab_size = Mock(return_value=50000)
|
||||||
)
|
|
||||||
|
|
||||||
model_config = Mock(spec=ModelConfig)
|
|
||||||
model_config.get_vocab_size = Mock(return_value=50000)
|
|
||||||
model_config.trust_remote_code = False
|
|
||||||
# Add missing runner_type attribute that tokenizer initialization expects
|
# Add missing runner_type attribute that tokenizer initialization expects
|
||||||
model_config.runner_type = "generate"
|
config.runner_type = "generate"
|
||||||
renderer_config.model_config = model_config
|
|
||||||
|
|
||||||
# Add other attributes that tokenizer initialization might need
|
# Add other attributes that tokenizer initialization might need
|
||||||
renderer_config.tokenizer = "test-tokenizer"
|
config.tokenizer = "test-tokenizer"
|
||||||
renderer_config.tokenizer_mode = "auto"
|
config.tokenizer_mode = "auto"
|
||||||
renderer_config.tokenizer_revision = None
|
config.trust_remote_code = False
|
||||||
|
config.tokenizer_revision = None
|
||||||
return renderer_config
|
return config
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_scheduler_config(self):
|
def mock_scheduler_config(self):
|
||||||
@ -46,10 +39,10 @@ class TestReasoningStructuredOutput:
|
|||||||
return config
|
return config
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_vllm_config(self, mock_renderer_config, mock_scheduler_config):
|
def mock_vllm_config(self, mock_model_config, mock_scheduler_config):
|
||||||
"""Create a mock VllmConfig."""
|
"""Create a mock VllmConfig."""
|
||||||
config = Mock(spec=VllmConfig)
|
config = Mock(spec=VllmConfig)
|
||||||
config.renderer_config = mock_renderer_config
|
config.model_config = mock_model_config
|
||||||
config.scheduler_config = mock_scheduler_config
|
config.scheduler_config = mock_scheduler_config
|
||||||
config.structured_outputs_config = Mock()
|
config.structured_outputs_config = Mock()
|
||||||
config.structured_outputs_config.reasoning_parser = None
|
config.structured_outputs_config.reasoning_parser = None
|
||||||
|
|||||||
@ -7,7 +7,6 @@ from vllm.attention.layer import Attention
|
|||||||
from vllm.config import (
|
from vllm.config import (
|
||||||
CacheConfig,
|
CacheConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
@ -46,7 +45,6 @@ def get_vllm_config():
|
|||||||
)
|
)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -13,7 +13,6 @@ from vllm.config import (
|
|||||||
CacheConfig,
|
CacheConfig,
|
||||||
ModelConfig,
|
ModelConfig,
|
||||||
ParallelConfig,
|
ParallelConfig,
|
||||||
RendererConfig,
|
|
||||||
SchedulerConfig,
|
SchedulerConfig,
|
||||||
VllmConfig,
|
VllmConfig,
|
||||||
set_current_vllm_config,
|
set_current_vllm_config,
|
||||||
@ -102,7 +101,6 @@ def get_vllm_config():
|
|||||||
parallel_config = ParallelConfig()
|
parallel_config = ParallelConfig()
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
@ -813,7 +811,6 @@ def test_hybrid_attention_mamba_tensor_shapes():
|
|||||||
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
|
attention_config = AttentionConfig(backend=AttentionBackendEnum.FLASHINFER)
|
||||||
vllm_config = VllmConfig(
|
vllm_config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=RendererConfig(model_config=model_config),
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
|
|||||||
@ -24,7 +24,6 @@ from vllm.config.multimodal import MultiModalConfig
|
|||||||
from vllm.config.observability import ObservabilityConfig
|
from vllm.config.observability import ObservabilityConfig
|
||||||
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
from vllm.config.parallel import EPLBConfig, ParallelConfig
|
||||||
from vllm.config.pooler import PoolerConfig
|
from vllm.config.pooler import PoolerConfig
|
||||||
from vllm.config.renderer import RendererConfig
|
|
||||||
from vllm.config.scheduler import SchedulerConfig
|
from vllm.config.scheduler import SchedulerConfig
|
||||||
from vllm.config.speculative import SpeculativeConfig
|
from vllm.config.speculative import SpeculativeConfig
|
||||||
from vllm.config.speech_to_text import SpeechToTextConfig
|
from vllm.config.speech_to_text import SpeechToTextConfig
|
||||||
@ -82,8 +81,6 @@ __all__ = [
|
|||||||
"ParallelConfig",
|
"ParallelConfig",
|
||||||
# From vllm.config.pooler
|
# From vllm.config.pooler
|
||||||
"PoolerConfig",
|
"PoolerConfig",
|
||||||
# From vllm.config.renderer
|
|
||||||
"RendererConfig",
|
|
||||||
# From vllm.config.scheduler
|
# From vllm.config.scheduler
|
||||||
"SchedulerConfig",
|
"SchedulerConfig",
|
||||||
# From vllm.config.speculative
|
# From vllm.config.speculative
|
||||||
|
|||||||
@ -36,6 +36,7 @@ from vllm.transformers_utils.config import (
|
|||||||
uses_xdrope_dim,
|
uses_xdrope_dim,
|
||||||
)
|
)
|
||||||
from vllm.transformers_utils.gguf_utils import (
|
from vllm.transformers_utils.gguf_utils import (
|
||||||
|
is_gguf,
|
||||||
is_remote_gguf,
|
is_remote_gguf,
|
||||||
maybe_patch_hf_config_from_gguf,
|
maybe_patch_hf_config_from_gguf,
|
||||||
split_remote_gguf,
|
split_remote_gguf,
|
||||||
@ -82,6 +83,7 @@ TaskOption = Literal[
|
|||||||
"transcription",
|
"transcription",
|
||||||
"draft",
|
"draft",
|
||||||
]
|
]
|
||||||
|
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
|
||||||
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
ModelDType = Literal["auto", "half", "float16", "bfloat16", "float", "float32"]
|
||||||
LogprobsMode = Literal[
|
LogprobsMode = Literal[
|
||||||
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
"raw_logits", "raw_logprobs", "processed_logits", "processed_logprobs"
|
||||||
@ -129,6 +131,18 @@ class ModelConfig:
|
|||||||
|
|
||||||
Note that the model may support other tasks using the same model runner.
|
Note that the model may support other tasks using the same model runner.
|
||||||
"""
|
"""
|
||||||
|
tokenizer: SkipValidation[str] = None # type: ignore
|
||||||
|
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
||||||
|
name or path will be used."""
|
||||||
|
tokenizer_mode: TokenizerMode | str = "auto"
|
||||||
|
"""Tokenizer mode:\n
|
||||||
|
- "auto" will use the tokenizer from `mistral_common` for Mistral models
|
||||||
|
if available, otherwise it will use the "hf" tokenizer.\n
|
||||||
|
- "hf" will use the fast tokenizer if available.\n
|
||||||
|
- "slow" will always use the slow tokenizer.\n
|
||||||
|
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
||||||
|
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
||||||
|
- Other custom values can be supported via plugins."""
|
||||||
trust_remote_code: bool = False
|
trust_remote_code: bool = False
|
||||||
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
"""Trust remote code (e.g., from HuggingFace) when downloading the model
|
||||||
and tokenizer."""
|
and tokenizer."""
|
||||||
@ -154,6 +168,13 @@ class ModelConfig:
|
|||||||
hf_config_path: str | None = None
|
hf_config_path: str | None = None
|
||||||
"""Name or path of the Hugging Face config to use. If unspecified, model
|
"""Name or path of the Hugging Face config to use. If unspecified, model
|
||||||
name or path will be used."""
|
name or path will be used."""
|
||||||
|
allowed_local_media_path: str = ""
|
||||||
|
"""Allowing API requests to read local images or videos from directories
|
||||||
|
specified by the server file system. This is a security risk. Should only
|
||||||
|
be enabled in trusted environments."""
|
||||||
|
allowed_media_domains: list[str] | None = None
|
||||||
|
"""If set, only media URLs that belong to this domain can be used for
|
||||||
|
multi-modal inputs. """
|
||||||
revision: str | None = None
|
revision: str | None = None
|
||||||
"""The specific model version to use. It can be a branch name, a tag name,
|
"""The specific model version to use. It can be a branch name, a tag name,
|
||||||
or a commit id. If unspecified, will use the default version."""
|
or a commit id. If unspecified, will use the default version."""
|
||||||
@ -161,6 +182,10 @@ class ModelConfig:
|
|||||||
"""The specific revision to use for the model code on the Hugging Face Hub.
|
"""The specific revision to use for the model code on the Hugging Face Hub.
|
||||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||||
use the default version."""
|
use the default version."""
|
||||||
|
tokenizer_revision: str | None = None
|
||||||
|
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
||||||
|
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
||||||
|
use the default version."""
|
||||||
max_model_len: SkipValidation[int] = None # type: ignore
|
max_model_len: SkipValidation[int] = None # type: ignore
|
||||||
"""Model context length (prompt and output). If unspecified, will be
|
"""Model context length (prompt and output). If unspecified, will be
|
||||||
automatically derived from the model config.
|
automatically derived from the model config.
|
||||||
@ -205,6 +230,10 @@ class ModelConfig:
|
|||||||
preventing potential numerical issues. Note that even if this is set to
|
preventing potential numerical issues. Note that even if this is set to
|
||||||
False, cascade attention will be only used when the heuristic tells that
|
False, cascade attention will be only used when the heuristic tells that
|
||||||
it's beneficial."""
|
it's beneficial."""
|
||||||
|
skip_tokenizer_init: bool = False
|
||||||
|
"""Skip initialization of tokenizer and detokenizer. Expects valid
|
||||||
|
`prompt_token_ids` and `None` for prompt from the input. The generated
|
||||||
|
output will contain token ids."""
|
||||||
enable_prompt_embeds: bool = False
|
enable_prompt_embeds: bool = False
|
||||||
"""If `True`, enables passing text embeddings as inputs via the
|
"""If `True`, enables passing text embeddings as inputs via the
|
||||||
`prompt_embeds` key.
|
`prompt_embeds` key.
|
||||||
@ -265,6 +294,8 @@ class ModelConfig:
|
|||||||
logits_processors: list[str | type[LogitsProcessor]] | None = None
|
logits_processors: list[str | type[LogitsProcessor]] | None = None
|
||||||
"""One or more logits processors' fully-qualified class names or class
|
"""One or more logits processors' fully-qualified class names or class
|
||||||
definitions"""
|
definitions"""
|
||||||
|
io_processor_plugin: str | None = None
|
||||||
|
"""IOProcessor plugin name to load at model startup"""
|
||||||
|
|
||||||
# Pooler config
|
# Pooler config
|
||||||
pooler_config: PoolerConfig | None = None
|
pooler_config: PoolerConfig | None = None
|
||||||
@ -277,6 +308,7 @@ class ModelConfig:
|
|||||||
from the architecture of `self.model`."""
|
from the architecture of `self.model`."""
|
||||||
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
|
limit_mm_per_prompt: InitVar[dict[str, int | dict[str, int]] | None] = None
|
||||||
enable_mm_embeds: InitVar[bool | None] = None
|
enable_mm_embeds: InitVar[bool | None] = None
|
||||||
|
media_io_kwargs: InitVar[dict[str, dict[str, Any]] | None] = None
|
||||||
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
|
mm_processor_kwargs: InitVar[dict[str, Any] | None] = None
|
||||||
mm_processor_cache_gb: InitVar[float | None] = None
|
mm_processor_cache_gb: InitVar[float | None] = None
|
||||||
mm_processor_cache_type: InitVar[MMCacheType | None] = None
|
mm_processor_cache_type: InitVar[MMCacheType | None] = None
|
||||||
@ -303,12 +335,18 @@ class ModelConfig:
|
|||||||
"runner",
|
"runner",
|
||||||
"convert",
|
"convert",
|
||||||
"task",
|
"task",
|
||||||
|
"tokenizer",
|
||||||
|
"tokenizer_mode",
|
||||||
"seed",
|
"seed",
|
||||||
"hf_config_path",
|
"hf_config_path",
|
||||||
|
"allowed_local_media_path",
|
||||||
|
"allowed_media_domains",
|
||||||
|
"tokenizer_revision",
|
||||||
"spec_target_max_model_len",
|
"spec_target_max_model_len",
|
||||||
"enforce_eager",
|
"enforce_eager",
|
||||||
"logprobs_mode",
|
"logprobs_mode",
|
||||||
"disable_cascade_attn",
|
"disable_cascade_attn",
|
||||||
|
"skip_tokenizer_init",
|
||||||
"served_model_name",
|
"served_model_name",
|
||||||
"config_format",
|
"config_format",
|
||||||
"hf_token",
|
"hf_token",
|
||||||
@ -316,9 +354,11 @@ class ModelConfig:
|
|||||||
"logits_processor_pattern",
|
"logits_processor_pattern",
|
||||||
"override_attention_dtype",
|
"override_attention_dtype",
|
||||||
"logits_processors",
|
"logits_processors",
|
||||||
|
"io_processor_plugin",
|
||||||
"pooler_config",
|
"pooler_config",
|
||||||
"multimodal_config",
|
"multimodal_config",
|
||||||
"limit_mm_per_prompt",
|
"limit_mm_per_prompt",
|
||||||
|
"media_io_kwargs",
|
||||||
"mm_processor_kwargs",
|
"mm_processor_kwargs",
|
||||||
"mm_processor_cache_gb",
|
"mm_processor_cache_gb",
|
||||||
"mm_processor_cache_type",
|
"mm_processor_cache_type",
|
||||||
@ -383,6 +423,7 @@ class ModelConfig:
|
|||||||
# Multimodal config init vars
|
# Multimodal config init vars
|
||||||
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
limit_mm_per_prompt: dict[str, int | dict[str, int]] | None,
|
||||||
enable_mm_embeds: bool | None,
|
enable_mm_embeds: bool | None,
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] | None,
|
||||||
mm_processor_kwargs: dict[str, Any] | None,
|
mm_processor_kwargs: dict[str, Any] | None,
|
||||||
mm_processor_cache_gb: float | None,
|
mm_processor_cache_gb: float | None,
|
||||||
mm_processor_cache_type: MMCacheType | None,
|
mm_processor_cache_type: MMCacheType | None,
|
||||||
@ -397,8 +438,13 @@ class ModelConfig:
|
|||||||
self.served_model_name = get_served_model_name(
|
self.served_model_name = get_served_model_name(
|
||||||
self.model, self.served_model_name
|
self.model, self.served_model_name
|
||||||
)
|
)
|
||||||
self.original_model = self.model
|
self.model = maybe_model_redirect(self.model)
|
||||||
self.model = maybe_model_redirect(self.original_model)
|
# The tokenizer is consistent with the model by default.
|
||||||
|
if self.tokenizer is None:
|
||||||
|
self.tokenizer = self.model
|
||||||
|
if self.tokenizer_revision is None:
|
||||||
|
self.tokenizer_revision = self.revision
|
||||||
|
self.tokenizer = maybe_model_redirect(self.tokenizer)
|
||||||
|
|
||||||
if isinstance(self.hf_config_path, str):
|
if isinstance(self.hf_config_path, str):
|
||||||
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
|
self.hf_config_path = maybe_model_redirect(self.hf_config_path)
|
||||||
@ -419,7 +465,7 @@ class ModelConfig:
|
|||||||
hf_overrides_kw[key] = value
|
hf_overrides_kw[key] = value
|
||||||
hf_overrides_fn = None
|
hf_overrides_fn = None
|
||||||
|
|
||||||
self.maybe_pull_model_for_runai(self.model)
|
self.maybe_pull_model_tokenizer_for_runai(self.model, self.tokenizer)
|
||||||
|
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
|
|
||||||
@ -602,8 +648,7 @@ class ModelConfig:
|
|||||||
)
|
)
|
||||||
|
|
||||||
self.original_max_model_len = self.max_model_len
|
self.original_max_model_len = self.max_model_len
|
||||||
self.recalculate_max_model_len(self.original_max_model_len)
|
self.max_model_len = self.get_and_verify_max_len(self.max_model_len)
|
||||||
|
|
||||||
# Init multimodal config if needed
|
# Init multimodal config if needed
|
||||||
if self._model_info.supports_multimodal:
|
if self._model_info.supports_multimodal:
|
||||||
if (
|
if (
|
||||||
@ -619,6 +664,7 @@ class ModelConfig:
|
|||||||
mm_config_kwargs = dict(
|
mm_config_kwargs = dict(
|
||||||
limit_per_prompt=limit_mm_per_prompt,
|
limit_per_prompt=limit_mm_per_prompt,
|
||||||
enable_mm_embeds=enable_mm_embeds,
|
enable_mm_embeds=enable_mm_embeds,
|
||||||
|
media_io_kwargs=media_io_kwargs,
|
||||||
mm_processor_kwargs=mm_processor_kwargs,
|
mm_processor_kwargs=mm_processor_kwargs,
|
||||||
mm_processor_cache_gb=mm_processor_cache_gb,
|
mm_processor_cache_gb=mm_processor_cache_gb,
|
||||||
mm_processor_cache_type=mm_processor_cache_type,
|
mm_processor_cache_type=mm_processor_cache_type,
|
||||||
@ -636,8 +682,16 @@ class ModelConfig:
|
|||||||
|
|
||||||
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
|
self.multimodal_config = MultiModalConfig(**mm_config_kwargs)
|
||||||
|
|
||||||
|
# Multimodal GGUF models must use original repo for mm processing
|
||||||
|
if is_gguf(self.tokenizer) and self.is_multimodal_model:
|
||||||
|
raise ValueError(
|
||||||
|
"Loading a multimodal GGUF model needs to use original "
|
||||||
|
"tokenizer. Please specify the unquantized hf model's "
|
||||||
|
"repo name or path using the --tokenizer argument."
|
||||||
|
)
|
||||||
|
|
||||||
if self.disable_sliding_window:
|
if self.disable_sliding_window:
|
||||||
# Set after recalculate_max_model_len to ensure that max_model_len
|
# Set after get_and_verify_max_len to ensure that max_model_len
|
||||||
# can be correctly capped to sliding window size
|
# can be correctly capped to sliding window size
|
||||||
self.hf_text_config.sliding_window = None
|
self.hf_text_config.sliding_window = None
|
||||||
|
|
||||||
@ -661,9 +715,10 @@ class ModelConfig:
|
|||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
|
def validate_model_config_after(self: "ModelConfig") -> "ModelConfig":
|
||||||
|
if not isinstance(self.tokenizer, str):
|
||||||
|
raise ValueError("tokenizer must be a string after __post_init__.")
|
||||||
if not isinstance(self.max_model_len, int):
|
if not isinstance(self.max_model_len, int):
|
||||||
raise ValueError("max_model_len must be an integer after __post_init__.")
|
raise ValueError("max_model_len must be an integer after __post_init__.")
|
||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def _get_transformers_backend_cls(self) -> str:
|
def _get_transformers_backend_cls(self) -> str:
|
||||||
@ -712,17 +767,49 @@ class ModelConfig:
|
|||||||
"""The architecture vllm actually used."""
|
"""The architecture vllm actually used."""
|
||||||
return self._architecture
|
return self._architecture
|
||||||
|
|
||||||
def maybe_pull_model_for_runai(self, model: str) -> None:
|
def maybe_pull_model_tokenizer_for_runai(self, model: str, tokenizer: str) -> None:
|
||||||
"""Pull model from Object Storage to temporary directory when needed."""
|
"""Pull model/tokenizer from Object Storage to temporary
|
||||||
if not is_runai_obj_uri(model):
|
directory when needed.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
model: Model name or path
|
||||||
|
tokenizer: Tokenizer name or path
|
||||||
|
"""
|
||||||
|
|
||||||
|
if not (is_runai_obj_uri(model) or is_runai_obj_uri(tokenizer)):
|
||||||
return
|
return
|
||||||
|
|
||||||
object_storage_model = ObjectStorageModel(url=model)
|
if is_runai_obj_uri(model):
|
||||||
object_storage_model.pull_files(
|
object_storage_model = ObjectStorageModel(url=model)
|
||||||
model, allow_pattern=["*.model", "*.py", "*.json"]
|
object_storage_model.pull_files(
|
||||||
)
|
model, allow_pattern=["*.model", "*.py", "*.json"]
|
||||||
self.model_weights = model
|
)
|
||||||
self.model = object_storage_model.dir
|
self.model_weights = model
|
||||||
|
self.model = object_storage_model.dir
|
||||||
|
|
||||||
|
# If tokenizer is same as model, download to same directory
|
||||||
|
if model == tokenizer:
|
||||||
|
object_storage_model.pull_files(
|
||||||
|
model,
|
||||||
|
ignore_pattern=[
|
||||||
|
"*.pt",
|
||||||
|
"*.safetensors",
|
||||||
|
"*.bin",
|
||||||
|
"*.tensors",
|
||||||
|
"*.pth",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
self.tokenizer = object_storage_model.dir
|
||||||
|
return
|
||||||
|
|
||||||
|
# Only download tokenizer if needed and not already handled
|
||||||
|
if is_runai_obj_uri(tokenizer):
|
||||||
|
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
|
||||||
|
object_storage_tokenizer.pull_files(
|
||||||
|
model,
|
||||||
|
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
|
||||||
|
)
|
||||||
|
self.tokenizer = object_storage_tokenizer.dir
|
||||||
|
|
||||||
def _get_encoder_config(self):
|
def _get_encoder_config(self):
|
||||||
model = self.model
|
model = self.model
|
||||||
@ -1625,38 +1712,30 @@ class ModelConfig:
|
|||||||
return dense_modules[-1]["out_features"]
|
return dense_modules[-1]["out_features"]
|
||||||
return self.get_hidden_size()
|
return self.get_hidden_size()
|
||||||
|
|
||||||
def recalculate_max_model_len(
|
def get_and_verify_max_len(self, max_model_len: int):
|
||||||
self,
|
|
||||||
original_max_model_len: int | None,
|
|
||||||
*,
|
|
||||||
tokenizer: str | None = None,
|
|
||||||
tokenizer_revision: str | None = None,
|
|
||||||
) -> None:
|
|
||||||
# Consider max_model_len in tokenizer_config only when
|
# Consider max_model_len in tokenizer_config only when
|
||||||
# pooling models use absolute position_embedding.
|
# pooling models use absolute position_embedding.
|
||||||
# NOTE: For simplicity we assume `args.model == args.tokenizer`
|
|
||||||
# since this is
|
|
||||||
tokenizer_config = None
|
tokenizer_config = None
|
||||||
if (
|
if (
|
||||||
self.runner_type == "pooling"
|
self.runner_type == "pooling"
|
||||||
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
|
and getattr(self.hf_config, "position_embedding_type", "") == "absolute"
|
||||||
):
|
):
|
||||||
tokenizer_config = try_get_tokenizer_config(
|
tokenizer_config = try_get_tokenizer_config(
|
||||||
tokenizer or self.model,
|
self.tokenizer,
|
||||||
trust_remote_code=self.trust_remote_code,
|
trust_remote_code=self.trust_remote_code,
|
||||||
revision=tokenizer_revision or self.revision,
|
revision=self.tokenizer_revision,
|
||||||
)
|
)
|
||||||
|
max_model_len = _get_and_verify_max_len(
|
||||||
self.max_model_len = _get_and_verify_max_len(
|
|
||||||
hf_config=self.hf_text_config,
|
hf_config=self.hf_text_config,
|
||||||
tokenizer_config=tokenizer_config,
|
tokenizer_config=tokenizer_config,
|
||||||
max_model_len=original_max_model_len,
|
max_model_len=max_model_len,
|
||||||
disable_sliding_window=self.disable_sliding_window,
|
disable_sliding_window=self.disable_sliding_window,
|
||||||
sliding_window=self.get_sliding_window(),
|
sliding_window=self.get_sliding_window(),
|
||||||
spec_target_max_model_len=self.spec_target_max_model_len,
|
spec_target_max_model_len=self.spec_target_max_model_len,
|
||||||
encoder_config=self.encoder_config,
|
encoder_config=self.encoder_config,
|
||||||
)
|
)
|
||||||
logger.info("Using max model len %s", self.max_model_len)
|
logger.info("Using max model len %s", max_model_len)
|
||||||
|
return max_model_len
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def attn_type(self) -> AttnTypeStr:
|
def attn_type(self) -> AttnTypeStr:
|
||||||
|
|||||||
@ -79,6 +79,10 @@ class MultiModalConfig:
|
|||||||
|
|
||||||
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
|
WARNING: The vLLM engine may crash if incorrect shape of embeddings is passed.
|
||||||
Only enable this flag for trusted users!"""
|
Only enable this flag for trusted users!"""
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
|
||||||
|
"""Additional args passed to process media inputs, keyed by modalities.
|
||||||
|
For example, to set num_frames for video, set
|
||||||
|
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
||||||
mm_processor_kwargs: dict[str, object] | None = None
|
mm_processor_kwargs: dict[str, object] | None = None
|
||||||
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
"""Arguments to be forwarded to the model's processor for multi-modal data,
|
||||||
e.g., image processor. Overrides for the multi-modal processor obtained
|
e.g., image processor. Overrides for the multi-modal processor obtained
|
||||||
|
|||||||
@ -1,109 +0,0 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
|
||||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
||||||
from typing import Any, Literal
|
|
||||||
|
|
||||||
from pydantic import Field, SkipValidation
|
|
||||||
from pydantic.dataclasses import dataclass
|
|
||||||
|
|
||||||
from vllm.config.model import ModelConfig
|
|
||||||
from vllm.config.utils import config
|
|
||||||
from vllm.transformers_utils.gguf_utils import is_gguf
|
|
||||||
from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri
|
|
||||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
|
||||||
|
|
||||||
TokenizerMode = Literal["auto", "hf", "slow", "mistral", "deepseek_v32"]
|
|
||||||
|
|
||||||
|
|
||||||
@config
|
|
||||||
@dataclass
|
|
||||||
class RendererConfig:
|
|
||||||
"""Configuration for the renderer."""
|
|
||||||
|
|
||||||
# NOTE: In reality, this is a required argument.
|
|
||||||
# We provide a dummy default value here to generate the CLI args.
|
|
||||||
model_config: SkipValidation[ModelConfig] = None # type: ignore
|
|
||||||
"""Provides model context to the renderer."""
|
|
||||||
|
|
||||||
tokenizer: str = ""
|
|
||||||
"""Name or path of the Hugging Face tokenizer to use. If unspecified, model
|
|
||||||
name or path will be used."""
|
|
||||||
tokenizer_mode: TokenizerMode | str = "auto"
|
|
||||||
"""Tokenizer mode:\n
|
|
||||||
- "auto" will use the tokenizer from `mistral_common` for Mistral models
|
|
||||||
if available, otherwise it will use the "hf" tokenizer.\n
|
|
||||||
- "hf" will use the fast tokenizer if available.\n
|
|
||||||
- "slow" will always use the slow tokenizer.\n
|
|
||||||
- "mistral" will always use the tokenizer from `mistral_common`.\n
|
|
||||||
- "deepseek_v32" will always use the tokenizer from `deepseek_v32`.\n
|
|
||||||
- Other custom values can be supported via plugins."""
|
|
||||||
tokenizer_revision: str | None = None
|
|
||||||
"""The specific revision to use for the tokenizer on the Hugging Face Hub.
|
|
||||||
It can be a branch name, a tag name, or a commit id. If unspecified, will
|
|
||||||
use the default version."""
|
|
||||||
skip_tokenizer_init: bool = False
|
|
||||||
"""Skip initialization of tokenizer and detokenizer. Expects valid
|
|
||||||
`prompt_token_ids` and `None` for prompt from the input. The generated
|
|
||||||
output will contain token ids."""
|
|
||||||
|
|
||||||
io_processor_plugin: str | None = None
|
|
||||||
"""IOProcessor plugin name to load at model startup."""
|
|
||||||
|
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = Field(default_factory=dict)
|
|
||||||
"""Additional args passed to process media inputs, keyed by modalities.
|
|
||||||
For example, to set num_frames for video, set
|
|
||||||
`--media-io-kwargs '{"video": {"num_frames": 40} }'`"""
|
|
||||||
allowed_local_media_path: str = ""
|
|
||||||
"""Allowing API requests to read local images or videos from directories
|
|
||||||
specified by the server file system. This is a security risk. Should only
|
|
||||||
be enabled in trusted environments."""
|
|
||||||
allowed_media_domains: list[str] | None = None
|
|
||||||
"""If set, only media URLs that belong to this domain can be used for
|
|
||||||
multi-modal inputs. """
|
|
||||||
|
|
||||||
@property
|
|
||||||
def trust_remote_code(self) -> bool:
|
|
||||||
return self.model_config.trust_remote_code
|
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
|
||||||
model_config = self.model_config
|
|
||||||
|
|
||||||
# The tokenizer is consistent with the model by default.
|
|
||||||
if not self.tokenizer:
|
|
||||||
self.tokenizer = (
|
|
||||||
ModelConfig.model
|
|
||||||
if model_config is None
|
|
||||||
else model_config.original_model
|
|
||||||
)
|
|
||||||
if not self.tokenizer_revision:
|
|
||||||
self.tokenizer_revision = (
|
|
||||||
ModelConfig.revision if model_config is None else model_config.revision
|
|
||||||
)
|
|
||||||
|
|
||||||
self.original_tokenizer = self.tokenizer
|
|
||||||
self.tokenizer = maybe_model_redirect(self.original_tokenizer)
|
|
||||||
self.maybe_pull_tokenizer_for_runai(self.tokenizer)
|
|
||||||
|
|
||||||
# Multimodal GGUF models must use original repo for mm processing
|
|
||||||
is_multimodal_model = (
|
|
||||||
ModelConfig.is_multimodal_model
|
|
||||||
if model_config is None
|
|
||||||
else model_config.is_multimodal_model
|
|
||||||
)
|
|
||||||
if is_gguf(self.tokenizer) and is_multimodal_model:
|
|
||||||
raise ValueError(
|
|
||||||
"Loading a multimodal GGUF model needs to use original "
|
|
||||||
"tokenizer. Please specify the unquantized hf model's "
|
|
||||||
"repo name or path using the --tokenizer argument."
|
|
||||||
)
|
|
||||||
|
|
||||||
def maybe_pull_tokenizer_for_runai(self, tokenizer: str) -> None:
|
|
||||||
"""Pull tokenizer from Object Storage to temporary directory when needed."""
|
|
||||||
if not is_runai_obj_uri(tokenizer):
|
|
||||||
return
|
|
||||||
|
|
||||||
object_storage_tokenizer = ObjectStorageModel(url=tokenizer)
|
|
||||||
object_storage_tokenizer.pull_files(
|
|
||||||
tokenizer,
|
|
||||||
ignore_pattern=["*.pt", "*.safetensors", "*.bin", "*.tensors", "*.pth"],
|
|
||||||
)
|
|
||||||
self.tokenizer = object_storage_tokenizer.dir
|
|
||||||
@ -322,11 +322,16 @@ class SpeculativeConfig:
|
|||||||
self.draft_model_config = ModelConfig(
|
self.draft_model_config = ModelConfig(
|
||||||
model=self.model,
|
model=self.model,
|
||||||
runner="draft",
|
runner="draft",
|
||||||
|
tokenizer=self.target_model_config.tokenizer,
|
||||||
|
tokenizer_mode=self.target_model_config.tokenizer_mode,
|
||||||
trust_remote_code=self.target_model_config.trust_remote_code,
|
trust_remote_code=self.target_model_config.trust_remote_code,
|
||||||
|
allowed_local_media_path=self.target_model_config.allowed_local_media_path,
|
||||||
|
allowed_media_domains=self.target_model_config.allowed_media_domains,
|
||||||
dtype=self.target_model_config.dtype,
|
dtype=self.target_model_config.dtype,
|
||||||
seed=self.target_model_config.seed,
|
seed=self.target_model_config.seed,
|
||||||
revision=self.revision,
|
revision=self.revision,
|
||||||
code_revision=self.code_revision,
|
code_revision=self.code_revision,
|
||||||
|
tokenizer_revision=self.target_model_config.tokenizer_revision,
|
||||||
spec_target_max_model_len=self.target_model_config.max_model_len,
|
spec_target_max_model_len=self.target_model_config.max_model_len,
|
||||||
quantization=self.quantization,
|
quantization=self.quantization,
|
||||||
enforce_eager=self.target_model_config.enforce_eager,
|
enforce_eager=self.target_model_config.enforce_eager,
|
||||||
|
|||||||
@ -39,7 +39,6 @@ from .lora import LoRAConfig
|
|||||||
from .model import ModelConfig
|
from .model import ModelConfig
|
||||||
from .observability import ObservabilityConfig
|
from .observability import ObservabilityConfig
|
||||||
from .parallel import ParallelConfig
|
from .parallel import ParallelConfig
|
||||||
from .renderer import RendererConfig
|
|
||||||
from .scheduler import SchedulerConfig
|
from .scheduler import SchedulerConfig
|
||||||
from .speculative import SpeculativeConfig
|
from .speculative import SpeculativeConfig
|
||||||
from .structured_outputs import StructuredOutputsConfig
|
from .structured_outputs import StructuredOutputsConfig
|
||||||
@ -182,8 +181,6 @@ class VllmConfig:
|
|||||||
# try to download a model
|
# try to download a model
|
||||||
model_config: ModelConfig = Field(default=None)
|
model_config: ModelConfig = Field(default=None)
|
||||||
"""Model configuration."""
|
"""Model configuration."""
|
||||||
renderer_config: RendererConfig = Field(default_factory=RendererConfig)
|
|
||||||
"""Renderer configuration."""
|
|
||||||
cache_config: CacheConfig = Field(default_factory=CacheConfig)
|
cache_config: CacheConfig = Field(default_factory=CacheConfig)
|
||||||
"""Cache configuration."""
|
"""Cache configuration."""
|
||||||
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
|
parallel_config: ParallelConfig = Field(default_factory=ParallelConfig)
|
||||||
@ -744,7 +741,7 @@ class VllmConfig:
|
|||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
|
|
||||||
self.scheduler_config.max_num_encoder_input_tokens = (
|
self.scheduler_config.max_num_encoder_input_tokens = (
|
||||||
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.renderer_config)
|
MULTIMODAL_REGISTRY.get_encdec_max_encoder_len(self.model_config)
|
||||||
)
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Encoder-decoder model detected: setting "
|
"Encoder-decoder model detected: setting "
|
||||||
@ -1189,13 +1186,11 @@ class VllmConfig:
|
|||||||
computed_compile_ranges_split_points
|
computed_compile_ranges_split_points
|
||||||
)
|
)
|
||||||
|
|
||||||
def recalculate_max_model_len(self, original_max_model_len: int | None) -> None:
|
def recalculate_max_model_len(self, max_model_len: int):
|
||||||
# Can only be called during try_verify_and_update_config
|
# Can only be called in try_verify_and_update_config
|
||||||
self.model_config.recalculate_max_model_len(
|
model_config = self.model_config
|
||||||
original_max_model_len,
|
max_model_len = model_config.get_and_verify_max_len(max_model_len)
|
||||||
tokenizer=self.renderer_config.tokenizer,
|
self.model_config.max_model_len = max_model_len
|
||||||
tokenizer_revision=self.renderer_config.tokenizer_revision,
|
|
||||||
)
|
|
||||||
|
|
||||||
def try_verify_and_update_config(self):
|
def try_verify_and_update_config(self):
|
||||||
if self.model_config is None:
|
if self.model_config is None:
|
||||||
@ -1269,11 +1264,11 @@ class VllmConfig:
|
|||||||
return (
|
return (
|
||||||
f"model={self.model_config.model!r}, "
|
f"model={self.model_config.model!r}, "
|
||||||
f"speculative_config={self.speculative_config!r}, "
|
f"speculative_config={self.speculative_config!r}, "
|
||||||
f"tokenizer={self.renderer_config.tokenizer!r}, "
|
f"tokenizer={self.model_config.tokenizer!r}, "
|
||||||
f"skip_tokenizer_init={self.renderer_config.skip_tokenizer_init}, "
|
f"skip_tokenizer_init={self.model_config.skip_tokenizer_init}, "
|
||||||
f"tokenizer_mode={self.renderer_config.tokenizer_mode}, "
|
f"tokenizer_mode={self.model_config.tokenizer_mode}, "
|
||||||
f"revision={self.model_config.revision}, "
|
f"revision={self.model_config.revision}, "
|
||||||
f"tokenizer_revision={self.renderer_config.tokenizer_revision}, "
|
f"tokenizer_revision={self.model_config.tokenizer_revision}, "
|
||||||
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
f"trust_remote_code={self.model_config.trust_remote_code}, "
|
||||||
f"dtype={self.model_config.dtype}, "
|
f"dtype={self.model_config.dtype}, "
|
||||||
f"max_seq_len={self.model_config.max_model_len}, "
|
f"max_seq_len={self.model_config.max_model_len}, "
|
||||||
|
|||||||
@ -71,11 +71,11 @@ from vllm.config.model import (
|
|||||||
ModelDType,
|
ModelDType,
|
||||||
RunnerOption,
|
RunnerOption,
|
||||||
TaskOption,
|
TaskOption,
|
||||||
|
TokenizerMode,
|
||||||
)
|
)
|
||||||
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
|
from vllm.config.multimodal import MMCacheType, MMEncoderTPMode
|
||||||
from vllm.config.observability import DetailedTraceModules
|
from vllm.config.observability import DetailedTraceModules
|
||||||
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
|
from vllm.config.parallel import DistributedExecutorBackend, ExpertPlacementStrategy
|
||||||
from vllm.config.renderer import RendererConfig, TokenizerMode
|
|
||||||
from vllm.config.scheduler import SchedulerPolicy
|
from vllm.config.scheduler import SchedulerPolicy
|
||||||
from vllm.config.utils import get_field
|
from vllm.config.utils import get_field
|
||||||
from vllm.config.vllm import OptimizationLevel
|
from vllm.config.vllm import OptimizationLevel
|
||||||
@ -355,12 +355,17 @@ class EngineArgs:
|
|||||||
|
|
||||||
model: str = ModelConfig.model
|
model: str = ModelConfig.model
|
||||||
served_model_name: str | list[str] | None = ModelConfig.served_model_name
|
served_model_name: str | list[str] | None = ModelConfig.served_model_name
|
||||||
|
tokenizer: str | None = ModelConfig.tokenizer
|
||||||
hf_config_path: str | None = ModelConfig.hf_config_path
|
hf_config_path: str | None = ModelConfig.hf_config_path
|
||||||
runner: RunnerOption = ModelConfig.runner
|
runner: RunnerOption = ModelConfig.runner
|
||||||
convert: ConvertOption = ModelConfig.convert
|
convert: ConvertOption = ModelConfig.convert
|
||||||
task: TaskOption | None = ModelConfig.task
|
task: TaskOption | None = ModelConfig.task
|
||||||
|
skip_tokenizer_init: bool = ModelConfig.skip_tokenizer_init
|
||||||
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
|
enable_prompt_embeds: bool = ModelConfig.enable_prompt_embeds
|
||||||
|
tokenizer_mode: TokenizerMode | str = ModelConfig.tokenizer_mode
|
||||||
trust_remote_code: bool = ModelConfig.trust_remote_code
|
trust_remote_code: bool = ModelConfig.trust_remote_code
|
||||||
|
allowed_local_media_path: str = ModelConfig.allowed_local_media_path
|
||||||
|
allowed_media_domains: list[str] | None = ModelConfig.allowed_media_domains
|
||||||
download_dir: str | None = LoadConfig.download_dir
|
download_dir: str | None = LoadConfig.download_dir
|
||||||
safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
|
safetensors_load_strategy: str = LoadConfig.safetensors_load_strategy
|
||||||
load_format: str | LoadFormats = LoadConfig.load_format
|
load_format: str | LoadFormats = LoadConfig.load_format
|
||||||
@ -444,6 +449,7 @@ class EngineArgs:
|
|||||||
code_revision: str | None = ModelConfig.code_revision
|
code_revision: str | None = ModelConfig.code_revision
|
||||||
hf_token: bool | str | None = ModelConfig.hf_token
|
hf_token: bool | str | None = ModelConfig.hf_token
|
||||||
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
|
hf_overrides: HfOverrides = get_field(ModelConfig, "hf_overrides")
|
||||||
|
tokenizer_revision: str | None = ModelConfig.tokenizer_revision
|
||||||
quantization: QuantizationMethods | None = ModelConfig.quantization
|
quantization: QuantizationMethods | None = ModelConfig.quantization
|
||||||
enforce_eager: bool = ModelConfig.enforce_eager
|
enforce_eager: bool = ModelConfig.enforce_eager
|
||||||
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
disable_custom_all_reduce: bool = ParallelConfig.disable_custom_all_reduce
|
||||||
@ -452,6 +458,9 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
|
enable_mm_embeds: bool = MultiModalConfig.enable_mm_embeds
|
||||||
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
interleave_mm_strings: bool = MultiModalConfig.interleave_mm_strings
|
||||||
|
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
|
||||||
|
MultiModalConfig, "media_io_kwargs"
|
||||||
|
)
|
||||||
mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
|
mm_processor_kwargs: dict[str, Any] | None = MultiModalConfig.mm_processor_kwargs
|
||||||
disable_mm_preprocessor_cache: bool = False # DEPRECATED
|
disable_mm_preprocessor_cache: bool = False # DEPRECATED
|
||||||
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
|
mm_processor_cache_gb: float = MultiModalConfig.mm_processor_cache_gb
|
||||||
@ -465,19 +474,9 @@ class EngineArgs:
|
|||||||
mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
|
mm_encoder_attn_backend: AttentionBackendEnum | str | None = (
|
||||||
MultiModalConfig.mm_encoder_attn_backend
|
MultiModalConfig.mm_encoder_attn_backend
|
||||||
)
|
)
|
||||||
|
io_processor_plugin: str | None = None
|
||||||
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
|
skip_mm_profiling: bool = MultiModalConfig.skip_mm_profiling
|
||||||
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
|
video_pruning_rate: float = MultiModalConfig.video_pruning_rate
|
||||||
# Renderer fields
|
|
||||||
tokenizer: str | None = None
|
|
||||||
tokenizer_mode: TokenizerMode | str = RendererConfig.tokenizer_mode
|
|
||||||
tokenizer_revision: str | None = RendererConfig.tokenizer_revision
|
|
||||||
skip_tokenizer_init: bool = RendererConfig.skip_tokenizer_init
|
|
||||||
io_processor_plugin: str | None = None
|
|
||||||
media_io_kwargs: dict[str, dict[str, Any]] = get_field(
|
|
||||||
RendererConfig, "media_io_kwargs"
|
|
||||||
)
|
|
||||||
allowed_local_media_path: str = RendererConfig.allowed_local_media_path
|
|
||||||
allowed_media_domains: list[str] | None = RendererConfig.allowed_media_domains
|
|
||||||
# LoRA fields
|
# LoRA fields
|
||||||
enable_lora: bool = False
|
enable_lora: bool = False
|
||||||
max_loras: int = LoRAConfig.max_loras
|
max_loras: int = LoRAConfig.max_loras
|
||||||
@ -628,14 +627,25 @@ class EngineArgs:
|
|||||||
model_group.add_argument("--runner", **model_kwargs["runner"])
|
model_group.add_argument("--runner", **model_kwargs["runner"])
|
||||||
model_group.add_argument("--convert", **model_kwargs["convert"])
|
model_group.add_argument("--convert", **model_kwargs["convert"])
|
||||||
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
|
model_group.add_argument("--task", **model_kwargs["task"], deprecated=True)
|
||||||
|
model_group.add_argument("--tokenizer", **model_kwargs["tokenizer"])
|
||||||
|
model_group.add_argument("--tokenizer-mode", **model_kwargs["tokenizer_mode"])
|
||||||
model_group.add_argument(
|
model_group.add_argument(
|
||||||
"--trust-remote-code", **model_kwargs["trust_remote_code"]
|
"--trust-remote-code", **model_kwargs["trust_remote_code"]
|
||||||
)
|
)
|
||||||
model_group.add_argument("--dtype", **model_kwargs["dtype"])
|
model_group.add_argument("--dtype", **model_kwargs["dtype"])
|
||||||
model_group.add_argument("--seed", **model_kwargs["seed"])
|
model_group.add_argument("--seed", **model_kwargs["seed"])
|
||||||
model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
|
model_group.add_argument("--hf-config-path", **model_kwargs["hf_config_path"])
|
||||||
|
model_group.add_argument(
|
||||||
|
"--allowed-local-media-path", **model_kwargs["allowed_local_media_path"]
|
||||||
|
)
|
||||||
|
model_group.add_argument(
|
||||||
|
"--allowed-media-domains", **model_kwargs["allowed_media_domains"]
|
||||||
|
)
|
||||||
model_group.add_argument("--revision", **model_kwargs["revision"])
|
model_group.add_argument("--revision", **model_kwargs["revision"])
|
||||||
model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
|
model_group.add_argument("--code-revision", **model_kwargs["code_revision"])
|
||||||
|
model_group.add_argument(
|
||||||
|
"--tokenizer-revision", **model_kwargs["tokenizer_revision"]
|
||||||
|
)
|
||||||
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
|
model_group.add_argument("--max-model-len", **model_kwargs["max_model_len"])
|
||||||
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
|
model_group.add_argument("--quantization", "-q", **model_kwargs["quantization"])
|
||||||
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
|
model_group.add_argument("--enforce-eager", **model_kwargs["enforce_eager"])
|
||||||
@ -647,6 +657,9 @@ class EngineArgs:
|
|||||||
model_group.add_argument(
|
model_group.add_argument(
|
||||||
"--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
|
"--disable-cascade-attn", **model_kwargs["disable_cascade_attn"]
|
||||||
)
|
)
|
||||||
|
model_group.add_argument(
|
||||||
|
"--skip-tokenizer-init", **model_kwargs["skip_tokenizer_init"]
|
||||||
|
)
|
||||||
model_group.add_argument(
|
model_group.add_argument(
|
||||||
"--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
|
"--enable-prompt-embeds", **model_kwargs["enable_prompt_embeds"]
|
||||||
)
|
)
|
||||||
@ -685,34 +698,8 @@ class EngineArgs:
|
|||||||
model_group.add_argument(
|
model_group.add_argument(
|
||||||
"--logits-processors", **model_kwargs["logits_processors"]
|
"--logits-processors", **model_kwargs["logits_processors"]
|
||||||
)
|
)
|
||||||
|
model_group.add_argument(
|
||||||
# Renderer arguments
|
"--io-processor-plugin", **model_kwargs["io_processor_plugin"]
|
||||||
renderer_kwargs = get_kwargs(RendererConfig)
|
|
||||||
renderer_group = parser.add_argument_group(
|
|
||||||
title="RendererConfig",
|
|
||||||
description=RendererConfig.__doc__,
|
|
||||||
)
|
|
||||||
renderer_group.add_argument("--tokenizer", **renderer_kwargs["tokenizer"])
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--tokenizer-mode", **renderer_kwargs["tokenizer_mode"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--tokenizer-revision", **renderer_kwargs["tokenizer_revision"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--skip-tokenizer-init", **renderer_kwargs["skip_tokenizer_init"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--media-io-kwargs", **renderer_kwargs["media_io_kwargs"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--allowed-local-media-path", **renderer_kwargs["allowed_local_media_path"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--allowed-media-domains", **renderer_kwargs["allowed_media_domains"]
|
|
||||||
)
|
|
||||||
renderer_group.add_argument(
|
|
||||||
"--io-processor-plugin", **renderer_kwargs["io_processor_plugin"]
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Model loading arguments
|
# Model loading arguments
|
||||||
@ -962,6 +949,9 @@ class EngineArgs:
|
|||||||
multimodal_group.add_argument(
|
multimodal_group.add_argument(
|
||||||
"--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
|
"--enable-mm-embeds", **multimodal_kwargs["enable_mm_embeds"]
|
||||||
)
|
)
|
||||||
|
multimodal_group.add_argument(
|
||||||
|
"--media-io-kwargs", **multimodal_kwargs["media_io_kwargs"]
|
||||||
|
)
|
||||||
multimodal_group.add_argument(
|
multimodal_group.add_argument(
|
||||||
"--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
|
"--mm-processor-kwargs", **multimodal_kwargs["mm_processor_kwargs"]
|
||||||
)
|
)
|
||||||
@ -1265,13 +1255,18 @@ class EngineArgs:
|
|||||||
runner=self.runner,
|
runner=self.runner,
|
||||||
convert=self.convert,
|
convert=self.convert,
|
||||||
task=self.task,
|
task=self.task,
|
||||||
|
tokenizer=self.tokenizer,
|
||||||
|
tokenizer_mode=self.tokenizer_mode,
|
||||||
trust_remote_code=self.trust_remote_code,
|
trust_remote_code=self.trust_remote_code,
|
||||||
|
allowed_local_media_path=self.allowed_local_media_path,
|
||||||
|
allowed_media_domains=self.allowed_media_domains,
|
||||||
dtype=self.dtype,
|
dtype=self.dtype,
|
||||||
seed=self.seed,
|
seed=self.seed,
|
||||||
revision=self.revision,
|
revision=self.revision,
|
||||||
code_revision=self.code_revision,
|
code_revision=self.code_revision,
|
||||||
hf_token=self.hf_token,
|
hf_token=self.hf_token,
|
||||||
hf_overrides=self.hf_overrides,
|
hf_overrides=self.hf_overrides,
|
||||||
|
tokenizer_revision=self.tokenizer_revision,
|
||||||
max_model_len=self.max_model_len,
|
max_model_len=self.max_model_len,
|
||||||
quantization=self.quantization,
|
quantization=self.quantization,
|
||||||
enforce_eager=self.enforce_eager,
|
enforce_eager=self.enforce_eager,
|
||||||
@ -1279,11 +1274,13 @@ class EngineArgs:
|
|||||||
logprobs_mode=self.logprobs_mode,
|
logprobs_mode=self.logprobs_mode,
|
||||||
disable_sliding_window=self.disable_sliding_window,
|
disable_sliding_window=self.disable_sliding_window,
|
||||||
disable_cascade_attn=self.disable_cascade_attn,
|
disable_cascade_attn=self.disable_cascade_attn,
|
||||||
|
skip_tokenizer_init=self.skip_tokenizer_init,
|
||||||
enable_prompt_embeds=self.enable_prompt_embeds,
|
enable_prompt_embeds=self.enable_prompt_embeds,
|
||||||
served_model_name=self.served_model_name,
|
served_model_name=self.served_model_name,
|
||||||
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
limit_mm_per_prompt=self.limit_mm_per_prompt,
|
||||||
enable_mm_embeds=self.enable_mm_embeds,
|
enable_mm_embeds=self.enable_mm_embeds,
|
||||||
interleave_mm_strings=self.interleave_mm_strings,
|
interleave_mm_strings=self.interleave_mm_strings,
|
||||||
|
media_io_kwargs=self.media_io_kwargs,
|
||||||
skip_mm_profiling=self.skip_mm_profiling,
|
skip_mm_profiling=self.skip_mm_profiling,
|
||||||
config_format=self.config_format,
|
config_format=self.config_format,
|
||||||
mm_processor_kwargs=self.mm_processor_kwargs,
|
mm_processor_kwargs=self.mm_processor_kwargs,
|
||||||
@ -1301,6 +1298,7 @@ class EngineArgs:
|
|||||||
override_attention_dtype=self.override_attention_dtype,
|
override_attention_dtype=self.override_attention_dtype,
|
||||||
logits_processors=self.logits_processors,
|
logits_processors=self.logits_processors,
|
||||||
video_pruning_rate=self.video_pruning_rate,
|
video_pruning_rate=self.video_pruning_rate,
|
||||||
|
io_processor_plugin=self.io_processor_plugin,
|
||||||
)
|
)
|
||||||
|
|
||||||
def validate_tensorizer_args(self):
|
def validate_tensorizer_args(self):
|
||||||
@ -1396,25 +1394,9 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
|
|
||||||
model_config = self.create_model_config()
|
model_config = self.create_model_config()
|
||||||
renderer_config = RendererConfig(
|
|
||||||
model_config=model_config,
|
|
||||||
tokenizer=self.tokenizer or "",
|
|
||||||
tokenizer_mode=self.tokenizer_mode,
|
|
||||||
tokenizer_revision=self.tokenizer_revision,
|
|
||||||
skip_tokenizer_init=self.skip_tokenizer_init,
|
|
||||||
io_processor_plugin=self.io_processor_plugin,
|
|
||||||
media_io_kwargs=self.media_io_kwargs,
|
|
||||||
allowed_local_media_path=self.allowed_local_media_path,
|
|
||||||
allowed_media_domains=self.allowed_media_domains,
|
|
||||||
)
|
|
||||||
|
|
||||||
model_config.recalculate_max_model_len(
|
|
||||||
model_config.original_max_model_len,
|
|
||||||
tokenizer=renderer_config.tokenizer,
|
|
||||||
tokenizer_revision=renderer_config.tokenizer_revision,
|
|
||||||
)
|
|
||||||
|
|
||||||
self.model = model_config.model
|
self.model = model_config.model
|
||||||
|
self.tokenizer = model_config.tokenizer
|
||||||
|
|
||||||
self._check_feature_supported(model_config)
|
self._check_feature_supported(model_config)
|
||||||
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
|
self._set_default_chunked_prefill_and_prefix_caching_args(model_config)
|
||||||
self._set_default_max_num_seqs_and_batched_tokens_args(
|
self._set_default_max_num_seqs_and_batched_tokens_args(
|
||||||
@ -1786,7 +1768,6 @@ class EngineArgs:
|
|||||||
)
|
)
|
||||||
config = VllmConfig(
|
config = VllmConfig(
|
||||||
model_config=model_config,
|
model_config=model_config,
|
||||||
renderer_config=renderer_config,
|
|
||||||
cache_config=cache_config,
|
cache_config=cache_config,
|
||||||
parallel_config=parallel_config,
|
parallel_config=parallel_config,
|
||||||
scheduler_config=scheduler_config,
|
scheduler_config=scheduler_config,
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from abc import ABC, abstractmethod
|
|||||||
from collections.abc import AsyncGenerator, Iterable, Mapping
|
from collections.abc import AsyncGenerator, Iterable, Mapping
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.outputs import PoolingRequestOutput, RequestOutput
|
from vllm.outputs import PoolingRequestOutput, RequestOutput
|
||||||
@ -22,7 +22,6 @@ class EngineClient(ABC):
|
|||||||
"""Protocol class for Clients to Engine"""
|
"""Protocol class for Clients to Engine"""
|
||||||
|
|
||||||
vllm_config: VllmConfig
|
vllm_config: VllmConfig
|
||||||
renderer_config: RendererConfig
|
|
||||||
model_config: ModelConfig
|
model_config: ModelConfig
|
||||||
input_processor: InputProcessor
|
input_processor: InputProcessor
|
||||||
io_processor: IOProcessor | None
|
io_processor: IOProcessor | None
|
||||||
|
|||||||
@ -44,7 +44,7 @@ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast, Processor
|
|||||||
from typing_extensions import Required, TypedDict
|
from typing_extensions import Required, TypedDict
|
||||||
|
|
||||||
from vllm import envs
|
from vllm import envs
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.models import SupportsMultiModal
|
from vllm.model_executor.models import SupportsMultiModal
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict, MultiModalUUIDDict
|
||||||
@ -452,10 +452,9 @@ This is needed because `lru_cache` does not cache when an exception happens.
|
|||||||
|
|
||||||
def _try_get_processor_chat_template(
|
def _try_get_processor_chat_template(
|
||||||
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
|
tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
|
||||||
*,
|
model_config: ModelConfig,
|
||||||
trust_remote_code: bool,
|
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
cache_key = (tokenizer.name_or_path, trust_remote_code)
|
cache_key = (tokenizer.name_or_path, model_config.trust_remote_code)
|
||||||
if cache_key in _PROCESSOR_CHAT_TEMPLATES:
|
if cache_key in _PROCESSOR_CHAT_TEMPLATES:
|
||||||
return _PROCESSOR_CHAT_TEMPLATES[cache_key]
|
return _PROCESSOR_CHAT_TEMPLATES[cache_key]
|
||||||
|
|
||||||
@ -467,7 +466,7 @@ def _try_get_processor_chat_template(
|
|||||||
PreTrainedTokenizerFast,
|
PreTrainedTokenizerFast,
|
||||||
ProcessorMixin,
|
ProcessorMixin,
|
||||||
),
|
),
|
||||||
trust_remote_code=trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
if (
|
if (
|
||||||
isinstance(processor, ProcessorMixin)
|
isinstance(processor, ProcessorMixin)
|
||||||
@ -500,10 +499,7 @@ def resolve_hf_chat_template(
|
|||||||
|
|
||||||
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
# 2nd priority: AutoProcessor chat template, unless tool calling is enabled
|
||||||
if tools is None:
|
if tools is None:
|
||||||
chat_template = _try_get_processor_chat_template(
|
chat_template = _try_get_processor_chat_template(tokenizer, model_config)
|
||||||
tokenizer,
|
|
||||||
trust_remote_code=model_config.trust_remote_code,
|
|
||||||
)
|
|
||||||
if chat_template is not None:
|
if chat_template is not None:
|
||||||
return chat_template
|
return chat_template
|
||||||
|
|
||||||
@ -517,10 +513,10 @@ def resolve_hf_chat_template(
|
|||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4th priority: Predefined fallbacks]
|
# 4th priority: Predefined fallbacks
|
||||||
path = get_chat_template_fallback_path(
|
path = get_chat_template_fallback_path(
|
||||||
model_type=model_config.hf_config.model_type,
|
model_type=model_config.hf_config.model_type,
|
||||||
tokenizer_name_or_path=tokenizer.name_or_path,
|
tokenizer_name_or_path=model_config.tokenizer,
|
||||||
)
|
)
|
||||||
if path is not None:
|
if path is not None:
|
||||||
logger.info_once(
|
logger.info_once(
|
||||||
@ -542,14 +538,14 @@ def _resolve_chat_template_content_format(
|
|||||||
tools: list[dict[str, Any]] | None,
|
tools: list[dict[str, Any]] | None,
|
||||||
tokenizer: TokenizerLike | None,
|
tokenizer: TokenizerLike | None,
|
||||||
*,
|
*,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> _ChatTemplateContentFormat:
|
) -> _ChatTemplateContentFormat:
|
||||||
if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
|
if isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
|
||||||
hf_chat_template = resolve_hf_chat_template(
|
hf_chat_template = resolve_hf_chat_template(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=chat_template,
|
chat_template=chat_template,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
hf_chat_template = None
|
hf_chat_template = None
|
||||||
@ -599,7 +595,7 @@ def resolve_chat_template_content_format(
|
|||||||
given_format: ChatTemplateContentFormatOption,
|
given_format: ChatTemplateContentFormatOption,
|
||||||
tokenizer: TokenizerLike | None,
|
tokenizer: TokenizerLike | None,
|
||||||
*,
|
*,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> _ChatTemplateContentFormat:
|
) -> _ChatTemplateContentFormat:
|
||||||
if given_format != "auto":
|
if given_format != "auto":
|
||||||
return given_format
|
return given_format
|
||||||
@ -608,7 +604,7 @@ def resolve_chat_template_content_format(
|
|||||||
chat_template,
|
chat_template,
|
||||||
tools,
|
tools,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
_log_chat_template_content_format(
|
_log_chat_template_content_format(
|
||||||
@ -631,32 +627,32 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
|||||||
maximum per prompt.
|
maximum per prompt.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, renderer_config: RendererConfig):
|
def __init__(self, model_config: ModelConfig):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._renderer_config = renderer_config
|
self._model_config = model_config
|
||||||
|
|
||||||
self._items_by_modality = defaultdict[str, list[_T | None]](list)
|
self._items_by_modality = defaultdict[str, list[_T | None]](list)
|
||||||
self._uuids_by_modality = defaultdict[str, list[str | None]](list)
|
self._uuids_by_modality = defaultdict[str, list[str | None]](list)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def renderer_config(self) -> RendererConfig:
|
def model_config(self) -> ModelConfig:
|
||||||
return self._renderer_config
|
return self._model_config
|
||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def model_cls(self) -> type[SupportsMultiModal]:
|
def model_cls(self) -> type[SupportsMultiModal]:
|
||||||
from vllm.model_executor.model_loader import get_model_cls
|
from vllm.model_executor.model_loader import get_model_cls
|
||||||
|
|
||||||
model_cls = get_model_cls(self.renderer_config.model_config)
|
model_cls = get_model_cls(self.model_config)
|
||||||
return cast(type[SupportsMultiModal], model_cls)
|
return cast(type[SupportsMultiModal], model_cls)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def allowed_local_media_path(self):
|
def allowed_local_media_path(self):
|
||||||
return self._renderer_config.allowed_local_media_path
|
return self._model_config.allowed_local_media_path
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def allowed_media_domains(self):
|
def allowed_media_domains(self):
|
||||||
return self._renderer_config.allowed_media_domains
|
return self._model_config.allowed_media_domains
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def mm_registry(self):
|
def mm_registry(self):
|
||||||
@ -664,7 +660,7 @@ class BaseMultiModalItemTracker(ABC, Generic[_T]):
|
|||||||
|
|
||||||
@cached_property
|
@cached_property
|
||||||
def mm_processor(self):
|
def mm_processor(self):
|
||||||
return self.mm_registry.create_processor(self.renderer_config)
|
return self.mm_registry.create_processor(self.model_config)
|
||||||
|
|
||||||
def add(
|
def add(
|
||||||
self,
|
self,
|
||||||
@ -855,20 +851,19 @@ class MultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._tracker = tracker
|
self._tracker = tracker
|
||||||
|
multimodal_config = self._tracker.model_config.multimodal_config
|
||||||
|
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||||
|
|
||||||
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
||||||
envs.VLLM_MEDIA_CONNECTOR,
|
envs.VLLM_MEDIA_CONNECTOR,
|
||||||
media_io_kwargs=self.renderer_config.media_io_kwargs,
|
media_io_kwargs=media_io_kwargs,
|
||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
allowed_media_domains=tracker.allowed_media_domains,
|
allowed_media_domains=tracker.allowed_media_domains,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
|
||||||
def renderer_config(self) -> RendererConfig:
|
|
||||||
return self._tracker.renderer_config
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_config(self) -> ModelConfig:
|
def model_config(self) -> ModelConfig:
|
||||||
return self.renderer_config.model_config
|
return self._tracker.model_config
|
||||||
|
|
||||||
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
||||||
image = self._connector.fetch_image(image_url) if image_url else None
|
image = self._connector.fetch_image(image_url) if image_url else None
|
||||||
@ -968,20 +963,18 @@ class AsyncMultiModalContentParser(BaseMultiModalContentParser):
|
|||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self._tracker = tracker
|
self._tracker = tracker
|
||||||
|
multimodal_config = self._tracker.model_config.multimodal_config
|
||||||
|
media_io_kwargs = getattr(multimodal_config, "media_io_kwargs", None)
|
||||||
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
self._connector: MediaConnector = MEDIA_CONNECTOR_REGISTRY.load(
|
||||||
envs.VLLM_MEDIA_CONNECTOR,
|
envs.VLLM_MEDIA_CONNECTOR,
|
||||||
media_io_kwargs=self.renderer_config.media_io_kwargs,
|
media_io_kwargs=media_io_kwargs,
|
||||||
allowed_local_media_path=tracker.allowed_local_media_path,
|
allowed_local_media_path=tracker.allowed_local_media_path,
|
||||||
allowed_media_domains=tracker.allowed_media_domains,
|
allowed_media_domains=tracker.allowed_media_domains,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
|
||||||
def renderer_config(self) -> RendererConfig:
|
|
||||||
return self._tracker.renderer_config
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def model_config(self) -> ModelConfig:
|
def model_config(self) -> ModelConfig:
|
||||||
return self.renderer_config.model_config
|
return self._tracker.model_config
|
||||||
|
|
||||||
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
def parse_image(self, image_url: str | None, uuid: str | None = None) -> None:
|
||||||
image_coro = self._connector.fetch_image_async(image_url) if image_url else None
|
image_coro = self._connector.fetch_image_async(image_url) if image_url else None
|
||||||
@ -1611,17 +1604,15 @@ def _postprocess_messages(messages: list[ConversationMessage]) -> None:
|
|||||||
|
|
||||||
def parse_chat_messages(
|
def parse_chat_messages(
|
||||||
messages: list[ChatCompletionMessageParam],
|
messages: list[ChatCompletionMessageParam],
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
content_format: _ChatTemplateContentFormat,
|
content_format: _ChatTemplateContentFormat,
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
list[ConversationMessage],
|
list[ConversationMessage],
|
||||||
MultiModalDataDict | None,
|
MultiModalDataDict | None,
|
||||||
MultiModalUUIDDict | None,
|
MultiModalUUIDDict | None,
|
||||||
]:
|
]:
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
conversation: list[ConversationMessage] = []
|
conversation: list[ConversationMessage] = []
|
||||||
mm_tracker = MultiModalItemTracker(renderer_config)
|
mm_tracker = MultiModalItemTracker(model_config)
|
||||||
|
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
sub_messages = _parse_chat_message_content(
|
sub_messages = _parse_chat_message_content(
|
||||||
@ -1644,17 +1635,15 @@ def parse_chat_messages(
|
|||||||
|
|
||||||
def parse_chat_messages_futures(
|
def parse_chat_messages_futures(
|
||||||
messages: list[ChatCompletionMessageParam],
|
messages: list[ChatCompletionMessageParam],
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
content_format: _ChatTemplateContentFormat,
|
content_format: _ChatTemplateContentFormat,
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
list[ConversationMessage],
|
list[ConversationMessage],
|
||||||
Awaitable[MultiModalDataDict | None],
|
Awaitable[MultiModalDataDict | None],
|
||||||
MultiModalUUIDDict | None,
|
MultiModalUUIDDict | None,
|
||||||
]:
|
]:
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
conversation: list[ConversationMessage] = []
|
conversation: list[ConversationMessage] = []
|
||||||
mm_tracker = AsyncMultiModalItemTracker(renderer_config)
|
mm_tracker = AsyncMultiModalItemTracker(model_config)
|
||||||
|
|
||||||
for msg in messages:
|
for msg in messages:
|
||||||
sub_messages = _parse_chat_message_content(
|
sub_messages = _parse_chat_message_content(
|
||||||
@ -1759,14 +1748,14 @@ def apply_hf_chat_template(
|
|||||||
chat_template: str | None,
|
chat_template: str | None,
|
||||||
tools: list[dict[str, Any]] | None,
|
tools: list[dict[str, Any]] | None,
|
||||||
*,
|
*,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> str:
|
) -> str:
|
||||||
hf_chat_template = resolve_hf_chat_template(
|
hf_chat_template = resolve_hf_chat_template(
|
||||||
tokenizer,
|
tokenizer,
|
||||||
chat_template=chat_template,
|
chat_template=chat_template,
|
||||||
tools=tools,
|
tools=tools,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hf_chat_template is None:
|
if hf_chat_template is None:
|
||||||
|
|||||||
@ -29,8 +29,8 @@ from vllm.config.model import (
|
|||||||
HfOverrides,
|
HfOverrides,
|
||||||
ModelDType,
|
ModelDType,
|
||||||
RunnerOption,
|
RunnerOption,
|
||||||
|
TokenizerMode,
|
||||||
)
|
)
|
||||||
from vllm.config.renderer import TokenizerMode
|
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
ChatCompletionMessageParam,
|
ChatCompletionMessageParam,
|
||||||
@ -343,7 +343,6 @@ class LLM:
|
|||||||
logger.info("Supported tasks: %s", supported_tasks)
|
logger.info("Supported tasks: %s", supported_tasks)
|
||||||
self.supported_tasks = supported_tasks
|
self.supported_tasks = supported_tasks
|
||||||
|
|
||||||
self.renderer_config = self.llm_engine.renderer_config
|
|
||||||
self.model_config = self.llm_engine.model_config
|
self.model_config = self.llm_engine.model_config
|
||||||
self.input_processor = self.llm_engine.input_processor
|
self.input_processor = self.llm_engine.input_processor
|
||||||
self.io_processor = self.llm_engine.io_processor
|
self.io_processor = self.llm_engine.io_processor
|
||||||
@ -809,13 +808,13 @@ class LLM:
|
|||||||
list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
|
list_of_messages = [cast(list[ChatCompletionMessageParam], messages)]
|
||||||
|
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
renderer_config = self.renderer_config
|
model_config = self.model_config
|
||||||
resolved_content_format = resolve_chat_template_content_format(
|
resolved_content_format = resolve_chat_template_content_format(
|
||||||
chat_template,
|
chat_template,
|
||||||
tools,
|
tools,
|
||||||
chat_template_content_format,
|
chat_template_content_format,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
_chat_template_kwargs: dict[str, Any] = dict(
|
_chat_template_kwargs: dict[str, Any] = dict(
|
||||||
@ -834,7 +833,7 @@ class LLM:
|
|||||||
# the chat message parsing for it.
|
# the chat message parsing for it.
|
||||||
conversation, mm_data, mm_uuids = parse_chat_messages(
|
conversation, mm_data, mm_uuids = parse_chat_messages(
|
||||||
msgs,
|
msgs,
|
||||||
renderer_config,
|
model_config,
|
||||||
content_format=resolved_content_format,
|
content_format=resolved_content_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -848,7 +847,7 @@ class LLM:
|
|||||||
prompt_str = apply_hf_chat_template(
|
prompt_str = apply_hf_chat_template(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
conversation=conversation,
|
conversation=conversation,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
**_chat_template_kwargs,
|
**_chat_template_kwargs,
|
||||||
)
|
)
|
||||||
# Special tokens are already included in chat templates so
|
# Special tokens are already included in chat templates so
|
||||||
@ -1291,7 +1290,6 @@ class LLM:
|
|||||||
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
lora_request: list[LoRARequest] | LoRARequest | None = None,
|
||||||
tokenization_kwargs: dict[str, Any] | None = None,
|
tokenization_kwargs: dict[str, Any] | None = None,
|
||||||
) -> list[ScoringRequestOutput]:
|
) -> list[ScoringRequestOutput]:
|
||||||
renderer_config = self.renderer_config
|
|
||||||
model_config = self.model_config
|
model_config = self.model_config
|
||||||
|
|
||||||
if isinstance(tokenizer, MistralTokenizer):
|
if isinstance(tokenizer, MistralTokenizer):
|
||||||
@ -1319,7 +1317,7 @@ class LLM:
|
|||||||
|
|
||||||
for q, d in input_pairs:
|
for q, d in input_pairs:
|
||||||
_, engine_prompt = get_score_prompt(
|
_, engine_prompt = get_score_prompt(
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
data_1=q,
|
data_1=q,
|
||||||
data_2=d,
|
data_2=d,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
|
|||||||
@ -1099,7 +1099,7 @@ async def init_app_state(
|
|||||||
logger.info("Supported tasks: %s", supported_tasks)
|
logger.info("Supported tasks: %s", supported_tasks)
|
||||||
|
|
||||||
resolved_chat_template = await process_chat_template(
|
resolved_chat_template = await process_chat_template(
|
||||||
args.chat_template, engine_client, vllm_config.renderer_config
|
args.chat_template, engine_client, vllm_config.model_config
|
||||||
)
|
)
|
||||||
|
|
||||||
if args.tool_server == "demo":
|
if args.tool_server == "demo":
|
||||||
|
|||||||
@ -122,7 +122,7 @@ class OpenAIServingCompletion(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
if self.renderer_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = await self.engine_client.get_tokenizer()
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|||||||
@ -291,7 +291,6 @@ class OpenAIServing:
|
|||||||
|
|
||||||
self.input_processor = self.models.input_processor
|
self.input_processor = self.models.input_processor
|
||||||
self.io_processor = self.models.io_processor
|
self.io_processor = self.models.io_processor
|
||||||
self.renderer_config = self.models.renderer_config
|
|
||||||
self.model_config = self.models.model_config
|
self.model_config = self.models.model_config
|
||||||
self.max_model_len = self.model_config.max_model_len
|
self.max_model_len = self.model_config.max_model_len
|
||||||
|
|
||||||
@ -1101,18 +1100,18 @@ class OpenAIServing:
|
|||||||
Sequence[RequestPrompt],
|
Sequence[RequestPrompt],
|
||||||
list[EngineTokensPrompt],
|
list[EngineTokensPrompt],
|
||||||
]:
|
]:
|
||||||
renderer_config = self.renderer_config
|
model_config = self.model_config
|
||||||
|
|
||||||
resolved_content_format = resolve_chat_template_content_format(
|
resolved_content_format = resolve_chat_template_content_format(
|
||||||
chat_template,
|
chat_template,
|
||||||
tool_dicts,
|
tool_dicts,
|
||||||
chat_template_content_format,
|
chat_template_content_format,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
|
conversation, mm_data_future, mm_uuids = parse_chat_messages_futures(
|
||||||
messages,
|
messages,
|
||||||
renderer_config,
|
model_config,
|
||||||
content_format=resolved_content_format,
|
content_format=resolved_content_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1139,14 +1138,14 @@ class OpenAIServing:
|
|||||||
request_prompt = tokenizer.apply_chat_template(
|
request_prompt = tokenizer.apply_chat_template(
|
||||||
conversation=conversation,
|
conversation=conversation,
|
||||||
messages=messages,
|
messages=messages,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
**_chat_template_kwargs,
|
**_chat_template_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
request_prompt = apply_hf_chat_template(
|
request_prompt = apply_hf_chat_template(
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
conversation=conversation,
|
conversation=conversation,
|
||||||
renderer_config=renderer_config,
|
model_config=model_config,
|
||||||
**_chat_template_kwargs,
|
**_chat_template_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -71,7 +71,6 @@ class OpenAIServingModels:
|
|||||||
|
|
||||||
self.input_processor = self.engine_client.input_processor
|
self.input_processor = self.engine_client.input_processor
|
||||||
self.io_processor = self.engine_client.io_processor
|
self.io_processor = self.engine_client.io_processor
|
||||||
self.renderer_config = self.engine_client.renderer_config
|
|
||||||
self.model_config = self.engine_client.model_config
|
self.model_config = self.engine_client.model_config
|
||||||
self.max_model_len = self.model_config.max_model_len
|
self.max_model_len = self.model_config.max_model_len
|
||||||
|
|
||||||
|
|||||||
@ -91,7 +91,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
self.task_type = task_type
|
self.task_type = task_type
|
||||||
|
|
||||||
self.asr_config = self.model_cls.get_speech_to_text_config(
|
self.asr_config = self.model_cls.get_speech_to_text_config(
|
||||||
self.renderer_config, task_type
|
self.model_config, task_type
|
||||||
)
|
)
|
||||||
|
|
||||||
self.enable_force_include_usage = enable_force_include_usage
|
self.enable_force_include_usage = enable_force_include_usage
|
||||||
@ -101,8 +101,8 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
self.tokenizer = cast(
|
self.tokenizer = cast(
|
||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
get_tokenizer(
|
get_tokenizer(
|
||||||
tokenizer_name=self.renderer_config.tokenizer,
|
tokenizer_name=self.model_config.tokenizer,
|
||||||
tokenizer_mode=self.renderer_config.tokenizer_mode,
|
tokenizer_mode=self.model_config.tokenizer_mode,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -154,7 +154,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
prompt = self.model_cls.get_generation_prompt(
|
prompt = self.model_cls.get_generation_prompt(
|
||||||
audio=chunk,
|
audio=chunk,
|
||||||
stt_config=self.asr_config,
|
stt_config=self.asr_config,
|
||||||
renderer_config=self.renderer_config,
|
model_config=self.model_config,
|
||||||
language=language,
|
language=language,
|
||||||
task_type=self.task_type,
|
task_type=self.task_type,
|
||||||
request_prompt=request.prompt,
|
request_prompt=request.prompt,
|
||||||
@ -428,7 +428,7 @@ class OpenAISpeechToText(OpenAIServing):
|
|||||||
if res.prompt_token_ids is not None:
|
if res.prompt_token_ids is not None:
|
||||||
num_prompt_tokens = len(res.prompt_token_ids)
|
num_prompt_tokens = len(res.prompt_token_ids)
|
||||||
if audio_tokens := self.model_cls.get_num_audio_tokens(
|
if audio_tokens := self.model_cls.get_num_audio_tokens(
|
||||||
audio_duration_s, self.asr_config, self.renderer_config
|
audio_duration_s, self.asr_config, self.model_config
|
||||||
):
|
):
|
||||||
num_prompt_tokens += audio_tokens
|
num_prompt_tokens += audio_tokens
|
||||||
|
|
||||||
|
|||||||
@ -94,7 +94,7 @@ class OpenAIServingPooling(OpenAIServing):
|
|||||||
try:
|
try:
|
||||||
lora_request = self._maybe_get_adapters(request)
|
lora_request = self._maybe_get_adapters(request)
|
||||||
|
|
||||||
if self.renderer_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = await self.engine_client.get_tokenizer()
|
tokenizer = await self.engine_client.get_tokenizer()
|
||||||
|
|||||||
@ -160,8 +160,10 @@ class ServingScores(OpenAIServing):
|
|||||||
data_1: str | ScoreContentPartParam,
|
data_1: str | ScoreContentPartParam,
|
||||||
data_2: str | ScoreContentPartParam,
|
data_2: str | ScoreContentPartParam,
|
||||||
) -> tuple[str, TokensPrompt]:
|
) -> tuple[str, TokensPrompt]:
|
||||||
|
model_config = self.model_config
|
||||||
|
|
||||||
full_prompt, engine_prompt = get_score_prompt(
|
full_prompt, engine_prompt = get_score_prompt(
|
||||||
renderer_config=self.renderer_config,
|
model_config=model_config,
|
||||||
data_1=data_1,
|
data_1=data_1,
|
||||||
data_2=data_2,
|
data_2=data_2,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
|
|||||||
@ -5,7 +5,7 @@ from typing import Any, TypeAlias, cast
|
|||||||
from torch.nn import CosineSimilarity
|
from torch.nn import CosineSimilarity
|
||||||
from typing_extensions import Required, TypedDict
|
from typing_extensions import Required, TypedDict
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
BaseMultiModalItemTracker,
|
BaseMultiModalItemTracker,
|
||||||
ChatCompletionContentPartImageEmbedsParam,
|
ChatCompletionContentPartImageEmbedsParam,
|
||||||
@ -88,9 +88,9 @@ def _validate_score_input_lens(
|
|||||||
def parse_score_data(
|
def parse_score_data(
|
||||||
data_1: str | ScoreContentPartParam,
|
data_1: str | ScoreContentPartParam,
|
||||||
data_2: str | ScoreContentPartParam,
|
data_2: str | ScoreContentPartParam,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> tuple[str, str, MultiModalDataDict | None]:
|
) -> tuple[str, str, MultiModalDataDict | None]:
|
||||||
mm_tracker = MultiModalItemTracker(renderer_config)
|
mm_tracker = MultiModalItemTracker(model_config)
|
||||||
|
|
||||||
content_1 = _parse_score_content(data_1, mm_tracker)
|
content_1 = _parse_score_content(data_1, mm_tracker)
|
||||||
content_2 = _parse_score_content(data_2, mm_tracker)
|
content_2 = _parse_score_content(data_2, mm_tracker)
|
||||||
@ -176,7 +176,7 @@ def post_process_tokens(
|
|||||||
|
|
||||||
|
|
||||||
def get_score_prompt(
|
def get_score_prompt(
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
tokenizer: TokenizerLike,
|
tokenizer: TokenizerLike,
|
||||||
tokenization_kwargs: dict[str, Any],
|
tokenization_kwargs: dict[str, Any],
|
||||||
data_1: str | ScoreContentPartParam,
|
data_1: str | ScoreContentPartParam,
|
||||||
@ -185,14 +185,11 @@ def get_score_prompt(
|
|||||||
prompt_1, prompt_2, mm_data = parse_score_data(
|
prompt_1, prompt_2, mm_data = parse_score_data(
|
||||||
data_1,
|
data_1,
|
||||||
data_2,
|
data_2,
|
||||||
renderer_config,
|
model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
from vllm.model_executor.model_loader import get_model_cls
|
from vllm.model_executor.model_loader import get_model_cls
|
||||||
|
|
||||||
model_config = renderer_config.model_config
|
|
||||||
model = get_model_cls(model_config)
|
model = get_model_cls(model_config)
|
||||||
|
|
||||||
if supports_score_template(model):
|
if supports_score_template(model):
|
||||||
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
|
full_prompt = apply_score_template(model_config, prompt_1, prompt_2)
|
||||||
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
|
prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs)
|
||||||
|
|||||||
@ -13,7 +13,7 @@ from fastapi import Request
|
|||||||
from fastapi.responses import JSONResponse, StreamingResponse
|
from fastapi.responses import JSONResponse, StreamingResponse
|
||||||
from starlette.background import BackgroundTask, BackgroundTasks
|
from starlette.background import BackgroundTask, BackgroundTasks
|
||||||
|
|
||||||
from vllm.config import RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
from vllm.engine.arg_utils import EngineArgs
|
||||||
from vllm.engine.protocol import EngineClient
|
from vllm.engine.protocol import EngineClient
|
||||||
from vllm.entrypoints.chat_utils import (
|
from vllm.entrypoints.chat_utils import (
|
||||||
@ -288,7 +288,7 @@ def process_lora_modules(
|
|||||||
async def process_chat_template(
|
async def process_chat_template(
|
||||||
args_chat_template: Path | str | None,
|
args_chat_template: Path | str | None,
|
||||||
engine_client: EngineClient,
|
engine_client: EngineClient,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> str | None:
|
) -> str | None:
|
||||||
resolved_chat_template = load_chat_template(args_chat_template)
|
resolved_chat_template = load_chat_template(args_chat_template)
|
||||||
if resolved_chat_template is not None:
|
if resolved_chat_template is not None:
|
||||||
@ -305,7 +305,7 @@ async def process_chat_template(
|
|||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
chat_template=None,
|
chat_template=None,
|
||||||
tools=None,
|
tools=None,
|
||||||
model_config=renderer_config.model_config,
|
model_config=model_config,
|
||||||
)
|
)
|
||||||
|
|
||||||
if hf_chat_template != resolved_chat_template:
|
if hf_chat_template != resolved_chat_template:
|
||||||
@ -314,6 +314,6 @@ async def process_chat_template(
|
|||||||
"It is different from official chat template '%s'. "
|
"It is different from official chat template '%s'. "
|
||||||
"This discrepancy may lead to performance degradation.",
|
"This discrepancy may lead to performance degradation.",
|
||||||
resolved_chat_template,
|
resolved_chat_template,
|
||||||
renderer_config.model_config.model,
|
model_config.model,
|
||||||
)
|
)
|
||||||
return resolved_chat_template
|
return resolved_chat_template
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from typing import Any, cast
|
|||||||
|
|
||||||
from typing_extensions import assert_never
|
from typing_extensions import assert_never
|
||||||
|
|
||||||
from vllm.config import RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
|
||||||
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
from vllm.multimodal.cache import BaseMultiModalProcessorCache
|
||||||
@ -45,15 +45,14 @@ logger = init_logger(__name__)
|
|||||||
class InputPreprocessor:
|
class InputPreprocessor:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
tokenizer: TokenizerLike | None,
|
tokenizer: TokenizerLike | None,
|
||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
|
mm_processor_cache: BaseMultiModalProcessorCache | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.renderer_config = renderer_config
|
self.model_config = model_config
|
||||||
self.model_config = renderer_config.model_config
|
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.mm_registry = mm_registry
|
self.mm_registry = mm_registry
|
||||||
self.mm_processor_cache = mm_processor_cache
|
self.mm_processor_cache = mm_processor_cache
|
||||||
@ -232,7 +231,7 @@ class InputPreprocessor:
|
|||||||
def _get_mm_processor(self) -> BaseMultiModalProcessor:
|
def _get_mm_processor(self) -> BaseMultiModalProcessor:
|
||||||
if not hasattr(self, "_mm_processor"):
|
if not hasattr(self, "_mm_processor"):
|
||||||
self._mm_processor = self.mm_registry.create_processor(
|
self._mm_processor = self.mm_registry.create_processor(
|
||||||
self.renderer_config,
|
self.model_config,
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
cache=self.mm_processor_cache,
|
cache=self.mm_processor_cache,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -415,7 +415,7 @@ def load_weights_using_from_2_way_softmax(
|
|||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
renderer_config = model.vllm_config.renderer_config
|
model_config = model.vllm_config.model_config
|
||||||
quant_config = model.vllm_config.quant_config
|
quant_config = model.vllm_config.quant_config
|
||||||
text_config = model.config.get_text_config()
|
text_config = model.config.get_text_config()
|
||||||
|
|
||||||
@ -447,10 +447,10 @@ def load_weights_using_from_2_way_softmax(
|
|||||||
from vllm.tokenizers import get_tokenizer
|
from vllm.tokenizers import get_tokenizer
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model_config.tokenizer,
|
||||||
revision=renderer_config.tokenizer_revision,
|
revision=model_config.tokenizer_revision,
|
||||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
tokenizer_mode=model_config.tokenizer_mode,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
false_id = tokenizer.convert_tokens_to_ids(tokens[0])
|
false_id = tokenizer.convert_tokens_to_ids(tokens[0])
|
||||||
@ -473,7 +473,7 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
|
|||||||
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
|
||||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||||
|
|
||||||
renderer_config = model.vllm_config.renderer_config
|
model_config = model.vllm_config.model_config
|
||||||
quant_config = model.vllm_config.quant_config
|
quant_config = model.vllm_config.quant_config
|
||||||
text_config = model.config.get_text_config()
|
text_config = model.config.get_text_config()
|
||||||
|
|
||||||
@ -501,10 +501,10 @@ def load_weights_no_post_processing(model, weights: Iterable[tuple[str, torch.Te
|
|||||||
from vllm.tokenizers import get_tokenizer
|
from vllm.tokenizers import get_tokenizer
|
||||||
|
|
||||||
tokenizer = get_tokenizer(
|
tokenizer = get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model_config.tokenizer,
|
||||||
revision=renderer_config.tokenizer_revision,
|
revision=model_config.tokenizer_revision,
|
||||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
tokenizer_mode=model_config.tokenizer_mode,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
)
|
)
|
||||||
|
|
||||||
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
|
token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
|
||||||
|
|||||||
@ -377,8 +377,8 @@ class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
|||||||
self.projector_config = config.projector_config
|
self.projector_config = config.projector_config
|
||||||
self.text_config = config.text_config
|
self.text_config = config.text_config
|
||||||
|
|
||||||
renderer_config = vllm_config.renderer_config
|
model_config = vllm_config.model_config
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
|
self.image_token_id = tokenizer.vocab[_IMAGE_TOKEN]
|
||||||
|
|
||||||
self.sam_model = build_sam_vit_b()
|
self.sam_model = build_sam_vit_b()
|
||||||
|
|||||||
@ -370,8 +370,8 @@ class DeepseekVLV2ForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
|
|||||||
self.projector_config = config.projector_config
|
self.projector_config = config.projector_config
|
||||||
self.text_config = config.text_config
|
self.text_config = config.text_config
|
||||||
|
|
||||||
renderer_config = vllm_config.renderer_config
|
model_config = vllm_config.model_config
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
|
self.image_token_id: int = tokenizer.vocab[_IMAGE_TOKEN]
|
||||||
|
|
||||||
self.vision = self._init_vision_module(
|
self.vision = self._init_vision_module(
|
||||||
|
|||||||
@ -18,7 +18,7 @@ from transformers.models.gemma3n import (
|
|||||||
)
|
)
|
||||||
from transformers.models.siglip import SiglipImageProcessorFast
|
from transformers.models.siglip import SiglipImageProcessorFast
|
||||||
|
|
||||||
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -760,7 +760,7 @@ class Gemma3nForConditionalGeneration(
|
|||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
language: Optional[str],
|
language: Optional[str],
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
@ -798,9 +798,7 @@ class Gemma3nForConditionalGeneration(
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls, model_config: ModelConfig, task_type: str
|
||||||
renderer_config: RendererConfig,
|
|
||||||
task_type: str,
|
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
return SpeechToTextConfig(
|
return SpeechToTextConfig(
|
||||||
# Let's set this to 30 as suggested in the docs for now, although
|
# Let's set this to 30 as suggested in the docs for now, although
|
||||||
|
|||||||
@ -34,7 +34,7 @@ import torch.nn.functional as F
|
|||||||
from torch import nn
|
from torch import nn
|
||||||
from transformers import BatchFeature, PretrainedConfig
|
from transformers import BatchFeature, PretrainedConfig
|
||||||
|
|
||||||
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear
|
||||||
@ -840,7 +840,7 @@ class GraniteSpeechForConditionalGeneration(
|
|||||||
def get_generation_prompt(
|
def get_generation_prompt(
|
||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
@ -861,7 +861,7 @@ class GraniteSpeechForConditionalGeneration(
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported task type {task_type}")
|
raise ValueError(f"Unsupported task type {task_type}")
|
||||||
|
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
chat = [dict(role="user", content=user_prompt)]
|
chat = [dict(role="user", content=user_prompt)]
|
||||||
prompt = tokenizer.apply_chat_template(
|
prompt = tokenizer.apply_chat_template(
|
||||||
chat,
|
chat,
|
||||||
@ -882,10 +882,10 @@ class GraniteSpeechForConditionalGeneration(
|
|||||||
cls,
|
cls,
|
||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> int | None:
|
||||||
"""Get the number of audio tokens for an audio duration in sec."""
|
"""Get the number of audio tokens for an audio duration in sec."""
|
||||||
processor = cached_processor_from_config(renderer_config)
|
processor = cached_processor_from_config(model_config)
|
||||||
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
|
hop_length = processor.audio_processor.melspec_kwargs["hop_length"]
|
||||||
proj_win_size = processor.audio_processor.projector_window_size
|
proj_win_size = processor.audio_processor.projector_window_size
|
||||||
ds_rate = processor.audio_processor.projector_downsample_rate
|
ds_rate = processor.audio_processor.projector_downsample_rate
|
||||||
@ -903,9 +903,7 @@ class GraniteSpeechForConditionalGeneration(
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls, model_config: ModelConfig, task_type: str
|
||||||
renderer_config: RendererConfig,
|
|
||||||
task_type: str,
|
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
"""Get the stt config for this model."""
|
"""Get the stt config for this model."""
|
||||||
# Default settings are reasonable for this model and we don't currently
|
# Default settings are reasonable for this model and we don't currently
|
||||||
|
|||||||
@ -6,7 +6,7 @@ import numpy as np
|
|||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
|
|
||||||
from vllm.config import RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.model_executor.layers.pooler import (
|
from vllm.model_executor.layers.pooler import (
|
||||||
DispatchPooler,
|
DispatchPooler,
|
||||||
@ -29,12 +29,12 @@ logger = init_logger(__name__)
|
|||||||
class GritLMMeanPool(nn.Module):
|
class GritLMMeanPool(nn.Module):
|
||||||
"""As `MeanPool`, but only includes non-instruction tokens."""
|
"""As `MeanPool`, but only includes non-instruction tokens."""
|
||||||
|
|
||||||
def __init__(self, renderer_config: RendererConfig):
|
def __init__(self, model_config: ModelConfig):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.renderer_config = renderer_config
|
self.model_config = model_config
|
||||||
|
|
||||||
tokenizer = cached_tokenizer_from_config(self.renderer_config)
|
tokenizer = cached_tokenizer_from_config(self.model_config)
|
||||||
|
|
||||||
# Collect the tokens needed for pattern matching.
|
# Collect the tokens needed for pattern matching.
|
||||||
# "▁<" is different from "_<". The former uses "▁" to indicate that
|
# "▁<" is different from "_<". The former uses "▁" to indicate that
|
||||||
@ -174,10 +174,10 @@ class GritLMMeanPool(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class GritLMPooler(Pooler):
|
class GritLMPooler(Pooler):
|
||||||
def __init__(self, renderer_config: RendererConfig):
|
def __init__(self, model_config: ModelConfig):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
self.pooling = GritLMMeanPool(renderer_config)
|
self.pooling = GritLMMeanPool(model_config)
|
||||||
self.head = PoolerHead(PoolerNormalize())
|
self.head = PoolerHead(PoolerNormalize())
|
||||||
|
|
||||||
def get_supported_tasks(self) -> Set[PoolingTask]:
|
def get_supported_tasks(self) -> Set[PoolingTask]:
|
||||||
@ -238,6 +238,6 @@ class GritLM(LlamaForCausalLM):
|
|||||||
self.pooler = DispatchPooler(
|
self.pooler = DispatchPooler(
|
||||||
{
|
{
|
||||||
"token_embed": Pooler.for_token_embed(pooler_config),
|
"token_embed": Pooler.for_token_embed(pooler_config),
|
||||||
"embed": GritLMPooler(vllm_config.renderer_config),
|
"embed": GritLMPooler(vllm_config.model_config),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from torch import Tensor
|
|||||||
from transformers.models.whisper.tokenization_whisper import LANGUAGES
|
from transformers.models.whisper.tokenization_whisper import LANGUAGES
|
||||||
from typing_extensions import Self, TypeIs
|
from typing_extensions import Self, TypeIs
|
||||||
|
|
||||||
from vllm.config import RendererConfig, SpeechToTextConfig
|
from vllm.config import ModelConfig, SpeechToTextConfig
|
||||||
from vllm.inputs import TokensPrompt
|
from vllm.inputs import TokensPrompt
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -887,7 +887,7 @@ class SupportsTranscription(Protocol):
|
|||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
@ -930,9 +930,7 @@ class SupportsTranscription(Protocol):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls, model_config: ModelConfig, task_type: Literal["transcribe", "translate"]
|
||||||
renderer_config: RendererConfig,
|
|
||||||
task_type: Literal["transcribe", "translate"],
|
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
"""Get the speech to text config for the ASR model."""
|
"""Get the speech to text config for the ASR model."""
|
||||||
...
|
...
|
||||||
@ -942,7 +940,7 @@ class SupportsTranscription(Protocol):
|
|||||||
cls,
|
cls,
|
||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> int | None:
|
||||||
"""
|
"""
|
||||||
Map from audio duration to number of audio tokens produced by the ASR
|
Map from audio duration to number of audio tokens produced by the ASR
|
||||||
|
|||||||
@ -182,7 +182,7 @@ class InternS1ProcessingInfo(BaseProcessingInfo):
|
|||||||
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
def get_hf_processor(self, **kwargs: object) -> InternVLProcessor:
|
||||||
hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
|
hf_processor = self.ctx.get_hf_processor(InternVLProcessor, **kwargs)
|
||||||
hf_processor.video_processor = cached_video_processor_from_config(
|
hf_processor.video_processor = cached_video_processor_from_config(
|
||||||
self.ctx.renderer_config,
|
self.ctx.model_config,
|
||||||
processor_cls=InternVLVideoProcessor,
|
processor_cls=InternVLVideoProcessor,
|
||||||
size=hf_processor.image_processor.size,
|
size=hf_processor.image_processor.size,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
|
|||||||
@ -1169,17 +1169,16 @@ class NemotronH_Nano_VL_V2(
|
|||||||
self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
|
self.mlp1 = self.mlp1.to(self.language_model.config.dtype)
|
||||||
|
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.model_config = vllm_config.model_config
|
||||||
|
|
||||||
# Pre-tokenize special tokens for video processing
|
# Pre-tokenize special tokens for video processing
|
||||||
# to avoid repeated tokenization
|
# to avoid repeated tokenization
|
||||||
self._tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
|
tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
|
||||||
self._img_start_token_ids = self._tokenizer.encode(
|
self._img_start_token_ids = tokenizer.encode(
|
||||||
IMG_START, add_special_tokens=False
|
IMG_START, add_special_tokens=False
|
||||||
)
|
)
|
||||||
self._img_end_token_ids = self._tokenizer.encode(
|
self._img_end_token_ids = tokenizer.encode(IMG_END, add_special_tokens=False)
|
||||||
IMG_END, add_special_tokens=False
|
self._img_context_token_ids = tokenizer.encode(
|
||||||
)
|
|
||||||
self._img_context_token_ids = self._tokenizer.encode(
|
|
||||||
IMG_CONTEXT, add_special_tokens=False
|
IMG_CONTEXT, add_special_tokens=False
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1365,7 +1364,7 @@ class NemotronH_Nano_VL_V2(
|
|||||||
input_embeds for the LLM.
|
input_embeds for the LLM.
|
||||||
"""
|
"""
|
||||||
device = video_embeddings.device
|
device = video_embeddings.device
|
||||||
tokenizer = self._tokenizer
|
tokenizer = cached_tokenizer_from_config(self.model_config)
|
||||||
|
|
||||||
# Generate video replacement token IDs using get_video_repl
|
# Generate video replacement token IDs using get_video_repl
|
||||||
# This tokenizes each frame separator independently, then uses pre-tokenized
|
# This tokenizes each frame separator independently, then uses pre-tokenized
|
||||||
|
|||||||
@ -347,7 +347,7 @@ class NemotronVLProcessingInfo(BaseInternVLProcessingInfo):
|
|||||||
|
|
||||||
def get_image_processor(self, **kwargs: object):
|
def get_image_processor(self, **kwargs: object):
|
||||||
return cached_image_processor_from_config(
|
return cached_image_processor_from_config(
|
||||||
self.ctx.renderer_config,
|
self.ctx.model_config,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -193,7 +193,7 @@ class PixtralProcessorAdapter:
|
|||||||
|
|
||||||
class PixtralProcessingInfo(BaseProcessingInfo):
|
class PixtralProcessingInfo(BaseProcessingInfo):
|
||||||
def get_tokenizer(self) -> MistralTokenizer:
|
def get_tokenizer(self) -> MistralTokenizer:
|
||||||
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
|
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
|
||||||
if not isinstance(tokenizer, MistralTokenizer):
|
if not isinstance(tokenizer, MistralTokenizer):
|
||||||
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
||||||
|
|
||||||
|
|||||||
@ -20,7 +20,7 @@ from mistral_common.tokens.tokenizers.audio import Audio, AudioEncoder
|
|||||||
from transformers import BatchFeature, TensorType, WhisperConfig
|
from transformers import BatchFeature, TensorType, WhisperConfig
|
||||||
from transformers.tokenization_utils_base import TextInput
|
from transformers.tokenization_utils_base import TextInput
|
||||||
|
|
||||||
from vllm.config import RendererConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
@ -176,7 +176,7 @@ class VoxtralProcessorAdapter:
|
|||||||
|
|
||||||
class VoxtralProcessingInfo(BaseProcessingInfo):
|
class VoxtralProcessingInfo(BaseProcessingInfo):
|
||||||
def get_tokenizer(self) -> MistralTokenizer:
|
def get_tokenizer(self) -> MistralTokenizer:
|
||||||
tokenizer = cached_tokenizer_from_config(self.ctx.renderer_config)
|
tokenizer = cached_tokenizer_from_config(self.ctx.model_config)
|
||||||
if not isinstance(tokenizer, MistralTokenizer):
|
if not isinstance(tokenizer, MistralTokenizer):
|
||||||
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
raise ValueError("This model requires `--tokenizer-mode mistral`")
|
||||||
|
|
||||||
@ -339,7 +339,7 @@ class VoxtralForConditionalGeneration(
|
|||||||
|
|
||||||
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.tokenizer = cached_tokenizer_from_config(vllm_config.renderer_config)
|
self.tokenizer = cached_tokenizer_from_config(vllm_config.model_config)
|
||||||
|
|
||||||
# update quant config to so that ignored module and target module names
|
# update quant config to so that ignored module and target module names
|
||||||
# match the vLLM model names
|
# match the vLLM model names
|
||||||
@ -450,11 +450,9 @@ class VoxtralForConditionalGeneration(
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls, model_config: ModelConfig, task_type: str
|
||||||
renderer_config: RendererConfig,
|
|
||||||
task_type: str,
|
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
audio_config = tokenizer.instruct.audio_encoder.audio_config
|
audio_config = tokenizer.instruct.audio_encoder.audio_config
|
||||||
max_audio_clip_s = audio_config.chunk_length_s
|
max_audio_clip_s = audio_config.chunk_length_s
|
||||||
sample_rate = audio_config.sampling_rate
|
sample_rate = audio_config.sampling_rate
|
||||||
@ -470,17 +468,17 @@ class VoxtralForConditionalGeneration(
|
|||||||
def get_generation_prompt(
|
def get_generation_prompt(
|
||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
renderer_config: RendererConfig, # not needed here
|
model_config: ModelConfig,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
request_prompt: str,
|
request_prompt: str,
|
||||||
to_language: str | None,
|
to_language: str | None,
|
||||||
) -> PromptType:
|
) -> PromptType:
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
|
audio = Audio(audio, int(stt_config.sample_rate), format="wav") # lossless
|
||||||
req = TranscriptionRequest(
|
req = TranscriptionRequest(
|
||||||
model=renderer_config.model_config.model,
|
model=model_config.model,
|
||||||
audio=RawAudio.from_audio(audio),
|
audio=RawAudio.from_audio(audio),
|
||||||
language=language,
|
language=language,
|
||||||
)
|
)
|
||||||
@ -496,14 +494,14 @@ class VoxtralForConditionalGeneration(
|
|||||||
cls,
|
cls,
|
||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> int | None:
|
||||||
"""
|
"""
|
||||||
Map from audio duration to number of audio tokens produced by the ASR
|
Map from audio duration to number of audio tokens produced by the ASR
|
||||||
model, without running a forward pass.
|
model, without running a forward pass.
|
||||||
This is used for estimating the amount of processing for this audio.
|
This is used for estimating the amount of processing for this audio.
|
||||||
"""
|
"""
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
adapter = VoxtralProcessorAdapter(tokenizer)
|
adapter = VoxtralProcessorAdapter(tokenizer)
|
||||||
return adapter.get_num_audio_tokens(
|
return adapter.get_num_audio_tokens(
|
||||||
int(audio_duration_s * stt_config.sample_rate)
|
int(audio_duration_s * stt_config.sample_rate)
|
||||||
|
|||||||
@ -19,7 +19,7 @@ from transformers.models.whisper.modeling_whisper import sinusoids
|
|||||||
from vllm.attention.backends.abstract import AttentionType
|
from vllm.attention.backends.abstract import AttentionType
|
||||||
from vllm.attention.layer import Attention, MultiHeadAttention
|
from vllm.attention.layer import Attention, MultiHeadAttention
|
||||||
from vllm.attention.layers.cross_attention import CrossAttention
|
from vllm.attention.layers.cross_attention import CrossAttention
|
||||||
from vllm.config import CacheConfig, RendererConfig, SpeechToTextConfig, VllmConfig
|
from vllm.config import CacheConfig, ModelConfig, SpeechToTextConfig, VllmConfig
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.distributed import get_tensor_model_parallel_world_size
|
from vllm.distributed import get_tensor_model_parallel_world_size
|
||||||
from vllm.inputs.data import PromptType
|
from vllm.inputs.data import PromptType
|
||||||
@ -811,7 +811,7 @@ class WhisperForConditionalGeneration(
|
|||||||
def get_generation_prompt(
|
def get_generation_prompt(
|
||||||
cls,
|
cls,
|
||||||
audio: np.ndarray,
|
audio: np.ndarray,
|
||||||
renderer_config: RendererConfig, # not needed here
|
model_config: ModelConfig, # not needed here
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
language: str | None,
|
language: str | None,
|
||||||
task_type: Literal["transcribe", "translate"],
|
task_type: Literal["transcribe", "translate"],
|
||||||
@ -847,11 +847,9 @@ class WhisperForConditionalGeneration(
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_speech_to_text_config(
|
def get_speech_to_text_config(
|
||||||
cls,
|
cls, model_config: ModelConfig, task_type: str
|
||||||
renderer_config: RendererConfig,
|
|
||||||
task_type: str,
|
|
||||||
) -> SpeechToTextConfig:
|
) -> SpeechToTextConfig:
|
||||||
processor = cached_processor_from_config(renderer_config)
|
processor = cached_processor_from_config(model_config)
|
||||||
|
|
||||||
return SpeechToTextConfig(
|
return SpeechToTextConfig(
|
||||||
max_audio_clip_s=processor.feature_extractor.chunk_length,
|
max_audio_clip_s=processor.feature_extractor.chunk_length,
|
||||||
@ -863,9 +861,9 @@ class WhisperForConditionalGeneration(
|
|||||||
cls,
|
cls,
|
||||||
audio_duration_s: float,
|
audio_duration_s: float,
|
||||||
stt_config: SpeechToTextConfig,
|
stt_config: SpeechToTextConfig,
|
||||||
renderer_config: RendererConfig,
|
model_config: ModelConfig,
|
||||||
) -> int | None:
|
) -> int | None:
|
||||||
processor = cached_processor_from_config(renderer_config)
|
processor = cached_processor_from_config(model_config)
|
||||||
hop_length = processor.feature_extractor.hop_length
|
hop_length = processor.feature_extractor.hop_length
|
||||||
assert hop_length is not None
|
assert hop_length is not None
|
||||||
# NOTE(NickLucche) user can't pass encoder
|
# NOTE(NickLucche) user can't pass encoder
|
||||||
|
|||||||
@ -31,7 +31,7 @@ from .inputs import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig, RendererConfig, VllmConfig
|
from vllm.config import ModelConfig, VllmConfig
|
||||||
|
|
||||||
from .processing import ResolvedPromptUpdate
|
from .processing import ResolvedPromptUpdate
|
||||||
from .registry import MultiModalRegistry
|
from .registry import MultiModalRegistry
|
||||||
@ -561,13 +561,13 @@ class ShmObjectStoreSenderCache(BaseMultiModalProcessorCache):
|
|||||||
|
|
||||||
|
|
||||||
def _enable_processor_cache(
|
def _enable_processor_cache(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
mm_registry: "MultiModalRegistry",
|
mm_registry: "MultiModalRegistry",
|
||||||
) -> bool:
|
) -> bool:
|
||||||
if not mm_registry.supports_multimodal_inputs(renderer_config):
|
if not mm_registry.supports_multimodal_inputs(model_config):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
mm_config = renderer_config.model_config.get_multimodal_config()
|
mm_config = model_config.get_multimodal_config()
|
||||||
return mm_config.mm_processor_cache_gb > 0
|
return mm_config.mm_processor_cache_gb > 0
|
||||||
|
|
||||||
|
|
||||||
@ -599,7 +599,7 @@ def processor_cache_from_config(
|
|||||||
"""Return a `BaseMultiModalProcessorCache`, if enabled."""
|
"""Return a `BaseMultiModalProcessorCache`, if enabled."""
|
||||||
model_config = vllm_config.model_config
|
model_config = vllm_config.model_config
|
||||||
|
|
||||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
if not _enable_processor_cache(model_config, mm_registry):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not _enable_ipc_cache(vllm_config):
|
if not _enable_ipc_cache(vllm_config):
|
||||||
@ -611,14 +611,14 @@ def processor_cache_from_config(
|
|||||||
|
|
||||||
|
|
||||||
def processor_only_cache_from_config(
|
def processor_only_cache_from_config(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
mm_registry: "MultiModalRegistry",
|
mm_registry: "MultiModalRegistry",
|
||||||
):
|
):
|
||||||
"""Return a `MultiModalProcessorOnlyCache`, if enabled."""
|
"""Return a `MultiModalProcessorOnlyCache`, if enabled."""
|
||||||
if not _enable_processor_cache(renderer_config, mm_registry):
|
if not _enable_processor_cache(model_config, mm_registry):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return MultiModalProcessorOnlyCache(renderer_config.model_config)
|
return MultiModalProcessorOnlyCache(model_config)
|
||||||
|
|
||||||
|
|
||||||
class BaseMultiModalReceiverCache(
|
class BaseMultiModalReceiverCache(
|
||||||
@ -787,7 +787,7 @@ def engine_receiver_cache_from_config(
|
|||||||
"""
|
"""
|
||||||
model_config = vllm_config.model_config
|
model_config = vllm_config.model_config
|
||||||
|
|
||||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
if not _enable_processor_cache(model_config, mm_registry):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not _enable_ipc_cache(vllm_config):
|
if not _enable_ipc_cache(vllm_config):
|
||||||
@ -809,7 +809,9 @@ def worker_receiver_cache_from_config(
|
|||||||
Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
|
Return a `BaseMultiModalReceiverCache` only when IPC caching is enabled and
|
||||||
mm_processor_cache_type=="shm".
|
mm_processor_cache_type=="shm".
|
||||||
"""
|
"""
|
||||||
if not _enable_processor_cache(vllm_config.renderer_config, mm_registry):
|
model_config = vllm_config.model_config
|
||||||
|
|
||||||
|
if not _enable_processor_cache(model_config, mm_registry):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
if not _enable_ipc_cache(vllm_config):
|
if not _enable_ipc_cache(vllm_config):
|
||||||
|
|||||||
@ -23,7 +23,7 @@ import torch
|
|||||||
from typing_extensions import TypeVar, assert_never
|
from typing_extensions import TypeVar, assert_never
|
||||||
|
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
from vllm.tokenizers import TokenizerLike
|
||||||
from vllm.transformers_utils.processor import cached_processor_from_config
|
from vllm.transformers_utils.processor import cached_processor_from_config
|
||||||
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
|
from vllm.utils.collection_utils import flatten_2d_lists, full_groupby
|
||||||
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
||||||
@ -53,7 +53,7 @@ if TYPE_CHECKING:
|
|||||||
from transformers.feature_extraction_utils import BatchFeature
|
from transformers.feature_extraction_utils import BatchFeature
|
||||||
from transformers.processing_utils import ProcessorMixin
|
from transformers.processing_utils import ProcessorMixin
|
||||||
|
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
|
|
||||||
from .cache import BaseMultiModalProcessorCache
|
from .cache import BaseMultiModalProcessorCache
|
||||||
from .profiling import BaseDummyInputsBuilder
|
from .profiling import BaseDummyInputsBuilder
|
||||||
@ -63,7 +63,6 @@ else:
|
|||||||
ProcessorMixin = object
|
ProcessorMixin = object
|
||||||
|
|
||||||
ModelConfig = object
|
ModelConfig = object
|
||||||
RendererConfig = object
|
|
||||||
|
|
||||||
BaseMultiModalProcessorCache = object
|
BaseMultiModalProcessorCache = object
|
||||||
|
|
||||||
@ -946,29 +945,12 @@ class InputProcessingContext:
|
|||||||
modify the inputs.
|
modify the inputs.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
renderer_config: RendererConfig
|
model_config: ModelConfig
|
||||||
"""The configuration of the renderer."""
|
"""The configuration of the model."""
|
||||||
|
|
||||||
tokenizer: TokenizerLike | None
|
tokenizer: TokenizerLike | None
|
||||||
"""The tokenizer used to tokenize the inputs."""
|
"""The tokenizer used to tokenize the inputs."""
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_config(
|
|
||||||
cls,
|
|
||||||
renderer_config: RendererConfig,
|
|
||||||
*,
|
|
||||||
tokenizer: TokenizerLike | None = None,
|
|
||||||
):
|
|
||||||
if tokenizer is None and not renderer_config.skip_tokenizer_init:
|
|
||||||
tokenizer = cached_tokenizer_from_config(renderer_config)
|
|
||||||
|
|
||||||
return cls(renderer_config, tokenizer)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def model_config(self) -> ModelConfig:
|
|
||||||
"""The configuration of the model."""
|
|
||||||
return self.renderer_config.model_config
|
|
||||||
|
|
||||||
def get_tokenizer(self) -> TokenizerLike:
|
def get_tokenizer(self) -> TokenizerLike:
|
||||||
if self.tokenizer is None:
|
if self.tokenizer is None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -1065,7 +1047,7 @@ class InputProcessingContext:
|
|||||||
typ = ProcessorMixin
|
typ = ProcessorMixin
|
||||||
|
|
||||||
return cached_processor_from_config(
|
return cached_processor_from_config(
|
||||||
self.renderer_config,
|
self.model_config,
|
||||||
processor_cls=typ,
|
processor_cls=typ,
|
||||||
tokenizer=self.tokenizer,
|
tokenizer=self.tokenizer,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
|
|||||||
@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast
|
|||||||
|
|
||||||
from vllm.config.multimodal import BaseDummyOptions
|
from vllm.config.multimodal import BaseDummyOptions
|
||||||
from vllm.logger import init_logger
|
from vllm.logger import init_logger
|
||||||
from vllm.tokenizers import TokenizerLike
|
from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config
|
||||||
|
|
||||||
from .cache import BaseMultiModalProcessorCache
|
from .cache import BaseMultiModalProcessorCache
|
||||||
from .processing import (
|
from .processing import (
|
||||||
@ -22,7 +22,7 @@ from .profiling import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
from vllm.model_executor.models.interfaces import SupportsMultiModal
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
@ -114,18 +114,17 @@ class MultiModalRegistry:
|
|||||||
|
|
||||||
return mm_options if len(mm_options) > 0 else None
|
return mm_options if len(mm_options) > 0 else None
|
||||||
|
|
||||||
def supports_multimodal_inputs(self, renderer_config: "RendererConfig") -> bool:
|
def supports_multimodal_inputs(self, model_config: "ModelConfig") -> bool:
|
||||||
"""
|
"""
|
||||||
Checks if the model supports multimodal inputs.
|
Checks if the model supports multimodal inputs.
|
||||||
Returns True if the model is multimodal with any non-zero supported
|
Returns True if the model is multimodal with any non-zero supported
|
||||||
modalities, otherwise returns False, effectively running in
|
modalities, otherwise returns False, effectively running in
|
||||||
text-only mode.
|
text-only mode.
|
||||||
"""
|
"""
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
info = self._create_processing_info(renderer_config, tokenizer=None)
|
info = self._create_processing_info(model_config, tokenizer=None)
|
||||||
supported_modalities = info.get_supported_mm_limits()
|
supported_modalities = info.get_supported_mm_limits()
|
||||||
|
|
||||||
mm_config = model_config.get_multimodal_config()
|
mm_config = model_config.get_multimodal_config()
|
||||||
@ -145,7 +144,7 @@ class MultiModalRegistry:
|
|||||||
|
|
||||||
def get_max_tokens_per_item_by_modality(
|
def get_max_tokens_per_item_by_modality(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
profiler_limits: Mapping[str, int] | None = None,
|
profiler_limits: Mapping[str, int] | None = None,
|
||||||
@ -154,11 +153,10 @@ class MultiModalRegistry:
|
|||||||
Get the maximum number of tokens per data item from each modality based
|
Get the maximum number of tokens per data item from each modality based
|
||||||
on underlying model configuration.
|
on underlying model configuration.
|
||||||
"""
|
"""
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
processor = self.create_processor(renderer_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
seq_len = model_config.max_model_len
|
seq_len = model_config.max_model_len
|
||||||
@ -173,7 +171,7 @@ class MultiModalRegistry:
|
|||||||
|
|
||||||
def get_mm_limits_per_prompt(
|
def get_mm_limits_per_prompt(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
*,
|
*,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
) -> Mapping[str, int]:
|
) -> Mapping[str, int]:
|
||||||
@ -181,11 +179,10 @@ class MultiModalRegistry:
|
|||||||
Get the maximum number of multi-modal input instances for each modality
|
Get the maximum number of multi-modal input instances for each modality
|
||||||
that are allowed per prompt for a model class.
|
that are allowed per prompt for a model class.
|
||||||
"""
|
"""
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
processor = self.create_processor(renderer_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
return profiler.get_mm_limits()
|
return profiler.get_mm_limits()
|
||||||
|
|
||||||
@ -231,21 +228,30 @@ class MultiModalRegistry:
|
|||||||
assert hasattr(model_cls, "_processor_factory")
|
assert hasattr(model_cls, "_processor_factory")
|
||||||
return cast("SupportsMultiModal", model_cls)
|
return cast("SupportsMultiModal", model_cls)
|
||||||
|
|
||||||
|
def _create_processing_ctx(
|
||||||
|
self,
|
||||||
|
model_config: "ModelConfig",
|
||||||
|
tokenizer: TokenizerLike | None = None,
|
||||||
|
) -> InputProcessingContext:
|
||||||
|
if tokenizer is None and not model_config.skip_tokenizer_init:
|
||||||
|
tokenizer = cached_tokenizer_from_config(model_config)
|
||||||
|
|
||||||
|
return InputProcessingContext(model_config, tokenizer)
|
||||||
|
|
||||||
def _create_processing_info(
|
def _create_processing_info(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
*,
|
*,
|
||||||
tokenizer: TokenizerLike | None = None,
|
tokenizer: TokenizerLike | None = None,
|
||||||
) -> BaseProcessingInfo:
|
) -> BaseProcessingInfo:
|
||||||
model_cls = self._get_model_cls(renderer_config.model_config)
|
model_cls = self._get_model_cls(model_config)
|
||||||
factories = model_cls._processor_factory
|
factories = model_cls._processor_factory
|
||||||
|
ctx = self._create_processing_ctx(model_config, tokenizer)
|
||||||
ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
|
|
||||||
return factories.info(ctx)
|
return factories.info(ctx)
|
||||||
|
|
||||||
def create_processor(
|
def create_processor(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
*,
|
*,
|
||||||
tokenizer: TokenizerLike | None = None,
|
tokenizer: TokenizerLike | None = None,
|
||||||
cache: BaseMultiModalProcessorCache | None = None,
|
cache: BaseMultiModalProcessorCache | None = None,
|
||||||
@ -253,19 +259,19 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create a multi-modal processor for a specific model and tokenizer.
|
Create a multi-modal processor for a specific model and tokenizer.
|
||||||
"""
|
"""
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if not model_config.is_multimodal_model:
|
if not model_config.is_multimodal_model:
|
||||||
raise ValueError(f"{model_config.model} is not a multimodal model")
|
raise ValueError(f"{model_config.model} is not a multimodal model")
|
||||||
|
|
||||||
model_cls = self._get_model_cls(model_config)
|
model_cls = self._get_model_cls(model_config)
|
||||||
factories = model_cls._processor_factory
|
factories = model_cls._processor_factory
|
||||||
|
|
||||||
ctx = InputProcessingContext.from_config(renderer_config, tokenizer=tokenizer)
|
ctx = self._create_processing_ctx(model_config, tokenizer)
|
||||||
|
|
||||||
return factories.build_processor(ctx, cache=cache)
|
return factories.build_processor(ctx, cache=cache)
|
||||||
|
|
||||||
def get_decoder_dummy_data(
|
def get_decoder_dummy_data(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int] | None = None,
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
*,
|
*,
|
||||||
@ -274,15 +280,15 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by `renderer_config`.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(renderer_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
# Extract configurable options from multimodal config.
|
# Extract configurable options from multimodal config.
|
||||||
# Only include modalities that use advanced option types so legacy
|
# Only include modalities that use advanced option types so legacy
|
||||||
# count-only behavior remains unchanged.
|
# count-only behavior remains unchanged.
|
||||||
mm_options = self._extract_mm_options(renderer_config.model_config)
|
mm_options = self._extract_mm_options(model_config)
|
||||||
|
|
||||||
dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
|
dummy_data = profiler.get_decoder_dummy_data(seq_len, mm_counts, mm_options)
|
||||||
|
|
||||||
@ -298,7 +304,7 @@ class MultiModalRegistry:
|
|||||||
|
|
||||||
def get_encoder_dummy_data(
|
def get_encoder_dummy_data(
|
||||||
self,
|
self,
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
seq_len: int,
|
seq_len: int,
|
||||||
mm_counts: Mapping[str, int] | None = None,
|
mm_counts: Mapping[str, int] | None = None,
|
||||||
*,
|
*,
|
||||||
@ -307,15 +313,15 @@ class MultiModalRegistry:
|
|||||||
"""
|
"""
|
||||||
Create dummy data for profiling the memory usage of a model.
|
Create dummy data for profiling the memory usage of a model.
|
||||||
|
|
||||||
The model is identified by `renderer_config`.
|
The model is identified by `model_config`.
|
||||||
"""
|
"""
|
||||||
processor = self.create_processor(renderer_config, cache=cache)
|
processor = self.create_processor(model_config, cache=cache)
|
||||||
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
profiler: MultiModalProfiler = MultiModalProfiler(processor)
|
||||||
|
|
||||||
# Extract configurable options from multimodal config.
|
# Extract configurable options from multimodal config.
|
||||||
# Only include modalities that use advanced option types so legacy
|
# Only include modalities that use advanced option types so legacy
|
||||||
# count-only behavior remains unchanged.
|
# count-only behavior remains unchanged.
|
||||||
mm_options = self._extract_mm_options(renderer_config.model_config)
|
mm_options = self._extract_mm_options(model_config)
|
||||||
|
|
||||||
dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
|
dummy_data = profiler.get_encoder_dummy_data(seq_len, mm_counts, mm_options)
|
||||||
|
|
||||||
@ -330,15 +336,13 @@ class MultiModalRegistry:
|
|||||||
|
|
||||||
return dummy_data
|
return dummy_data
|
||||||
|
|
||||||
def get_encdec_max_encoder_len(self, renderer_config: "RendererConfig") -> int:
|
def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int:
|
||||||
"""
|
"""
|
||||||
Get the maximum length of the encoder input for encoder-decoder models.
|
Get the maximum length of the encoder input for encoder-decoder models.
|
||||||
"""
|
"""
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if not model_config.is_encoder_decoder:
|
if not model_config.is_encoder_decoder:
|
||||||
return 0
|
return 0
|
||||||
|
max_tokens = self.get_max_tokens_per_item_by_modality(model_config)
|
||||||
max_tokens = self.get_max_tokens_per_item_by_modality(renderer_config)
|
|
||||||
if not max_tokens:
|
if not max_tokens:
|
||||||
# TODO - this function assumes encoder-decoder models are
|
# TODO - this function assumes encoder-decoder models are
|
||||||
# multimodal. This will need to change when adding support for more
|
# multimodal. This will need to change when adding support for more
|
||||||
|
|||||||
@ -24,7 +24,7 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
|
|||||||
from .protocol import TokenizerLike
|
from .protocol import TokenizerLike
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import RendererConfig
|
from vllm.config import ModelConfig
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -205,18 +205,18 @@ def get_tokenizer(
|
|||||||
cached_get_tokenizer = lru_cache(get_tokenizer)
|
cached_get_tokenizer = lru_cache(get_tokenizer)
|
||||||
|
|
||||||
|
|
||||||
def cached_tokenizer_from_config(renderer_config: "RendererConfig", **kwargs):
|
def cached_tokenizer_from_config(model_config: "ModelConfig", **kwargs):
|
||||||
return cached_get_tokenizer(
|
return cached_get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model_config.tokenizer,
|
||||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
tokenizer_mode=model_config.tokenizer_mode,
|
||||||
revision=renderer_config.tokenizer_revision,
|
revision=model_config.tokenizer_revision,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def init_tokenizer_from_config(renderer_config: "RendererConfig"):
|
def init_tokenizer_from_config(model_config: "ModelConfig"):
|
||||||
runner_type = renderer_config.model_config.runner_type
|
runner_type = model_config.runner_type
|
||||||
if runner_type == "generate" or runner_type == "draft":
|
if runner_type == "generate" or runner_type == "draft":
|
||||||
truncation_side = "left"
|
truncation_side = "left"
|
||||||
elif runner_type == "pooling":
|
elif runner_type == "pooling":
|
||||||
@ -225,9 +225,9 @@ def init_tokenizer_from_config(renderer_config: "RendererConfig"):
|
|||||||
assert_never(runner_type)
|
assert_never(runner_type)
|
||||||
|
|
||||||
return get_tokenizer(
|
return get_tokenizer(
|
||||||
renderer_config.tokenizer,
|
model_config.tokenizer,
|
||||||
tokenizer_mode=renderer_config.tokenizer_mode,
|
tokenizer_mode=model_config.tokenizer_mode,
|
||||||
trust_remote_code=renderer_config.trust_remote_code,
|
trust_remote_code=model_config.trust_remote_code,
|
||||||
revision=renderer_config.tokenizer_revision,
|
revision=model_config.tokenizer_revision,
|
||||||
truncation_side=truncation_side,
|
truncation_side=truncation_side,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -23,7 +23,7 @@ from vllm.transformers_utils.utils import convert_model_repo_to_path
|
|||||||
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import ModelConfig, RendererConfig
|
from vllm.config import ModelConfig
|
||||||
|
|
||||||
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
_P = TypeVar("_P", bound=ProcessorMixin, default=ProcessorMixin)
|
||||||
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
|
_V = TypeVar("_V", bound=BaseVideoProcessor, default=BaseVideoProcessor)
|
||||||
@ -233,18 +233,17 @@ def cached_get_processor_without_dynamic_kwargs(
|
|||||||
|
|
||||||
|
|
||||||
def cached_processor_from_config(
|
def cached_processor_from_config(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
|
processor_cls: type[_P] | tuple[type[_P], ...] = ProcessorMixin,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> _P:
|
) -> _P:
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if is_gguf(model_config.model):
|
if is_gguf(model_config.model):
|
||||||
assert not is_gguf(renderer_config.tokenizer), (
|
assert not is_gguf(model_config.tokenizer), (
|
||||||
"For multimodal GGUF models, the original tokenizer "
|
"For multimodal GGUF models, the original tokenizer "
|
||||||
"should be used to correctly load processor."
|
"should be used to correctly load processor."
|
||||||
)
|
)
|
||||||
model = renderer_config.tokenizer
|
model = model_config.tokenizer
|
||||||
revision = renderer_config.tokenizer_revision
|
revision = model_config.tokenizer_revision
|
||||||
else:
|
else:
|
||||||
model = model_config.model
|
model = model_config.model
|
||||||
revision = model_config.revision
|
revision = model_config.revision
|
||||||
@ -298,11 +297,9 @@ cached_get_feature_extractor = lru_cache(get_feature_extractor)
|
|||||||
|
|
||||||
|
|
||||||
def cached_feature_extractor_from_config(
|
def cached_feature_extractor_from_config(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
return cached_get_feature_extractor(
|
return cached_get_feature_extractor(
|
||||||
model_config.model,
|
model_config.model,
|
||||||
revision=model_config.revision,
|
revision=model_config.revision,
|
||||||
@ -351,17 +348,16 @@ cached_get_image_processor = lru_cache(get_image_processor)
|
|||||||
|
|
||||||
|
|
||||||
def cached_image_processor_from_config(
|
def cached_image_processor_from_config(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
model_config = renderer_config.model_config
|
|
||||||
if is_gguf(model_config.model):
|
if is_gguf(model_config.model):
|
||||||
assert not is_gguf(renderer_config.tokenizer), (
|
assert not is_gguf(model_config.tokenizer), (
|
||||||
"For multimodal GGUF models, the original tokenizer "
|
"For multimodal GGUF models, the original tokenizer "
|
||||||
"should be used to correctly load image processor."
|
"should be used to correctly load image processor."
|
||||||
)
|
)
|
||||||
model = renderer_config.tokenizer
|
model = model_config.tokenizer
|
||||||
revision = renderer_config.tokenizer_revision
|
revision = model_config.tokenizer_revision
|
||||||
else:
|
else:
|
||||||
model = model_config.model
|
model = model_config.model
|
||||||
revision = model_config.revision
|
revision = model_config.revision
|
||||||
@ -415,12 +411,10 @@ cached_get_video_processor = lru_cache(get_video_processor)
|
|||||||
|
|
||||||
|
|
||||||
def cached_video_processor_from_config(
|
def cached_video_processor_from_config(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
processor_cls: type[_V] | None = None,
|
processor_cls: type[_V] | None = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
):
|
):
|
||||||
model_config = renderer_config.model_config
|
|
||||||
|
|
||||||
return cached_get_video_processor(
|
return cached_get_video_processor(
|
||||||
model_config.model,
|
model_config.model,
|
||||||
revision=model_config.revision,
|
revision=model_config.revision,
|
||||||
|
|||||||
@ -10,7 +10,7 @@ from vllm.multimodal import MultiModalRegistry
|
|||||||
from vllm.v1.request import Request
|
from vllm.v1.request import Request
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from vllm.config import RendererConfig, SchedulerConfig
|
from vllm.config import ModelConfig, SchedulerConfig
|
||||||
|
|
||||||
logger = init_logger(__name__)
|
logger = init_logger(__name__)
|
||||||
|
|
||||||
@ -250,7 +250,7 @@ class EncoderCacheManager:
|
|||||||
|
|
||||||
|
|
||||||
def compute_encoder_budget(
|
def compute_encoder_budget(
|
||||||
renderer_config: "RendererConfig",
|
model_config: "ModelConfig",
|
||||||
scheduler_config: "SchedulerConfig",
|
scheduler_config: "SchedulerConfig",
|
||||||
mm_registry: MultiModalRegistry,
|
mm_registry: MultiModalRegistry,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
@ -263,9 +263,9 @@ def compute_encoder_budget(
|
|||||||
- Space budget for encoder cache size, measured in number of tokens
|
- Space budget for encoder cache size, measured in number of tokens
|
||||||
from the input sequence.
|
from the input sequence.
|
||||||
"""
|
"""
|
||||||
if mm_registry.supports_multimodal_inputs(renderer_config):
|
if mm_registry.supports_multimodal_inputs(model_config):
|
||||||
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
|
max_tokens_by_modality = mm_registry.get_max_tokens_per_item_by_modality(
|
||||||
renderer_config
|
model_config
|
||||||
)
|
)
|
||||||
|
|
||||||
return compute_mm_encoder_budget(
|
return compute_mm_encoder_budget(
|
||||||
|
|||||||
@ -164,7 +164,7 @@ class Scheduler(SchedulerInterface):
|
|||||||
# This can be changed when we make encoder cache for embedding caching
|
# This can be changed when we make encoder cache for embedding caching
|
||||||
# across requests.
|
# across requests.
|
||||||
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
encoder_compute_budget, encoder_cache_size = compute_encoder_budget(
|
||||||
renderer_config=vllm_config.renderer_config,
|
model_config=vllm_config.model_config,
|
||||||
scheduler_config=vllm_config.scheduler_config,
|
scheduler_config=vllm_config.scheduler_config,
|
||||||
mm_registry=mm_registry,
|
mm_registry=mm_registry,
|
||||||
)
|
)
|
||||||
|
|||||||
@ -91,7 +91,6 @@ class AsyncLLM(EngineClient):
|
|||||||
# Ensure we can serialize custom transformer configs
|
# Ensure we can serialize custom transformer configs
|
||||||
maybe_register_config_serialize_by_value()
|
maybe_register_config_serialize_by_value()
|
||||||
|
|
||||||
self.renderer_config = vllm_config.renderer_config
|
|
||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
self.vllm_config = vllm_config
|
self.vllm_config = vllm_config
|
||||||
self.observability_config = vllm_config.observability_config
|
self.observability_config = vllm_config.observability_config
|
||||||
@ -109,15 +108,15 @@ class AsyncLLM(EngineClient):
|
|||||||
"enabling logging without default stat loggers."
|
"enabling logging without default stat loggers."
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.renderer_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = init_tokenizer_from_config(self.renderer_config)
|
tokenizer = init_tokenizer_from_config(self.model_config)
|
||||||
|
|
||||||
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
|
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
|
||||||
self.io_processor = get_io_processor(
|
self.io_processor = get_io_processor(
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
self.renderer_config.io_processor_plugin,
|
self.model_config.io_processor_plugin,
|
||||||
)
|
)
|
||||||
|
|
||||||
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
|
# OutputProcessor (converts EngineCoreOutputs --> RequestOutput).
|
||||||
|
|||||||
@ -43,7 +43,6 @@ class InputProcessor:
|
|||||||
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
|
||||||
) -> None:
|
) -> None:
|
||||||
self.vllm_config = vllm_config
|
self.vllm_config = vllm_config
|
||||||
self.renderer_config = vllm_config.renderer_config
|
|
||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
self.cache_config = vllm_config.cache_config
|
self.cache_config = vllm_config.cache_config
|
||||||
self.lora_config = vllm_config.lora_config
|
self.lora_config = vllm_config.lora_config
|
||||||
@ -55,7 +54,7 @@ class InputProcessor:
|
|||||||
self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
|
self.mm_processor_cache = processor_cache_from_config(vllm_config, mm_registry)
|
||||||
|
|
||||||
self.input_preprocessor = InputPreprocessor(
|
self.input_preprocessor = InputPreprocessor(
|
||||||
self.renderer_config,
|
self.model_config,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
mm_registry,
|
mm_registry,
|
||||||
mm_processor_cache=self.mm_processor_cache,
|
mm_processor_cache=self.mm_processor_cache,
|
||||||
@ -253,7 +252,7 @@ class InputProcessor:
|
|||||||
if not params.structured_outputs or not self.structured_outputs_config:
|
if not params.structured_outputs or not self.structured_outputs_config:
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.renderer_config.skip_tokenizer_init and params.structured_outputs:
|
if self.model_config.skip_tokenizer_init and params.structured_outputs:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501
|
"Structured outputs requires a tokenizer so it can't be used with 'skip_tokenizer_init'" # noqa: E501
|
||||||
)
|
)
|
||||||
@ -583,7 +582,7 @@ class InputProcessor:
|
|||||||
if prompt_type == "encoder" and model_config.is_multimodal_model:
|
if prompt_type == "encoder" and model_config.is_multimodal_model:
|
||||||
mm_registry = self.input_preprocessor.mm_registry
|
mm_registry = self.input_preprocessor.mm_registry
|
||||||
mm_processor = mm_registry.create_processor(
|
mm_processor = mm_registry.create_processor(
|
||||||
self.renderer_config,
|
model_config,
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
)
|
)
|
||||||
assert isinstance(mm_processor, EncDecMultiModalProcessor)
|
assert isinstance(mm_processor, EncDecMultiModalProcessor)
|
||||||
|
|||||||
@ -60,7 +60,6 @@ class LLMEngine:
|
|||||||
) -> None:
|
) -> None:
|
||||||
self.vllm_config = vllm_config
|
self.vllm_config = vllm_config
|
||||||
self.observability_config = vllm_config.observability_config
|
self.observability_config = vllm_config.observability_config
|
||||||
self.renderer_config = vllm_config.renderer_config
|
|
||||||
self.model_config = vllm_config.model_config
|
self.model_config = vllm_config.model_config
|
||||||
self.cache_config = vllm_config.cache_config
|
self.cache_config = vllm_config.cache_config
|
||||||
|
|
||||||
@ -84,15 +83,15 @@ class LLMEngine:
|
|||||||
self.dp_group = None
|
self.dp_group = None
|
||||||
self.should_execute_dummy_batch = False
|
self.should_execute_dummy_batch = False
|
||||||
|
|
||||||
if self.renderer_config.skip_tokenizer_init:
|
if self.model_config.skip_tokenizer_init:
|
||||||
tokenizer = None
|
tokenizer = None
|
||||||
else:
|
else:
|
||||||
tokenizer = init_tokenizer_from_config(self.renderer_config)
|
tokenizer = init_tokenizer_from_config(self.model_config)
|
||||||
|
|
||||||
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
|
self.input_processor = InputProcessor(self.vllm_config, tokenizer)
|
||||||
self.io_processor = get_io_processor(
|
self.io_processor = get_io_processor(
|
||||||
self.vllm_config,
|
self.vllm_config,
|
||||||
self.renderer_config.io_processor_plugin,
|
self.model_config.io_processor_plugin,
|
||||||
)
|
)
|
||||||
|
|
||||||
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
|
# OutputProcessor (convert EngineCoreOutputs --> RequestOutput).
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user