diff --git a/tests/models/multimodal/processing/test_common.py b/tests/models/multimodal/processing/test_common.py index 0115863f5626..a84999cfbf4f 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -83,8 +83,8 @@ def _test_processing_correctness( } tokenizer_encode_kwargs = {} - if model_config.hf_config.model_type in ("mllama", "whisper"): - # For some encoder-decoder models, tokenizer will always add bos_token + if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"): + # For some multimodal models, tokenizer will always add bos_token # at the beginning of prompt by default, causing hf_processor outputs # incorrect token ids. So we need use `add_special_tokens=False` here # to leave bos_token to be added by the processor. @@ -172,7 +172,7 @@ def _test_processing_correctness( "Qwen/Qwen2-VL-2B-Instruct", "Qwen/Qwen2.5-VL-3B-Instruct", "Qwen/Qwen2-Audio-7B-Instruct", - "fixie-ai/ultravox-v0_5-llama-3_2-1b", + "fixie-ai/ultravox-v0_4", "openai/whisper-large-v3", ]) @pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0]) diff --git a/tests/models/registry.py b/tests/models/registry.py index 566a4418feb1..b47eaef30bf2 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -284,7 +284,7 @@ _MULTIMODAL_EXAMPLE_MODELS = { "Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501 "Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501 min_transformers_version="4.49"), # noqa: E501 - "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b", + "UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_4", trust_remote_code=True), # [Encoder-decoder] "MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501 diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 1dbba3c50b19..b8d4aef252e5 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -146,7 +146,8 @@ class UltravoxMultiModalProcessor( ) -> BatchFeature: # Text-only input not supported in composite processor if not mm_data or not mm_data.get("audios", []): - prompt_ids = self.info.get_tokenizer().encode(prompt) + prompt_ids = self.info.get_tokenizer().encode( + prompt, add_special_tokens=False) prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids) return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt") @@ -185,16 +186,6 @@ class UltravoxMultiModalProcessor( ) return BatchFeature(combined_outputs) - def _apply_hf_processor_tokens_only( - self, - prompt_tokens: list[int], - ) -> list[int]: - # HF processor omits bos_token_id by setting add_special_tokens=False - tokenizer = self.info.get_tokenizer() - assert prompt_tokens[0] == tokenizer.bos_token_id - - return prompt_tokens[1:] - def _get_mm_fields_config( self, hf_inputs: BatchFeature,