mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2025-12-10 09:06:03 +08:00
[Misc] Fix input processing for Ultravox (#13871)
This commit is contained in:
parent
5157338ed9
commit
7ca1da020f
@ -83,8 +83,8 @@ def _test_processing_correctness(
|
|||||||
}
|
}
|
||||||
|
|
||||||
tokenizer_encode_kwargs = {}
|
tokenizer_encode_kwargs = {}
|
||||||
if model_config.hf_config.model_type in ("mllama", "whisper"):
|
if model_config.hf_config.model_type in ("mllama", "whisper", "ultravox"):
|
||||||
# For some encoder-decoder models, tokenizer will always add bos_token
|
# For some multimodal models, tokenizer will always add bos_token
|
||||||
# at the beginning of prompt by default, causing hf_processor outputs
|
# at the beginning of prompt by default, causing hf_processor outputs
|
||||||
# incorrect token ids. So we need use `add_special_tokens=False` here
|
# incorrect token ids. So we need use `add_special_tokens=False` here
|
||||||
# to leave bos_token to be added by the processor.
|
# to leave bos_token to be added by the processor.
|
||||||
@ -172,7 +172,7 @@ def _test_processing_correctness(
|
|||||||
"Qwen/Qwen2-VL-2B-Instruct",
|
"Qwen/Qwen2-VL-2B-Instruct",
|
||||||
"Qwen/Qwen2.5-VL-3B-Instruct",
|
"Qwen/Qwen2.5-VL-3B-Instruct",
|
||||||
"Qwen/Qwen2-Audio-7B-Instruct",
|
"Qwen/Qwen2-Audio-7B-Instruct",
|
||||||
"fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
"fixie-ai/ultravox-v0_4",
|
||||||
"openai/whisper-large-v3",
|
"openai/whisper-large-v3",
|
||||||
])
|
])
|
||||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||||
|
|||||||
@ -284,7 +284,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
|
|||||||
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
|
"Qwen2VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2-VL-2B-Instruct"), # noqa: E501
|
||||||
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
|
"Qwen2_5_VLForConditionalGeneration": _HfExamplesInfo("Qwen/Qwen2.5-VL-3B-Instruct", # noqa: E501
|
||||||
min_transformers_version="4.49"), # noqa: E501
|
min_transformers_version="4.49"), # noqa: E501
|
||||||
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_5-llama-3_2-1b",
|
"UltravoxModel": _HfExamplesInfo("fixie-ai/ultravox-v0_4",
|
||||||
trust_remote_code=True),
|
trust_remote_code=True),
|
||||||
# [Encoder-decoder]
|
# [Encoder-decoder]
|
||||||
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
"MllamaForConditionalGeneration": _HfExamplesInfo("meta-llama/Llama-3.2-11B-Vision-Instruct"), # noqa: E501
|
||||||
|
|||||||
@ -146,7 +146,8 @@ class UltravoxMultiModalProcessor(
|
|||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
# Text-only input not supported in composite processor
|
# Text-only input not supported in composite processor
|
||||||
if not mm_data or not mm_data.get("audios", []):
|
if not mm_data or not mm_data.get("audios", []):
|
||||||
prompt_ids = self.info.get_tokenizer().encode(prompt)
|
prompt_ids = self.info.get_tokenizer().encode(
|
||||||
|
prompt, add_special_tokens=False)
|
||||||
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
|
prompt_ids = self._apply_hf_processor_tokens_only(prompt_ids)
|
||||||
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
|
return BatchFeature(dict(input_ids=[prompt_ids]), tensor_type="pt")
|
||||||
|
|
||||||
@ -185,16 +186,6 @@ class UltravoxMultiModalProcessor(
|
|||||||
)
|
)
|
||||||
return BatchFeature(combined_outputs)
|
return BatchFeature(combined_outputs)
|
||||||
|
|
||||||
def _apply_hf_processor_tokens_only(
|
|
||||||
self,
|
|
||||||
prompt_tokens: list[int],
|
|
||||||
) -> list[int]:
|
|
||||||
# HF processor omits bos_token_id by setting add_special_tokens=False
|
|
||||||
tokenizer = self.info.get_tokenizer()
|
|
||||||
assert prompt_tokens[0] == tokenizer.bos_token_id
|
|
||||||
|
|
||||||
return prompt_tokens[1:]
|
|
||||||
|
|
||||||
def _get_mm_fields_config(
|
def _get_mm_fields_config(
|
||||||
self,
|
self,
|
||||||
hf_inputs: BatchFeature,
|
hf_inputs: BatchFeature,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user