[Misc] Clean up Qwen2.5-Omni code (#17301)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung 2025-04-28 21:20:45 +08:00 committed by GitHub
parent 889ebb2638
commit 8b464d9660
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 75 additions and 94 deletions

View File

@ -51,11 +51,9 @@ from vllm.model_executor.models.qwen2_audio import (
from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser from vllm.model_executor.models.qwen2_vl import Qwen2VLMultiModalDataParser
from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.hasher import MultiModalHasher
from vllm.multimodal.inputs import (ImageItem, ModalityData, from vllm.multimodal.inputs import (ImageItem, ModalityData,
MultiModalDataDict, MultiModalFieldConfig, MultiModalDataDict, MultiModalFieldConfig,
MultiModalInputs, MultiModalKwargs, MultiModalKwargs, NestedTensors)
NestedTensors)
from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems, from vllm.multimodal.parse import (AudioProcessorItems, DictEmbeddingItems,
ModalityDataItems, MultiModalDataItems, ModalityDataItems, MultiModalDataItems,
MultiModalDataParser) MultiModalDataParser)
@ -279,46 +277,17 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
) -> Mapping[str, MultiModalFieldConfig]: ) -> Mapping[str, MultiModalFieldConfig]:
return _qwen2_5_omni_thinker_field_config(hf_inputs) return _qwen2_5_omni_thinker_field_config(hf_inputs)
def apply( def _maybe_apply_prompt_updates(
self, self,
prompt: Union[str, list[int]], mm_items: MultiModalDataItems,
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False, prompt_ids: list[int],
) -> MultiModalInputs: mm_kwargs: MultiModalKwargs,
is_update_applied: bool,
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
""" """
Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`. Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
""" """
mm_items = self._to_mm_items(mm_data)
# Create MM hashes to be returned (only used in V1)
# TODO: Use these hash keys for caching operations in apply_hf_processor
# instead of rehashing.
if return_mm_hashes:
model_id = self.info.model_id
mm_hashes = {
modality: [
MultiModalHasher.hash_kwargs(model_id=model_id,
**{modality: item},
**hf_processor_mm_kwargs)
for item in items
]
for modality, items in mm_items.items()
}
else:
mm_hashes = None
(
prompt_ids,
mm_kwargs,
is_update_applied,
) = self._cached_apply_hf_processor(
prompt,
mm_items,
hf_processor_mm_kwargs,
)
unbound_prompt_updates = self._get_prompt_updates( unbound_prompt_updates = self._get_prompt_updates(
mm_items, mm_items,
hf_processor_mm_kwargs, hf_processor_mm_kwargs,
@ -364,22 +333,10 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
tokenizer = self.info.get_tokenizer() tokenizer = self.info.get_tokenizer()
prompt = decode_tokens(tokenizer, prompt_ids) prompt = decode_tokens(tokenizer, prompt_ids)
mm_placeholder_ranges = {
modality: [item.to_range() for item in placeholders]
for modality, placeholders in mm_placeholders.items()
}
if use_audio_in_video: if use_audio_in_video:
mm_kwargs["use_audio_in_video"] = True mm_kwargs["use_audio_in_video"] = True
return MultiModalInputs( return prompt_ids, prompt, mm_placeholders
type="multimodal",
prompt=prompt,
prompt_token_ids=prompt_ids,
mm_kwargs=mm_kwargs,
mm_hashes=mm_hashes,
mm_placeholders=mm_placeholder_ranges,
)
def _get_prompt_updates( def _get_prompt_updates(
self, self,

View File

@ -1569,56 +1569,35 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
"model (usually arising from an inconsistency between " "model (usually arising from an inconsistency between "
"`_call_hf_processor` and `_get_prompt_updates`).") "`_call_hf_processor` and `_get_prompt_updates`).")
def apply( def _hash_mm_items(
self, self,
prompt: Union[str, list[int]], mm_items: MultiModalDataItems,
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object], hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False, ) -> dict[str, list[str]]:
) -> MultiModalInputs: """Create MM hashes to be returned (only used in V1)."""
"""
Process multi-modal inputs to be used in vLLM.
The main steps are:
1. Apply HF Processor on prompt text and multi-modal data together,
outputting token IDs and processed tensors.
2. Find and update sequences in the token IDs with placeholder tokens.
The number of placeholder tokens equals the feature size of the
multi-modal data outputted by the multi-modal encoder.
3. Extract information about the placeholder tokens from the
processed token IDs.
"""
mm_items = self._to_mm_items(mm_data)
# Create MM hashes to be returned (only used in V1)
# TODO: Use these hash keys for caching operations in apply_hf_processor # TODO: Use these hash keys for caching operations in apply_hf_processor
# instead of rehashing. # instead of rehashing.
model_id = self.info.model_id
if return_mm_hashes: return {
model_id = self.info.model_id modality: [
mm_hashes = { MultiModalHasher.hash_kwargs(model_id=model_id,
modality: [ **{modality: item},
MultiModalHasher.hash_kwargs(model_id=model_id, **hf_processor_mm_kwargs)
**{modality: item}, for item in items
**hf_processor_mm_kwargs) ]
for item in items for modality, items in mm_items.items()
] }
for modality, items in mm_items.items()
}
else:
mm_hashes = None
(
prompt_ids,
mm_kwargs,
is_update_applied,
) = self._cached_apply_hf_processor(
prompt,
mm_items,
hf_processor_mm_kwargs,
)
def _maybe_apply_prompt_updates(
self,
mm_items: MultiModalDataItems,
hf_processor_mm_kwargs: Mapping[str, object],
prompt_ids: list[int],
mm_kwargs: MultiModalKwargs,
is_update_applied: bool,
) -> tuple[list[int], str, Mapping[str, list[PlaceholderFeaturesInfo]]]:
unbound_prompt_updates = self._get_prompt_updates( unbound_prompt_updates = self._get_prompt_updates(
mm_items, mm_items,
hf_processor_mm_kwargs, hf_processor_mm_kwargs,
@ -1652,6 +1631,51 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
) )
self._validate_mm_placeholders(mm_placeholders, mm_item_counts) self._validate_mm_placeholders(mm_placeholders, mm_item_counts)
return prompt_ids, prompt, mm_placeholders
def apply(
self,
prompt: Union[str, list[int]],
mm_data: MultiModalDataDict,
hf_processor_mm_kwargs: Mapping[str, object],
return_mm_hashes: bool = False,
) -> MultiModalInputs:
"""
Process multi-modal inputs to be used in vLLM.
The main steps are:
1. Apply HF Processor on prompt text and multi-modal data together,
outputting token IDs and processed tensors.
2. Find and update sequences in the token IDs with placeholder tokens.
The number of placeholder tokens equals the feature size of the
multi-modal data outputted by the multi-modal encoder.
3. Extract information about the placeholder tokens from the
processed token IDs.
"""
mm_items = self._to_mm_items(mm_data)
mm_hashes = (self._hash_mm_items(mm_items, hf_processor_mm_kwargs)
if return_mm_hashes else None)
(
prompt_ids,
mm_kwargs,
is_update_applied,
) = self._cached_apply_hf_processor(
prompt,
mm_items,
hf_processor_mm_kwargs,
)
prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
mm_items=mm_items,
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
prompt_ids=prompt_ids,
mm_kwargs=mm_kwargs,
is_update_applied=is_update_applied,
)
mm_placeholder_ranges = { mm_placeholder_ranges = {
modality: [item.to_range() for item in placeholders] modality: [item.to_range() for item in placeholders]
for modality, placeholders in mm_placeholders.items() for modality, placeholders in mm_placeholders.items()