mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-04-24 15:17:04 +08:00
[Core] [Bugfix] [Multimodal] Fix multimodal profiling and generation for SFT/PTQed models (#20058)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
This commit is contained in:
parent
551ef1631a
commit
d8cf819a9a
@ -538,11 +538,13 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
@ -566,6 +568,11 @@ return a schema of the tensors outputted by the HF processor that are related to
|
||||
Our [actual code](gh-file:vllm/model_executor/models/fuyu.py) has special handling
|
||||
for text-only inputs to prevent unnecessary warnings from the HF processor.
|
||||
|
||||
!!! note
|
||||
The `_call_hf_processor` method accepts both `mm_kwargs` and `tok_kwargs` for
|
||||
processing. `mm_kwargs` is used both to initialize and to call the Hugging Face
|
||||
processor, whereas `tok_kwargs` is used only when calling the Hugging Face processor.
|
||||
|
||||
This lets us override [_get_mm_fields_config][vllm.multimodal.processing.BaseMultiModalProcessor._get_mm_fields_config] as follows:
|
||||
|
||||
```python
|
||||
|
||||
@ -1086,6 +1086,7 @@ def test_hf_processor_kwargs(model_id, call_kwargs, expected_kwargs):
|
||||
prompt="",
|
||||
mm_data={},
|
||||
mm_kwargs=call_kwargs,
|
||||
tok_kwargs={},
|
||||
)
|
||||
|
||||
assert out_kwargs == expected_kwargs
|
||||
|
||||
@ -481,6 +481,13 @@ class LLM:
|
||||
# Use default sampling params.
|
||||
sampling_params = self.get_default_sampling_params()
|
||||
|
||||
tokenization_kwargs: dict[str, Any] = {}
|
||||
truncate_prompt_tokens = None
|
||||
if isinstance(sampling_params, SamplingParams):
|
||||
truncate_prompt_tokens = sampling_params.truncate_prompt_tokens
|
||||
_validate_truncation_size(self.llm_engine.model_config.max_model_len,
|
||||
truncate_prompt_tokens, tokenization_kwargs)
|
||||
|
||||
self._validate_and_add_requests(
|
||||
prompts=parsed_prompts,
|
||||
params=sampling_params,
|
||||
@ -488,6 +495,7 @@ class LLM:
|
||||
lora_request=lora_request,
|
||||
prompt_adapter_request=prompt_adapter_request,
|
||||
guided_options=guided_options_request,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
priority=priority,
|
||||
)
|
||||
|
||||
|
||||
@ -171,6 +171,10 @@ def _validate_truncation_size(
|
||||
tokenization_kwargs["truncation"] = True
|
||||
tokenization_kwargs["max_length"] = truncate_prompt_tokens
|
||||
|
||||
else:
|
||||
if tokenization_kwargs is not None:
|
||||
tokenization_kwargs["truncation"] = False
|
||||
|
||||
return truncate_prompt_tokens
|
||||
|
||||
|
||||
|
||||
@ -265,7 +265,8 @@ class InputPreprocessor:
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||
lora_request: Optional[LoRARequest],
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""
|
||||
@ -280,15 +281,19 @@ class InputPreprocessor:
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = {}
|
||||
|
||||
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
|
||||
return_mm_hashes)
|
||||
return mm_processor.apply(prompt,
|
||||
mm_data,
|
||||
hf_processor_mm_kwargs=mm_processor_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes)
|
||||
|
||||
async def _process_multimodal_async(
|
||||
self,
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||
lora_request: Optional[LoRARequest],
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""
|
||||
@ -302,8 +307,11 @@ class InputPreprocessor:
|
||||
if mm_processor_kwargs is None:
|
||||
mm_processor_kwargs = {}
|
||||
|
||||
return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
|
||||
return_mm_hashes)
|
||||
return mm_processor.apply(prompt,
|
||||
mm_data,
|
||||
hf_processor_mm_kwargs=mm_processor_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes)
|
||||
|
||||
def _process_embeds(
|
||||
self,
|
||||
@ -338,6 +346,7 @@ class InputPreprocessor:
|
||||
def _process_tokens(
|
||||
self,
|
||||
parsed_content: TokensPrompt,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> Union[TokenInputs, MultiModalInputs]:
|
||||
@ -350,6 +359,7 @@ class InputPreprocessor:
|
||||
prompt_token_ids,
|
||||
multi_modal_data,
|
||||
parsed_content.get("mm_processor_kwargs"),
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -367,6 +377,7 @@ class InputPreprocessor:
|
||||
async def _process_tokens_async(
|
||||
self,
|
||||
parsed_content: TokensPrompt,
|
||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> Union[TokenInputs, MultiModalInputs]:
|
||||
@ -379,6 +390,7 @@ class InputPreprocessor:
|
||||
prompt_token_ids,
|
||||
multi_modal_data,
|
||||
parsed_content.get("mm_processor_kwargs"),
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -408,6 +420,7 @@ class InputPreprocessor:
|
||||
prompt_text,
|
||||
multi_modal_data,
|
||||
parsed_content.get("mm_processor_kwargs"),
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -442,6 +455,7 @@ class InputPreprocessor:
|
||||
prompt_text,
|
||||
multi_modal_data,
|
||||
parsed_content.get("mm_processor_kwargs"),
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
lora_request=lora_request,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
@ -860,7 +874,8 @@ class InputPreprocessor:
|
||||
"returned until they are supported on vLLM V1.")
|
||||
# Encoder-decoder model requires special mapping of
|
||||
# input prompts to encoder & decoder
|
||||
return self._process_encoder_decoder_prompt(prompt)
|
||||
return self._process_encoder_decoder_prompt(
|
||||
prompt, tokenization_kwargs)
|
||||
|
||||
if is_explicit_encoder_decoder_prompt(prompt):
|
||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||
|
||||
@ -185,11 +185,13 @@ class AyaVisionMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt,
|
||||
mm_data,
|
||||
mm_kwargs,
|
||||
tok_kwargs,
|
||||
)
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
image_processor = hf_processor.image_processor
|
||||
|
||||
@ -454,6 +454,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
# HF processor always adds placeholders even when there's no image
|
||||
@ -465,6 +466,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor[Blip2ProcessingInfo]):
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
def _get_mm_fields_config(
|
||||
|
||||
@ -107,6 +107,7 @@ class ChameleonMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
prompt_ids = self.info.get_tokenizer().encode(prompt)
|
||||
@ -117,6 +118,7 @@ class ChameleonMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
def _apply_hf_processor_tokens_only(
|
||||
|
||||
@ -204,12 +204,13 @@ class DeepseekVL2MultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if mm_data:
|
||||
processed_outputs = self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(prompt=prompt, **mm_data),
|
||||
mm_kwargs,
|
||||
dict(**mm_kwargs, **tok_kwargs),
|
||||
)
|
||||
pixel_values = processed_outputs["pixel_values"]
|
||||
# split pixel values into patches corresponding to each image
|
||||
@ -278,6 +279,7 @@ class DeepseekVL2MultiModalProcessor(
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
@ -290,6 +292,7 @@ class DeepseekVL2MultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
@ -297,6 +300,7 @@ class DeepseekVL2MultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
|
||||
@ -794,6 +794,7 @@ class Florence2MultiModalProcessor(
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
return False
|
||||
|
||||
@ -828,10 +829,11 @@ class Florence2MultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if mm_data:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt, mm_data, mm_kwargs)
|
||||
prompt, mm_data, mm_kwargs, tok_kwargs)
|
||||
else:
|
||||
hf_processor = self.info.get_hf_processor()
|
||||
tokenizer = hf_processor.tokenizer
|
||||
|
||||
@ -153,6 +153,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
# Avoid warning from HF logger for text-only input
|
||||
@ -164,6 +165,7 @@ class FuyuMultiModalProcessor(BaseMultiModalProcessor[FuyuProcessingInfo]):
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
image_patches = processed_outputs.get("image_patches")
|
||||
|
||||
@ -259,11 +259,13 @@ class Gemma3MultiModalProcessor(BaseMultiModalProcessor[Gemma3ProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt,
|
||||
mm_data,
|
||||
mm_kwargs,
|
||||
tok_kwargs,
|
||||
)
|
||||
|
||||
# HF processor pops the `num_crops` kwarg, which is needed by vLLM
|
||||
|
||||
@ -481,6 +481,7 @@ class GLM4VMultiModalProcessor(BaseMultiModalProcessor[GLM4VProcessingInfo]):
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
@ -141,6 +141,7 @@ class GraniteSpeechMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
mm_data = dict(mm_data)
|
||||
audios = mm_data.pop("audios", [])
|
||||
@ -153,6 +154,7 @@ class GraniteSpeechMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
if "audio" in mm_data:
|
||||
|
||||
@ -490,6 +490,7 @@ class H2OVLMultiModalProcessor(
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
@ -502,6 +503,7 @@ class H2OVLMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
@ -509,6 +511,7 @@ class H2OVLMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
|
||||
@ -326,6 +326,7 @@ class Idefics3MultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
# Text-only input not supported in composite processor
|
||||
if not (images := mm_data.get("images", [])):
|
||||
@ -337,6 +338,7 @@ class Idefics3MultiModalProcessor(
|
||||
prompt,
|
||||
mm_data,
|
||||
mm_kwargs,
|
||||
tok_kwargs,
|
||||
)
|
||||
|
||||
parsed_images = (self._get_data_parser().parse_mm_data({
|
||||
|
||||
@ -758,11 +758,13 @@ class BaseInternVLMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
@ -941,9 +943,10 @@ class InternVLMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
processed_outputs = super()._call_hf_processor(prompt, mm_data,
|
||||
mm_kwargs)
|
||||
mm_kwargs, tok_kwargs)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
if self.info.supports_video and (
|
||||
|
||||
@ -296,11 +296,13 @@ class PixtralHFMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
pixel_values = processed_outputs.get("pixel_values")
|
||||
@ -797,6 +799,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
hf_config = self.info.get_hf_config()
|
||||
@ -809,7 +812,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
||||
)
|
||||
|
||||
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||
return_mm_hashes)
|
||||
tokenization_kwargs, return_mm_hashes)
|
||||
|
||||
mm_items = self._to_mm_items(mm_data)
|
||||
mm_item_counts = mm_items.get_all_counts()
|
||||
|
||||
@ -286,6 +286,7 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
mm_data = dict(mm_data)
|
||||
videos = mm_data.pop("videos", [])
|
||||
@ -296,6 +297,7 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
# LLaVA-OneVision processor doesn't support multiple videos
|
||||
@ -310,6 +312,7 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data={},
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
images = mm_data.pop("images", [])
|
||||
@ -319,6 +322,7 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt=image_token * len(images),
|
||||
mm_data={"images": images},
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
image_outputs = {
|
||||
k: v
|
||||
@ -334,6 +338,7 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt=video_token,
|
||||
mm_data={"videos": video},
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
pixel_values_videos.append(item_outputs["pixel_values_videos"][0])
|
||||
@ -352,11 +357,13 @@ class LlavaOnevisionMultiModalProcessor(
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
base_result = super()._hf_processor_applies_updates(
|
||||
prompt_text=prompt_text,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return base_result and mm_items.get_count("video", strict=False) == 0
|
||||
|
||||
@ -260,6 +260,7 @@ class MiniCPMOMultiModalProcessor(
|
||||
self,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
if (audios := mm_data.get("audios")) is None:
|
||||
return {}
|
||||
@ -276,9 +277,9 @@ class MiniCPMOMultiModalProcessor(
|
||||
prompts=[self.info.audio_pattern] * len(parsed_audios),
|
||||
mm_data={"audios": [[audio] for audio in parsed_audios]},
|
||||
mm_kwargs={
|
||||
**mm_kwargs,
|
||||
"chunk_input": True,
|
||||
**mm_kwargs, "chunk_input": True
|
||||
},
|
||||
tok_kwargs=tok_kwargs,
|
||||
out_keys={"audio_features", "audio_feature_lens"},
|
||||
)
|
||||
|
||||
@ -302,10 +303,11 @@ class MiniCPMOMultiModalProcessor(
|
||||
self,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
return {
|
||||
**super().process_mm_inputs(mm_data, mm_kwargs),
|
||||
**self.process_audios(mm_data, mm_kwargs),
|
||||
**super().process_mm_inputs(mm_data, mm_kwargs, tok_kwargs),
|
||||
**self.process_audios(mm_data, mm_kwargs, tok_kwargs),
|
||||
}
|
||||
|
||||
def _get_prompt_updates(
|
||||
|
||||
@ -534,6 +534,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
self,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
if (images := mm_data.get("images")) is None:
|
||||
return {}
|
||||
@ -550,6 +551,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompts=[self.info.image_pattern] * len(parsed_images),
|
||||
mm_data={"images": [[image] for image in parsed_images]},
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
out_keys={"pixel_values", "image_sizes", "tgt_sizes"},
|
||||
)
|
||||
|
||||
@ -563,6 +565,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
self,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
if (videos := mm_data.get("videos")) is None:
|
||||
return {}
|
||||
@ -586,6 +589,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
"max_slice_nums":
|
||||
self.info.get_video_max_slice_num(),
|
||||
},
|
||||
tok_kwargs=tok_kwargs,
|
||||
out_keys={"pixel_values", "image_sizes", "tgt_sizes"},
|
||||
)
|
||||
|
||||
@ -601,10 +605,11 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
self,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
return {
|
||||
**self.process_images(mm_data, mm_kwargs),
|
||||
**self.process_videos(mm_data, mm_kwargs),
|
||||
**self.process_images(mm_data, mm_kwargs, tok_kwargs),
|
||||
**self.process_videos(mm_data, mm_kwargs, tok_kwargs),
|
||||
}
|
||||
|
||||
def _base_call_hf_processor(
|
||||
@ -612,6 +617,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompts: list[str],
|
||||
mm_data: Mapping[str, Sequence[object]],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
*,
|
||||
out_keys: set[str],
|
||||
) -> dict[str, NestedTensors]:
|
||||
@ -621,6 +627,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt=prompts, # type: ignore
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
else:
|
||||
inputs = defaultdict[str, list[torch.Tensor]](list)
|
||||
@ -633,6 +640,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
for k, v in mm_data.items()
|
||||
},
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
for k, v in inputs_one.items():
|
||||
@ -646,11 +654,12 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
input_ids = torch.tensor([tokenizer.encode(prompt)])
|
||||
mm_inputs = self.process_mm_inputs(mm_data, mm_kwargs)
|
||||
input_ids = torch.tensor([tokenizer.encode(prompt, **tok_kwargs)])
|
||||
mm_inputs = self.process_mm_inputs(mm_data, mm_kwargs, tok_kwargs)
|
||||
|
||||
return BatchFeature({
|
||||
"input_ids": input_ids,
|
||||
@ -662,6 +671,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
@ -113,11 +113,13 @@ class MiniMaxVL01MultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
pixel_values = processed_outputs.get("pixel_values")
|
||||
|
||||
@ -228,11 +228,13 @@ class Mistral3MultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
pixel_values = processed_outputs.get("pixel_values")
|
||||
|
||||
@ -166,10 +166,11 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalEncDecInputs:
|
||||
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||
return_mm_hashes)
|
||||
tokenization_kwargs, return_mm_hashes)
|
||||
|
||||
image_token_id = self.info.get_hf_config().image_token_index
|
||||
# Check that the number of image tokens in the decoder prompt matches
|
||||
@ -239,6 +240,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
if mm_data:
|
||||
@ -247,7 +249,7 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
||||
for img in mm_data["images"]
|
||||
]
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt, mm_data, mm_kwargs)
|
||||
prompt, mm_data, mm_kwargs, tok_kwargs)
|
||||
processed_outputs["num_tiles"] = torch.tensor(num_tiles)
|
||||
for k in ('pixel_values', 'aspect_ratio_ids', "aspect_ratio_mask"):
|
||||
processed_outputs[k] = processed_outputs[k].squeeze(0)
|
||||
|
||||
@ -574,6 +574,7 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
@ -583,6 +584,7 @@ class Mllama4MultiModalProcessor(BaseMultiModalProcessor[Mllama4ProcessingInfo]
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
|
||||
@ -335,6 +335,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
# Avoid warning from HF logger for text-only input
|
||||
@ -346,6 +347,7 @@ class OvisMultiModalProcessor(BaseMultiModalProcessor[OvisProcessingInfo]):
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor()
|
||||
|
||||
@ -121,6 +121,7 @@ class PaliGemmaMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
if not mm_data:
|
||||
@ -131,6 +132,7 @@ class PaliGemmaMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
def _get_mm_fields_config(
|
||||
@ -191,10 +193,11 @@ class PaliGemmaMultiModalProcessor(
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||
return_mm_hashes)
|
||||
tokenization_kwargs, return_mm_hashes)
|
||||
prompt_token_ids = mm_inputs["prompt_token_ids"]
|
||||
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
|
||||
@ -376,11 +376,13 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
input_ids = processed_outputs["input_ids"]
|
||||
|
||||
@ -762,6 +762,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if not mm_data:
|
||||
prompt_ids = self.info.get_tokenizer().encode(prompt)
|
||||
@ -773,7 +774,7 @@ class Phi4MMMultiModalProcessor(BaseMultiModalProcessor[Phi4MMProcessingInfo]):
|
||||
mm_data['audios'] = [(data, sr) for data in audio_data]
|
||||
|
||||
processed_outputs = super()._call_hf_processor(prompt, mm_data,
|
||||
mm_kwargs)
|
||||
mm_kwargs, tok_kwargs)
|
||||
|
||||
num_img_tokens = [
|
||||
self.info.get_num_image_tokens(image_width=img_size[0],
|
||||
|
||||
@ -237,6 +237,7 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
dummy_text = self.get_dummy_text(mm_counts)
|
||||
dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
|
||||
dummy_images = dummy_mm_data.get("image", [])
|
||||
tokenization_kwargs = {"truncation": False}
|
||||
|
||||
request = ChatCompletionRequest(messages=[
|
||||
UserMessage(content=[
|
||||
@ -247,7 +248,9 @@ class PixtralDummyInputsBuilder(BaseDummyInputsBuilder[PixtralProcessingInfo]):
|
||||
res = tokenizer.mistral.encode_chat_completion(request)
|
||||
dummy_tokens = res.tokens
|
||||
|
||||
return ProcessorInputs(prompt=dummy_tokens, mm_data=dummy_mm_data)
|
||||
return ProcessorInputs(prompt=dummy_tokens,
|
||||
mm_data=dummy_mm_data,
|
||||
tokenization_kwargs=tokenization_kwargs)
|
||||
|
||||
|
||||
class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||
@ -297,6 +300,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
@ -309,6 +313,7 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
|
||||
@ -92,6 +92,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
mm_kwargs = {}
|
||||
|
||||
@ -244,6 +244,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
mm_data = dict(mm_data)
|
||||
audios = mm_data.pop("audios", [])
|
||||
@ -258,6 +259,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
input_features = hf_inputs.pop('input_features', None)
|
||||
@ -453,6 +455,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
prompt: Union[str, list[int]],
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
enable_hf_prompt_update: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, bool]:
|
||||
@ -465,6 +468,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
prompt_text=prompt,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
tokenizer = self.info.get_tokenizer()
|
||||
prompt_ids = encode_tokens(tokenizer, prompt)
|
||||
@ -474,6 +478,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
mm_kwargs = self._apply_hf_processor_mm_only(
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return prompt_ids, mm_kwargs, False
|
||||
@ -482,6 +487,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> MultiModalKwargs:
|
||||
"""
|
||||
Qwen2.5-Omni reimplements this function to handle `use_audio_in_video`.
|
||||
@ -498,6 +504,7 @@ class Qwen2_5OmniThinkerMultiModalProcessor(
|
||||
prompt_text=self.dummy_inputs.get_dummy_text(mm_counts),
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return mm_kwargs
|
||||
|
||||
@ -150,6 +150,7 @@ class Qwen2AudioMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, Any],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
# NOTE - we rename audios -> audio in mm data because transformers has
|
||||
# deprecated audios for the qwen2audio processor and will remove
|
||||
@ -174,6 +175,7 @@ class Qwen2AudioMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
def _get_mm_fields_config(
|
||||
|
||||
@ -1027,11 +1027,13 @@ class Qwen2VLMultiModalProcessor(BaseMultiModalProcessor[Qwen2VLProcessingInfo]
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
mm_kwargs = self.info._get_image_processor_kwargs(**mm_kwargs)
|
||||
return self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(text=prompt, **mm_data),
|
||||
self.info._get_image_processor_kwargs(**mm_kwargs),
|
||||
dict(**mm_kwargs, **tok_kwargs),
|
||||
)
|
||||
|
||||
def _get_prompt_updates(
|
||||
|
||||
@ -580,6 +580,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
# Drops anything between <img>/</img> tags; encoding with the tokenizer
|
||||
# will automatically add the image pads for the context.
|
||||
@ -600,6 +601,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
@ -607,6 +609,7 @@ class QwenVLMultiModalProcessor(BaseMultiModalProcessor[QwenVLProcessingInfo]):
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
@ -534,11 +534,13 @@ class SkyworkR1VMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> Mapping[str, NestedTensors]:
|
||||
processed_outputs = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = self.info.get_hf_processor(**mm_kwargs)
|
||||
|
||||
@ -144,6 +144,7 @@ class UltravoxMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
# Text-only input not supported in composite processor
|
||||
if not mm_data.get("audios", []):
|
||||
@ -165,10 +166,15 @@ class UltravoxMultiModalProcessor(
|
||||
|
||||
item_processor_data = dict(**mm_data, audios=audios)
|
||||
|
||||
# some tokenizer kwargs are incompatible with UltravoxProcessor
|
||||
tok_kwargs.pop("padding", None)
|
||||
tok_kwargs.pop("truncation", None)
|
||||
|
||||
output = super()._call_hf_processor(
|
||||
prompt=prompt,
|
||||
mm_data=item_processor_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
output['audio_features'] = output.pop('audio_values')
|
||||
|
||||
|
||||
@ -700,9 +700,10 @@ class WhisperMultiModalProcessor(
|
||||
prompt: str,
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> BatchFeature:
|
||||
if mm_data:
|
||||
feature_extractor = self.info.get_feature_extractor(**mm_kwargs)
|
||||
feature_extractor = self.info.get_feature_extractor()
|
||||
mm_data = dict(audio=mm_data.pop("audios"))
|
||||
mm_kwargs = dict(
|
||||
**mm_kwargs,
|
||||
@ -712,6 +713,7 @@ class WhisperMultiModalProcessor(
|
||||
prompt=prompt,
|
||||
mm_data=mm_data,
|
||||
mm_kwargs=mm_kwargs,
|
||||
tok_kwargs=tok_kwargs,
|
||||
)
|
||||
if "labels" in processed_outputs:
|
||||
processed_outputs["input_ids"] = processed_outputs.pop("labels")
|
||||
|
||||
@ -1267,6 +1267,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
# This refers to the data to be passed to HF processor.
|
||||
mm_data: Mapping[str, object],
|
||||
mm_kwargs: Mapping[str, object],
|
||||
tok_kwargs: Mapping[str, object],
|
||||
) -> "BatchFeature":
|
||||
"""
|
||||
Call the HF processor on the prompt text and
|
||||
@ -1275,7 +1276,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
return self.info.ctx.call_hf_processor(
|
||||
self.info.get_hf_processor(**mm_kwargs),
|
||||
dict(text=prompt, **mm_data),
|
||||
mm_kwargs,
|
||||
dict(**mm_kwargs, **tok_kwargs),
|
||||
)
|
||||
|
||||
def _hf_processor_applies_updates(
|
||||
@ -1283,6 +1284,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> bool:
|
||||
"""
|
||||
Return whether the HF processor applies prompt updates.
|
||||
@ -1300,6 +1302,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text: str,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> tuple[list[int], MultiModalKwargs, bool]:
|
||||
"""
|
||||
Apply the HF processor on the prompt text and multi-modal data
|
||||
@ -1313,6 +1316,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt=prompt_text,
|
||||
mm_data=processor_data,
|
||||
mm_kwargs=hf_processor_mm_kwargs,
|
||||
tok_kwargs=tokenization_kwargs,
|
||||
)
|
||||
processed_data.update(passthrough_data)
|
||||
|
||||
@ -1327,11 +1331,14 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text=prompt_text,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return prompt_ids, mm_kwargs, is_update_applied
|
||||
|
||||
def _apply_hf_processor_text_only(self, prompt_text: str) -> list[int]:
|
||||
def _apply_hf_processor_text_only(
|
||||
self, prompt_text: str,
|
||||
tokenization_kwargs: Mapping[str, object]) -> list[int]:
|
||||
"""
|
||||
Apply the HF processor on the prompt text only.
|
||||
|
||||
@ -1343,6 +1350,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text=prompt_text,
|
||||
mm_items=MultiModalDataItems({}),
|
||||
hf_processor_mm_kwargs={},
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return prompt_ids
|
||||
@ -1368,6 +1376,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> MultiModalKwargs:
|
||||
"""
|
||||
Apply the HF processor on the multi-modal data only.
|
||||
@ -1383,6 +1392,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text=self.dummy_inputs.get_dummy_text(mm_counts),
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return mm_kwargs
|
||||
@ -1392,6 +1402,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
enable_hf_prompt_update: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, bool]:
|
||||
@ -1412,15 +1423,18 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt_text=prompt,
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
prompt_ids = self._apply_hf_processor_text_only(prompt)
|
||||
prompt_ids = self._apply_hf_processor_text_only(
|
||||
prompt, tokenization_kwargs)
|
||||
else:
|
||||
prompt_ids = self._apply_hf_processor_tokens_only(prompt)
|
||||
|
||||
mm_kwargs = self._apply_hf_processor_mm_only(
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
return prompt_ids, mm_kwargs, False
|
||||
@ -1430,14 +1444,17 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
cache: ProcessingCache,
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
) -> tuple[dict[str, list[ProcessingCacheOptionalItem]], dict[
|
||||
str, list[object]]]:
|
||||
model_id = self.info.model_id
|
||||
|
||||
mm_cache_items = {
|
||||
modality: [
|
||||
cache.get_item(model_id, modality, item,
|
||||
hf_processor_mm_kwargs) for item in items
|
||||
cache.get_item(
|
||||
model_id, modality, item,
|
||||
dict(**hf_processor_mm_kwargs, **tokenization_kwargs))
|
||||
for item in items
|
||||
]
|
||||
for modality, items in mm_data_items.items()
|
||||
}
|
||||
@ -1457,10 +1474,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
return mm_cache_items, mm_missing_data
|
||||
|
||||
def _hash_mm_items(
|
||||
self,
|
||||
mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
) -> MultiModalHashes:
|
||||
self, mm_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object]) -> MultiModalHashes:
|
||||
"""Create MM hashes to be returned (only used in V1)."""
|
||||
model_id = self.info.model_id
|
||||
|
||||
@ -1468,7 +1484,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
modality: [
|
||||
MultiModalHasher.hash_kwargs(model_id=model_id,
|
||||
**{modality: item},
|
||||
**hf_processor_mm_kwargs)
|
||||
**hf_processor_mm_kwargs,
|
||||
**tokenization_kwargs)
|
||||
for item in items
|
||||
]
|
||||
for modality, items in mm_items.items()
|
||||
@ -1513,6 +1530,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
@ -1524,10 +1542,12 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt=prompt,
|
||||
mm_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
enable_hf_prompt_update=True,
|
||||
)
|
||||
|
||||
mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs)
|
||||
mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
|
||||
tokenization_kwargs)
|
||||
if return_mm_hashes else None)
|
||||
|
||||
return prompt_ids, mm_kwargs, mm_hashes, is_update_applied
|
||||
@ -1537,6 +1557,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data_items: MultiModalDataItems,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Mapping[str, object],
|
||||
*,
|
||||
return_mm_hashes: bool,
|
||||
) -> tuple[list[int], MultiModalKwargs, Optional[MultiModalHashes], bool]:
|
||||
@ -1552,6 +1573,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt=prompt,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
@ -1562,6 +1584,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
cache=cache,
|
||||
mm_data_items=mm_data_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
)
|
||||
|
||||
# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
|
||||
@ -1575,6 +1598,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt=prompt,
|
||||
mm_items=self._to_mm_items(mm_missing_data),
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
enable_hf_prompt_update=False,
|
||||
)
|
||||
|
||||
@ -1783,6 +1807,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalInputs:
|
||||
"""
|
||||
@ -1800,6 +1825,9 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
"""
|
||||
mm_items = self._to_mm_items(mm_data)
|
||||
|
||||
if tokenization_kwargs is None:
|
||||
tokenization_kwargs = {}
|
||||
|
||||
(
|
||||
prompt_ids,
|
||||
mm_kwargs,
|
||||
@ -1809,9 +1837,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
||||
prompt,
|
||||
mm_items,
|
||||
hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=tokenization_kwargs,
|
||||
return_mm_hashes=return_mm_hashes,
|
||||
)
|
||||
|
||||
# NOTE: tokenization_kwargs are not required to init processor
|
||||
prompt_ids, prompt, mm_placeholders = self._maybe_apply_prompt_updates(
|
||||
mm_items=mm_items,
|
||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||
@ -1892,6 +1922,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
prompt: Union[str, list[int]],
|
||||
mm_data: MultiModalDataDict,
|
||||
hf_processor_mm_kwargs: Mapping[str, object],
|
||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||
return_mm_hashes: bool = False,
|
||||
) -> MultiModalEncDecInputs:
|
||||
"""
|
||||
@ -1906,6 +1937,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
||||
encoder_prompt,
|
||||
mm_data,
|
||||
hf_processor_mm_kwargs,
|
||||
tokenization_kwargs,
|
||||
return_mm_hashes,
|
||||
)
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ class ProcessorInputs:
|
||||
prompt: Union[str, list[int]]
|
||||
mm_data: MultiModalDataDict
|
||||
hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict)
|
||||
tokenization_kwargs: Mapping[str, object] = field(default_factory=dict)
|
||||
|
||||
|
||||
class DummyEncoderData(NamedTuple):
|
||||
@ -90,8 +91,11 @@ class BaseDummyInputsBuilder(ABC, Generic[_I]):
|
||||
"""
|
||||
dummy_text = self.get_dummy_text(mm_counts)
|
||||
dummy_mm_data = self.get_dummy_mm_data(seq_len, mm_counts)
|
||||
tokenization_kwargs = {"truncation": False}
|
||||
|
||||
return ProcessorInputs(prompt=dummy_text, mm_data=dummy_mm_data)
|
||||
return ProcessorInputs(prompt=dummy_text,
|
||||
mm_data=dummy_mm_data,
|
||||
tokenization_kwargs=tokenization_kwargs)
|
||||
|
||||
def _get_dummy_audios(
|
||||
self,
|
||||
@ -170,6 +174,7 @@ class MultiModalProfiler(Generic[_I]):
|
||||
prompt=processor_inputs.prompt,
|
||||
mm_data=processor_inputs.mm_data,
|
||||
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=processor_inputs.tokenization_kwargs,
|
||||
)
|
||||
|
||||
def _get_mm_num_tokens(
|
||||
|
||||
@ -1729,6 +1729,7 @@ def supports_kw(
|
||||
last_param = params[next(reversed(params))] # type: ignore
|
||||
return (last_param.kind == inspect.Parameter.VAR_KEYWORD
|
||||
and last_param.name != kw_name)
|
||||
|
||||
return False
|
||||
|
||||
|
||||
@ -1771,6 +1772,7 @@ def resolve_mm_processor_kwargs(
|
||||
# Merge the final processor kwargs, prioritizing inference
|
||||
# time values over the initialization time values.
|
||||
mm_processor_kwargs = {**init_mm_kwargs, **runtime_mm_kwargs}
|
||||
|
||||
return mm_processor_kwargs
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user