mirror of
https://git.datalinker.icu/vllm-project/vllm.git
synced 2026-06-04 03:29:09 +08:00
[Multimodal] Always enable hashing mm data (#23308)
Signed-off-by: Roger Wang <hey@rogerw.io>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
parent
f8daddcc4c
commit
79f05e4436
@ -1685,15 +1685,6 @@ class ModelConfig:
|
|||||||
def is_multimodal_model(self) -> bool:
|
def is_multimodal_model(self) -> bool:
|
||||||
return self.multimodal_config is not None
|
return self.multimodal_config is not None
|
||||||
|
|
||||||
@property
|
|
||||||
def processor_return_mm_hashes(self) -> bool:
|
|
||||||
"""Whether the multi-modal processor should output hashes."""
|
|
||||||
mm_config = self.multimodal_config
|
|
||||||
if mm_config is None:
|
|
||||||
return False
|
|
||||||
|
|
||||||
return mm_config.mm_processor_cache_gb > 0
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def enable_mm_processor_cache(self) -> bool:
|
def enable_mm_processor_cache(self) -> bool:
|
||||||
"""Whether the multi-modal processor cache should be enabled."""
|
"""Whether the multi-modal processor cache should be enabled."""
|
||||||
|
|||||||
@ -254,7 +254,6 @@ class InputPreprocessor:
|
|||||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
"""
|
"""
|
||||||
Apply the model's multi-modal processor to a multi-modal prompt,
|
Apply the model's multi-modal processor to a multi-modal prompt,
|
||||||
@ -271,8 +270,7 @@ class InputPreprocessor:
|
|||||||
return mm_processor.apply(prompt,
|
return mm_processor.apply(prompt,
|
||||||
mm_data,
|
mm_data,
|
||||||
hf_processor_mm_kwargs=mm_processor_kwargs,
|
hf_processor_mm_kwargs=mm_processor_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs)
|
||||||
return_mm_hashes=return_mm_hashes)
|
|
||||||
|
|
||||||
async def _process_multimodal_async(
|
async def _process_multimodal_async(
|
||||||
self,
|
self,
|
||||||
@ -281,7 +279,6 @@ class InputPreprocessor:
|
|||||||
mm_processor_kwargs: Optional[Mapping[str, object]],
|
mm_processor_kwargs: Optional[Mapping[str, object]],
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
"""
|
"""
|
||||||
Async version of
|
Async version of
|
||||||
@ -297,8 +294,7 @@ class InputPreprocessor:
|
|||||||
return mm_processor.apply(prompt,
|
return mm_processor.apply(prompt,
|
||||||
mm_data,
|
mm_data,
|
||||||
hf_processor_mm_kwargs=mm_processor_kwargs,
|
hf_processor_mm_kwargs=mm_processor_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs)
|
||||||
return_mm_hashes=return_mm_hashes)
|
|
||||||
|
|
||||||
def _process_embeds(
|
def _process_embeds(
|
||||||
self,
|
self,
|
||||||
@ -335,7 +331,6 @@ class InputPreprocessor:
|
|||||||
parsed_content: TokensPrompt,
|
parsed_content: TokensPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
prompt_token_ids = parsed_content["prompt_token_ids"]
|
prompt_token_ids = parsed_content["prompt_token_ids"]
|
||||||
token_type_ids = parsed_content.get("token_type_ids")
|
token_type_ids = parsed_content.get("token_type_ids")
|
||||||
@ -348,7 +343,6 @@ class InputPreprocessor:
|
|||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
inputs = token_inputs(
|
inputs = token_inputs(
|
||||||
@ -366,7 +360,6 @@ class InputPreprocessor:
|
|||||||
parsed_content: TokensPrompt,
|
parsed_content: TokensPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
prompt_token_ids = parsed_content["prompt_token_ids"]
|
prompt_token_ids = parsed_content["prompt_token_ids"]
|
||||||
token_type_ids = parsed_content.get("token_type_ids")
|
token_type_ids = parsed_content.get("token_type_ids")
|
||||||
@ -379,7 +372,6 @@ class InputPreprocessor:
|
|||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
inputs = token_inputs(
|
inputs = token_inputs(
|
||||||
@ -397,7 +389,6 @@ class InputPreprocessor:
|
|||||||
parsed_content: TextPrompt,
|
parsed_content: TextPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
prompt_text = parsed_content["prompt"]
|
prompt_text = parsed_content["prompt"]
|
||||||
|
|
||||||
@ -409,7 +400,6 @@ class InputPreprocessor:
|
|||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt_token_ids = self._tokenize_prompt(
|
prompt_token_ids = self._tokenize_prompt(
|
||||||
@ -432,7 +422,6 @@ class InputPreprocessor:
|
|||||||
parsed_content: TextPrompt,
|
parsed_content: TextPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> Union[TokenInputs, MultiModalInputs]:
|
) -> Union[TokenInputs, MultiModalInputs]:
|
||||||
prompt_text = parsed_content["prompt"]
|
prompt_text = parsed_content["prompt"]
|
||||||
|
|
||||||
@ -444,7 +433,6 @@ class InputPreprocessor:
|
|||||||
parsed_content.get("mm_processor_kwargs"),
|
parsed_content.get("mm_processor_kwargs"),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
prompt_token_ids = await self._tokenize_prompt_async(
|
prompt_token_ids = await self._tokenize_prompt_async(
|
||||||
@ -467,7 +455,6 @@ class InputPreprocessor:
|
|||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
"""
|
"""
|
||||||
Extract the singleton inputs from a prompt.
|
Extract the singleton inputs from a prompt.
|
||||||
@ -476,7 +463,6 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
* prompt: single encoder or decoder input prompt
|
* prompt: single encoder or decoder input prompt
|
||||||
* lora_request: this is only valid for decoder prompts
|
* lora_request: this is only valid for decoder prompts
|
||||||
* return_mm_hashes: whether to return multimodal hashes
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
@ -490,21 +476,18 @@ class InputPreprocessor:
|
|||||||
return self._process_tokens(
|
return self._process_tokens(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
if parsed["type"] == "text":
|
if parsed["type"] == "text":
|
||||||
return self._process_text(
|
return self._process_text(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
if parsed["type"] == "str":
|
if parsed["type"] == "str":
|
||||||
return self._process_text(
|
return self._process_text(
|
||||||
TextPrompt(prompt=parsed["content"]),
|
TextPrompt(prompt=parsed["content"]),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert_never(parsed)
|
assert_never(parsed)
|
||||||
@ -514,7 +497,6 @@ class InputPreprocessor:
|
|||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> SingletonInputs:
|
) -> SingletonInputs:
|
||||||
"""
|
"""
|
||||||
Async version of
|
Async version of
|
||||||
@ -528,21 +510,18 @@ class InputPreprocessor:
|
|||||||
return await self._process_tokens_async(
|
return await self._process_tokens_async(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
if parsed["type"] == "text":
|
if parsed["type"] == "text":
|
||||||
return await self._process_text_async(
|
return await self._process_text_async(
|
||||||
parsed["content"],
|
parsed["content"],
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
if parsed["type"] == "str":
|
if parsed["type"] == "str":
|
||||||
return await self._process_text_async(
|
return await self._process_text_async(
|
||||||
TextPrompt(prompt=parsed["content"]),
|
TextPrompt(prompt=parsed["content"]),
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert_never(parsed)
|
assert_never(parsed)
|
||||||
@ -785,7 +764,6 @@ class InputPreprocessor:
|
|||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
"""
|
"""
|
||||||
For decoder-only models:
|
For decoder-only models:
|
||||||
@ -796,7 +774,6 @@ class InputPreprocessor:
|
|||||||
|
|
||||||
* prompt: input prompt
|
* prompt: input prompt
|
||||||
* lora_request
|
* lora_request
|
||||||
* return_mm_hashes
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
@ -807,7 +784,6 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return self._build_decoder_only_llm_inputs(prompt_comps)
|
return self._build_decoder_only_llm_inputs(prompt_comps)
|
||||||
@ -817,7 +793,6 @@ class InputPreprocessor:
|
|||||||
prompt: SingletonPrompt,
|
prompt: SingletonPrompt,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> DecoderOnlyInputs:
|
) -> DecoderOnlyInputs:
|
||||||
"""
|
"""
|
||||||
Async version of
|
Async version of
|
||||||
@ -827,7 +802,6 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return self._build_decoder_only_llm_inputs(prompt_comps)
|
return self._build_decoder_only_llm_inputs(prompt_comps)
|
||||||
@ -837,17 +811,15 @@ class InputPreprocessor:
|
|||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> ProcessorInputs:
|
) -> ProcessorInputs:
|
||||||
"""Preprocess the input prompt."""
|
"""Preprocess the input prompt."""
|
||||||
if self.model_config.is_encoder_decoder:
|
if self.model_config.is_encoder_decoder:
|
||||||
assert not return_mm_hashes, (
|
|
||||||
"Multimodal hashes for encoder-decoder models should not be ",
|
|
||||||
"returned until they are supported on vLLM V1.")
|
|
||||||
# Encoder-decoder model requires special mapping of
|
# Encoder-decoder model requires special mapping of
|
||||||
# input prompts to encoder & decoder
|
# input prompts to encoder & decoder.
|
||||||
return self._process_encoder_decoder_prompt(
|
return self._process_encoder_decoder_prompt(
|
||||||
prompt, tokenization_kwargs)
|
prompt,
|
||||||
|
tokenization_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
if is_explicit_encoder_decoder_prompt(prompt):
|
if is_explicit_encoder_decoder_prompt(prompt):
|
||||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||||
@ -858,7 +830,6 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async def preprocess_async(
|
async def preprocess_async(
|
||||||
@ -866,19 +837,18 @@ class InputPreprocessor:
|
|||||||
prompt: PromptType,
|
prompt: PromptType,
|
||||||
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
tokenization_kwargs: Optional[dict[str, Any]] = None,
|
||||||
lora_request: Optional[LoRARequest] = None,
|
lora_request: Optional[LoRARequest] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> ProcessorInputs:
|
) -> ProcessorInputs:
|
||||||
"""
|
"""
|
||||||
Async version of
|
Async version of
|
||||||
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
|
[`preprocess`][vllm.inputs.preprocess.InputPreprocessor.preprocess].
|
||||||
"""
|
"""
|
||||||
if self.model_config.is_encoder_decoder:
|
if self.model_config.is_encoder_decoder:
|
||||||
assert not return_mm_hashes, (
|
|
||||||
"Multimodal hashes for encoder-decoder models should not be ",
|
|
||||||
"returned until they are supported on vLLM V1.")
|
|
||||||
# Encoder-decoder model requires special mapping of
|
# Encoder-decoder model requires special mapping of
|
||||||
# input prompts to encoder & decoder
|
# input prompts to encoder & decoder.
|
||||||
return await self._process_encoder_decoder_prompt_async(prompt)
|
return await self._process_encoder_decoder_prompt_async(
|
||||||
|
prompt,
|
||||||
|
tokenization_kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
if is_explicit_encoder_decoder_prompt(prompt):
|
if is_explicit_encoder_decoder_prompt(prompt):
|
||||||
raise ValueError("Cannot pass encoder-decoder prompt "
|
raise ValueError("Cannot pass encoder-decoder prompt "
|
||||||
@ -889,5 +859,4 @@ class InputPreprocessor:
|
|||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|||||||
@ -290,8 +290,6 @@ class DeepseekVL2MultiModalProcessor(
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
# The processor logic is different for len(images) <= 2 vs > 2
|
# The processor logic is different for len(images) <= 2 vs > 2
|
||||||
# Since the processing cache assumes that the processor output is
|
# Since the processing cache assumes that the processor output is
|
||||||
@ -303,7 +301,6 @@ class DeepseekVL2MultiModalProcessor(
|
|||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return super()._cached_apply_hf_processor(
|
return super()._cached_apply_hf_processor(
|
||||||
@ -311,7 +308,6 @@ class DeepseekVL2MultiModalProcessor(
|
|||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -479,8 +479,6 @@ class H2OVLMultiModalProcessor(
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
# The processor logic is different for len(images) <= 1 vs > 1
|
# The processor logic is different for len(images) <= 1 vs > 1
|
||||||
# Since the processing cache assumes that the processor output is
|
# Since the processing cache assumes that the processor output is
|
||||||
@ -492,7 +490,6 @@ class H2OVLMultiModalProcessor(
|
|||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return super()._cached_apply_hf_processor(
|
return super()._cached_apply_hf_processor(
|
||||||
@ -500,7 +497,6 @@ class H2OVLMultiModalProcessor(
|
|||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -795,7 +795,6 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
image_token_id = hf_config.image_token_index
|
image_token_id = hf_config.image_token_index
|
||||||
@ -807,7 +806,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
|
|||||||
)
|
)
|
||||||
|
|
||||||
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs, return_mm_hashes)
|
tokenization_kwargs)
|
||||||
|
|
||||||
mm_items = self._to_mm_items(mm_data)
|
mm_items = self._to_mm_items(mm_data)
|
||||||
mm_item_counts = mm_items.get_all_counts()
|
mm_item_counts = mm_items.get_all_counts()
|
||||||
|
|||||||
@ -168,10 +168,9 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalEncDecInputs:
|
) -> MultiModalEncDecInputs:
|
||||||
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs, return_mm_hashes)
|
tokenization_kwargs)
|
||||||
|
|
||||||
image_token_id = self.info.get_hf_config().image_token_index
|
image_token_id = self.info.get_hf_config().image_token_index
|
||||||
# Check that the number of image tokens in the decoder prompt matches
|
# Check that the number of image tokens in the decoder prompt matches
|
||||||
|
|||||||
@ -194,10 +194,9 @@ class PaliGemmaMultiModalProcessor(
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs, return_mm_hashes)
|
tokenization_kwargs)
|
||||||
prompt_token_ids = mm_inputs["prompt_token_ids"]
|
prompt_token_ids = mm_inputs["prompt_token_ids"]
|
||||||
|
|
||||||
tokenizer = self.info.get_tokenizer()
|
tokenizer = self.info.get_tokenizer()
|
||||||
|
|||||||
@ -308,15 +308,12 @@ class PixtralMultiModalProcessor(BaseMultiModalProcessor[PixtralProcessingInfo]
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: The tokens are already inserted by the chat template
|
# NOTE: The tokens are already inserted by the chat template
|
||||||
|
|||||||
@ -18,7 +18,7 @@
|
|||||||
"""Inference-only IBM/NASA Prithvi Geospatial model."""
|
"""Inference-only IBM/NASA Prithvi Geospatial model."""
|
||||||
|
|
||||||
from collections.abc import Iterable, Mapping, Sequence
|
from collections.abc import Iterable, Mapping, Sequence
|
||||||
from typing import Optional, Union
|
from typing import Any, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
@ -32,18 +32,56 @@ from vllm.model_executor.models.interfaces import (
|
|||||||
default_pooling_type)
|
default_pooling_type)
|
||||||
from vllm.model_executor.models.utils import AutoWeightsLoader
|
from vllm.model_executor.models.utils import AutoWeightsLoader
|
||||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||||
from vllm.multimodal.inputs import (MultiModalDataDict, MultiModalFieldConfig,
|
from vllm.multimodal.inputs import (ImageItem, ModalityData,
|
||||||
MultiModalFieldElem, MultiModalInputs,
|
MultiModalDataDict, MultiModalFieldConfig,
|
||||||
MultiModalKwargsItem,
|
MultiModalInputs, MultiModalKwargsItems,
|
||||||
MultiModalKwargsItems,
|
PlaceholderRange)
|
||||||
MultiModalSharedField, PlaceholderRange)
|
from vllm.multimodal.parse import (DictEmbeddingItems, ModalityDataItems,
|
||||||
from vllm.multimodal.parse import MultiModalDataItems
|
MultiModalDataItems, MultiModalDataParser)
|
||||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||||
BaseProcessingInfo, PromptUpdate)
|
BaseProcessingInfo, PromptUpdate)
|
||||||
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
from vllm.multimodal.profiling import BaseDummyInputsBuilder
|
||||||
from vllm.sequence import IntermediateTensors
|
from vllm.sequence import IntermediateTensors
|
||||||
|
|
||||||
|
|
||||||
|
def _prithvi_field_config(hf_inputs: Mapping[str, torch.Tensor]):
|
||||||
|
# This model receives in input a multi-dimensional tensor representing
|
||||||
|
# a single image patch and therefore it is not to be split
|
||||||
|
# into multiple elements, but rather to be considered a single one.
|
||||||
|
# Hence, the decision of using a MultiModalSharedField.
|
||||||
|
# The expected shape is (num_channels, width, height).
|
||||||
|
|
||||||
|
# This model however allows the user to also submit multiple image
|
||||||
|
# patches as a batch, adding a further dimension to the above shape.
|
||||||
|
# At this stage we only support submitting one patch per request and
|
||||||
|
# batching is achieved via vLLM batching.
|
||||||
|
# TODO (christian-pinto): enable support for multi patch requests
|
||||||
|
# in tandem with vLLM batching.
|
||||||
|
return dict(
|
||||||
|
pixel_values=MultiModalFieldConfig.shared(batch_size=1,
|
||||||
|
modality="image"),
|
||||||
|
location_coords=MultiModalFieldConfig.shared(batch_size=1,
|
||||||
|
modality="image"),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PrithviGeoSpatialMAEMultiModalDataParser(MultiModalDataParser):
|
||||||
|
|
||||||
|
def _parse_image_data(
|
||||||
|
self,
|
||||||
|
data: Union[dict[str, torch.Tensor], ModalityData[ImageItem]],
|
||||||
|
) -> Optional[ModalityDataItems[Any, Any]]:
|
||||||
|
if isinstance(data, dict):
|
||||||
|
return DictEmbeddingItems(
|
||||||
|
data,
|
||||||
|
modality="image",
|
||||||
|
required_fields={"pixel_values", "location_coords"},
|
||||||
|
fields_factory=_prithvi_field_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
return super()._parse_image_data(data)
|
||||||
|
|
||||||
|
|
||||||
class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
|
class PrithviGeoSpatialMAEProcessingInfo(BaseProcessingInfo):
|
||||||
|
|
||||||
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
|
||||||
@ -64,26 +102,26 @@ class PrithviGeoSpatialMAEInputBuilder(
|
|||||||
# This model input is fixed and is in the form of a torch Tensor.
|
# This model input is fixed and is in the form of a torch Tensor.
|
||||||
# The size of pixel_values might change in the cases where we resize
|
# The size of pixel_values might change in the cases where we resize
|
||||||
# the input but never exceeds the dimensions below.
|
# the input but never exceeds the dimensions below.
|
||||||
return {
|
image_data = {
|
||||||
"pixel_values": torch.full((6, 512, 512), 1.0,
|
"pixel_values": torch.full((6, 512, 512), 1.0,
|
||||||
dtype=torch.float16),
|
dtype=torch.float16),
|
||||||
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
|
"location_coords": torch.full((1, 2), 1.0, dtype=torch.float16),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return {"image": image_data}
|
||||||
|
|
||||||
|
|
||||||
class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
||||||
|
|
||||||
|
def _get_data_parser(self) -> MultiModalDataParser:
|
||||||
|
return PrithviGeoSpatialMAEMultiModalDataParser()
|
||||||
|
|
||||||
def _get_mm_fields_config(
|
def _get_mm_fields_config(
|
||||||
self,
|
self,
|
||||||
hf_inputs: BatchFeature,
|
hf_inputs: BatchFeature,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
) -> Mapping[str, MultiModalFieldConfig]:
|
) -> Mapping[str, MultiModalFieldConfig]:
|
||||||
return dict(
|
return _prithvi_field_config(hf_inputs)
|
||||||
pixel_values=MultiModalFieldConfig.shared(batch_size=1,
|
|
||||||
modality="image"),
|
|
||||||
location_coords=MultiModalFieldConfig.shared(batch_size=1,
|
|
||||||
modality="image"),
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
self,
|
self,
|
||||||
@ -99,46 +137,32 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
mm_kwargs = {}
|
if "image" in mm_data:
|
||||||
|
image_data = mm_data["image"]
|
||||||
|
else:
|
||||||
|
image_data = mm_data
|
||||||
|
mm_data = {"image": mm_data}
|
||||||
|
|
||||||
for k, v in mm_data.items():
|
mm_items = self._to_mm_items(mm_data)
|
||||||
if isinstance(v, dict) and k == "image":
|
mm_hashes = self._hash_mm_items(mm_items, hf_processor_mm_kwargs,
|
||||||
mm_kwargs.update(v)
|
tokenization_kwargs or {})
|
||||||
else:
|
|
||||||
mm_kwargs[k] = v
|
|
||||||
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
|
mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
|
||||||
|
|
||||||
# This model receives in input a multi-dimensional tensor representing
|
mm_processed_data = BatchFeature(image_data)
|
||||||
# a single image patch and therefore it is not to be split
|
|
||||||
# into multiple elements, but rather to be considered a single one.
|
|
||||||
# Hence, the decision of using a MultiModalSharedField.
|
|
||||||
# The expected shape is (num_channels, width, height).
|
|
||||||
|
|
||||||
# This model however allows the user to also submit multiple image
|
mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
|
||||||
# patches as a batch, adding a further dimension to the above shape.
|
mm_processed_data,
|
||||||
# At this stage we only support submitting one patch per request and
|
self._get_mm_fields_config(mm_processed_data,
|
||||||
# batching is achieved via vLLM batching.
|
hf_processor_mm_kwargs),
|
||||||
# TODO (christian-pinto): enable support for multi patch requests
|
)
|
||||||
# in tandem with vLLM batching.
|
|
||||||
multimodal_kwargs_items = [
|
|
||||||
MultiModalKwargsItem.from_elems([
|
|
||||||
MultiModalFieldElem(
|
|
||||||
modality="image",
|
|
||||||
key=key,
|
|
||||||
data=data,
|
|
||||||
field=MultiModalSharedField(1),
|
|
||||||
) for key, data in mm_kwargs.items()
|
|
||||||
])
|
|
||||||
]
|
|
||||||
|
|
||||||
return MultiModalInputs(
|
return MultiModalInputs(
|
||||||
type="multimodal",
|
type="multimodal",
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_token_ids=[1],
|
prompt_token_ids=[1],
|
||||||
mm_kwargs=MultiModalKwargsItems.from_seq(multimodal_kwargs_items),
|
mm_kwargs=mm_kwargs,
|
||||||
mm_hashes=None,
|
mm_hashes=mm_hashes,
|
||||||
mm_placeholders=mm_placeholders,
|
mm_placeholders=mm_placeholders,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@ -310,7 +310,6 @@ class MultiModalProcessor(BaseMultiModalProcessor[MultiModalProcessingInfo]):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
"""
|
"""
|
||||||
Process multi-modal inputs to be used in vLLM.
|
Process multi-modal inputs to be used in vLLM.
|
||||||
|
|||||||
@ -288,15 +288,12 @@ class VoxtralMultiModalProcessor(BaseMultiModalProcessor[VoxtralProcessingInfo]
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
prompt_ids, mm_info, _ = super()._cached_apply_hf_processor(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: The tokens are already inserted by the chat template
|
# NOTE: The tokens are already inserted by the chat template
|
||||||
|
|||||||
@ -43,7 +43,7 @@ class MultiModalHasher:
|
|||||||
return cls.item_to_bytes(
|
return cls.item_to_bytes(
|
||||||
"image", np.asarray(convert_image_mode(obj, "RGBA")))
|
"image", np.asarray(convert_image_mode(obj, "RGBA")))
|
||||||
if isinstance(obj, torch.Tensor):
|
if isinstance(obj, torch.Tensor):
|
||||||
return cls.item_to_bytes("tensor", obj.numpy())
|
return cls.item_to_bytes("tensor", obj.cpu().numpy())
|
||||||
if isinstance(obj, np.ndarray):
|
if isinstance(obj, np.ndarray):
|
||||||
# If the array is non-contiguous, we need to copy it first
|
# If the array is non-contiguous, we need to copy it first
|
||||||
arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
|
arr_data = obj.data if obj.flags.c_contiguous else obj.tobytes()
|
||||||
|
|||||||
@ -901,7 +901,7 @@ class MultiModalInputs(TypedDict):
|
|||||||
mm_kwargs: MultiModalKwargsItems
|
mm_kwargs: MultiModalKwargsItems
|
||||||
"""Keyword arguments to be directly passed to the model after batching."""
|
"""Keyword arguments to be directly passed to the model after batching."""
|
||||||
|
|
||||||
mm_hashes: Optional["MultiModalHashDict"]
|
mm_hashes: "MultiModalHashDict"
|
||||||
"""The hashes of the multi-modal data."""
|
"""The hashes of the multi-modal data."""
|
||||||
|
|
||||||
mm_placeholders: "MultiModalPlaceholderDict"
|
mm_placeholders: "MultiModalPlaceholderDict"
|
||||||
|
|||||||
@ -998,7 +998,7 @@ A collection of prompt updates with a similar structure as
|
|||||||
|
|
||||||
class MultiModalProcessingInfo(NamedTuple):
|
class MultiModalProcessingInfo(NamedTuple):
|
||||||
kwargs: MultiModalKwargsItems
|
kwargs: MultiModalKwargsItems
|
||||||
hashes: Optional[MultiModalHashes]
|
hashes: MultiModalHashes
|
||||||
prompt_updates: MultiModalPromptUpdates
|
prompt_updates: MultiModalPromptUpdates
|
||||||
|
|
||||||
|
|
||||||
@ -1399,8 +1399,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
(
|
(
|
||||||
prompt_ids,
|
prompt_ids,
|
||||||
@ -1420,9 +1418,8 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
hf_processor_mm_kwargs),
|
hf_processor_mm_kwargs),
|
||||||
)
|
)
|
||||||
|
|
||||||
mm_hashes = (self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
|
mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs)
|
tokenization_kwargs)
|
||||||
if return_mm_hashes else None)
|
|
||||||
|
|
||||||
unbound_prompt_updates = self._get_prompt_updates(
|
unbound_prompt_updates = self._get_prompt_updates(
|
||||||
mm_data_items,
|
mm_data_items,
|
||||||
@ -1446,8 +1443,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_data_items: MultiModalDataItems,
|
mm_data_items: MultiModalDataItems,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Mapping[str, object],
|
tokenization_kwargs: Mapping[str, object],
|
||||||
*,
|
|
||||||
return_mm_hashes: bool,
|
|
||||||
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
) -> tuple[list[int], MultiModalProcessingInfo, bool]:
|
||||||
"""
|
"""
|
||||||
Apply the HF processor on the full prompt text,
|
Apply the HF processor on the full prompt text,
|
||||||
@ -1462,7 +1457,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_data_items=mm_data_items,
|
mm_data_items=mm_data_items,
|
||||||
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs=hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
|
mm_hashes = self._hash_mm_items(mm_data_items, hf_processor_mm_kwargs,
|
||||||
@ -1476,8 +1470,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_hashes=mm_hashes,
|
mm_hashes=mm_hashes,
|
||||||
)
|
)
|
||||||
|
|
||||||
mm_hashes_to_return = mm_hashes if return_mm_hashes else None
|
|
||||||
|
|
||||||
# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
|
# NOTE: `prompt` does not correspond to `mm_missing_data_items`,
|
||||||
# so we can't apply prompt updates until the new multimodal
|
# so we can't apply prompt updates until the new multimodal
|
||||||
# items are combined with the cached multimodal items
|
# items are combined with the cached multimodal items
|
||||||
@ -1515,7 +1507,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
|
|
||||||
mm_info = MultiModalProcessingInfo(
|
mm_info = MultiModalProcessingInfo(
|
||||||
kwargs=mm_kwargs,
|
kwargs=mm_kwargs,
|
||||||
hashes=mm_hashes_to_return,
|
hashes=mm_hashes,
|
||||||
prompt_updates=mm_prompt_updates,
|
prompt_updates=mm_prompt_updates,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -1697,7 +1689,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalInputs:
|
) -> MultiModalInputs:
|
||||||
"""
|
"""
|
||||||
Process multi-modal inputs to be used in vLLM.
|
Process multi-modal inputs to be used in vLLM.
|
||||||
@ -1726,7 +1717,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
|
|||||||
mm_items,
|
mm_items,
|
||||||
hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# NOTE: tokenization_kwargs are not required to init processor
|
# NOTE: tokenization_kwargs are not required to init processor
|
||||||
@ -1811,7 +1801,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
|||||||
mm_data: MultiModalDataDict,
|
mm_data: MultiModalDataDict,
|
||||||
hf_processor_mm_kwargs: Mapping[str, object],
|
hf_processor_mm_kwargs: Mapping[str, object],
|
||||||
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
tokenization_kwargs: Optional[Mapping[str, object]] = None,
|
||||||
return_mm_hashes: bool = False,
|
|
||||||
) -> MultiModalEncDecInputs:
|
) -> MultiModalEncDecInputs:
|
||||||
"""
|
"""
|
||||||
Process multi-modal inputs to be used in vLLM.
|
Process multi-modal inputs to be used in vLLM.
|
||||||
@ -1826,7 +1815,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
|
|||||||
mm_data,
|
mm_data,
|
||||||
hf_processor_mm_kwargs,
|
hf_processor_mm_kwargs,
|
||||||
tokenization_kwargs,
|
tokenization_kwargs,
|
||||||
return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return self._get_enc_dec_inputs(
|
return self._get_enc_dec_inputs(
|
||||||
|
|||||||
@ -17,7 +17,6 @@ from vllm.multimodal.utils import argsort_mm_positions
|
|||||||
from vllm.pooling_params import PoolingParams
|
from vllm.pooling_params import PoolingParams
|
||||||
from vllm.sampling_params import SamplingParams
|
from vllm.sampling_params import SamplingParams
|
||||||
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
from vllm.transformers_utils.tokenizer_group import TokenizerGroup
|
||||||
from vllm.utils import is_list_of
|
|
||||||
from vllm.v1.engine import EngineCoreRequest
|
from vllm.v1.engine import EngineCoreRequest
|
||||||
from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
|
from vllm.v1.engine.mm_input_cache import MultiModalInputCacheClient
|
||||||
from vllm.v1.structured_output.backend_guidance import (
|
from vllm.v1.structured_output.backend_guidance import (
|
||||||
@ -253,13 +252,10 @@ class Processor:
|
|||||||
# 1. Tokenize text prompt, with LoRA request if one exists.
|
# 1. Tokenize text prompt, with LoRA request if one exists.
|
||||||
# 2. For multimodal models with a merged preprocessor, preprocess
|
# 2. For multimodal models with a merged preprocessor, preprocess
|
||||||
# multimodal data and expand prompt token ids accordingly.
|
# multimodal data and expand prompt token ids accordingly.
|
||||||
return_mm_hashes = (self.model_config.processor_return_mm_hashes
|
|
||||||
or bool(self.cache_config.enable_prefix_caching))
|
|
||||||
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess(
|
||||||
prompt,
|
prompt,
|
||||||
tokenization_kwargs=tokenization_kwargs,
|
tokenization_kwargs=tokenization_kwargs,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
return_mm_hashes=return_mm_hashes,
|
|
||||||
)
|
)
|
||||||
from vllm.platforms import current_platform
|
from vllm.platforms import current_platform
|
||||||
current_platform.validate_request(
|
current_platform.validate_request(
|
||||||
@ -302,7 +298,7 @@ class Processor:
|
|||||||
if decoder_inputs["type"] == "multimodal":
|
if decoder_inputs["type"] == "multimodal":
|
||||||
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
|
decoder_mm_inputs = decoder_inputs["mm_kwargs"]
|
||||||
decoder_mm_positions = decoder_inputs["mm_placeholders"]
|
decoder_mm_positions = decoder_inputs["mm_placeholders"]
|
||||||
decoder_mm_hashes = decoder_inputs.get("mm_hashes")
|
decoder_mm_hashes = decoder_inputs["mm_hashes"]
|
||||||
|
|
||||||
# Merge and flatten multimodal placeholders, hashes and inputs
|
# Merge and flatten multimodal placeholders, hashes and inputs
|
||||||
# from dictionaries to lists, and sort them by each item's position
|
# from dictionaries to lists, and sort them by each item's position
|
||||||
@ -317,19 +313,15 @@ class Processor:
|
|||||||
decoder_mm_positions[modality][idx]
|
decoder_mm_positions[modality][idx]
|
||||||
for modality, idx in sorted_mm_idxs
|
for modality, idx in sorted_mm_idxs
|
||||||
]
|
]
|
||||||
sorted_mm_hashes = None if decoder_mm_hashes is None else [
|
sorted_mm_hashes = [
|
||||||
decoder_mm_hashes[modality][idx]
|
decoder_mm_hashes[modality][idx]
|
||||||
for modality, idx in sorted_mm_idxs
|
for modality, idx in sorted_mm_idxs
|
||||||
]
|
]
|
||||||
|
|
||||||
if sorted_mm_hashes is not None:
|
sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
|
||||||
sorted_mm_inputs = self.mm_input_cache_client.get_and_update(
|
orig_sorted_mm_inputs,
|
||||||
orig_sorted_mm_inputs,
|
sorted_mm_hashes,
|
||||||
sorted_mm_hashes,
|
)
|
||||||
)
|
|
||||||
else:
|
|
||||||
assert is_list_of(orig_sorted_mm_inputs, MultiModalKwargsItem)
|
|
||||||
sorted_mm_inputs = orig_sorted_mm_inputs
|
|
||||||
|
|
||||||
return decoder_inputs.get("prompt"), EngineCoreRequest(
|
return decoder_inputs.get("prompt"), EngineCoreRequest(
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user