mirror of https://git.datalinker.icu/vllm-project/vllm.git
[Misc][V1] Avoid using envs.VLLM_USE_V1 in mm processing (#14256)
Signed-off-by: Roger Wang <ywang@roblox.com>
commit ec79b67c77
parent 32985bed7c
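
At a glance: multimodal processing stops branching on the VLLM_USE_V1 environment flag and instead accepts an explicit `return_mm_hashes` keyword (defaulting to False) that callers thread through the preprocessing chain; the V1 engine's Processor is the call site that turns it on. A compact before/after sketch with simplified names, not the real vLLM signatures:

# Simplified before/after of the gate this commit moves; neither function is
# the real vLLM API, they only illustrate where the decision now lives.
import os


def apply_before(mm_items: dict):
    # Old behaviour: a process-wide environment flag decided whether hashes
    # were computed at all.
    if os.environ.get("VLLM_USE_V1") == "1":
        return {m: [hash(repr(i)) for i in items] for m, items in mm_items.items()}
    return None


def apply_after(mm_items: dict, return_mm_hashes: bool = False):
    # New behaviour: the caller decides explicitly, per call.
    if return_mm_hashes:
        return {m: [hash(repr(i)) for i in items] for m, items in mm_items.items()}
    return None
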
@@ -254,6 +254,7 @@ class InputPreprocessor:
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -274,7 +275,8 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)
 
     async def _process_multimodal_async(
         self,
@@ -282,6 +284,7 @@ class InputPreprocessor:
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         # At the moment on model (PrithviGeoSpatialMAE) requires to be
@@ -299,13 +302,15 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}
 
-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)
 
     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
+        return_mm_hashes: bool = False,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -315,6 +320,7 @@ class InputPreprocessor:
         * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
+        * return_mm_hashes: whether to return multimodal hashes
 
         Returns:
 
@@ -349,6 +355,7 @@ class InputPreprocessor:
                multi_modal_data,
                mm_processor_kwargs,
                lora_request=lora_request,
+               return_mm_hashes=return_mm_hashes,
            )
 
        return token_inputs(
@@ -695,6 +702,7 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -706,6 +714,7 @@ class InputPreprocessor:
         * request_id
         * lora_request
         * prompt_adapter_request
+        * return_mm_hashes
 
         Returns:
 
@@ -729,6 +738,7 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
@@ -748,9 +758,13 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be ",
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return self._process_encoder_decoder_prompt(
@@ -768,6 +782,7 @@ class InputPreprocessor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )
 
     async def preprocess_async(
@@ -776,9 +791,13 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Async version of :meth:`preprocess`."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be ",
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return await self._process_encoder_decoder_prompt_async(
@@ -796,4 +815,5 @@ class InputPreprocessor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )
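
The hunks above thread the new keyword through every stage of InputPreprocessor. A condensed sketch of that call chain; unrelated parameters are dropped and the multimodal processor is a stand-in object, so this is illustrative rather than the exact code:

# Condensed sketch of the InputPreprocessor call chain after this commit.
# Unrelated parameters are omitted and mm_processor is any object exposing
# apply(prompt, mm_data, mm_processor_kwargs, return_mm_hashes).
class PreprocessorSketch:

    def __init__(self, mm_processor):
        self.mm_processor = mm_processor

    def preprocess(self, prompt, mm_data, return_mm_hashes: bool = False):
        return self._prompt_to_llm_inputs(prompt, mm_data,
                                          return_mm_hashes=return_mm_hashes)

    def _prompt_to_llm_inputs(self, prompt, mm_data,
                              return_mm_hashes: bool = False):
        return self._process_multimodal(prompt, mm_data,
                                        return_mm_hashes=return_mm_hashes)

    def _process_multimodal(self, prompt, mm_data,
                            return_mm_hashes: bool = False):
        # Mirrors the changes around lines 274 and 299 above: the flag is
        # forwarded positionally to the processor's apply().
        return self.mm_processor.apply(prompt, mm_data, {}, return_mm_hashes)
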
@@ -767,6 +767,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -777,7 +778,8 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
             image_height=-1,
         )
 
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)
 
         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()
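
The Mantis hunks above, and the MiniCPM-V, Mllama and PrithviGeoSpatialMAE hunks below, all make the same mechanical change to the model-specific `apply()` overrides: accept the new defaulted keyword and forward it to `super().apply()`. A sketch of that pattern with placeholder class names:

# Pattern used by the model-specific overrides in this commit; the classes
# are placeholders and only the forwarding of return_mm_hashes reflects the
# actual change.
class BaseProcessorSketch:

    def apply(self, prompt, mm_data, hf_processor_mm_kwargs,
              return_mm_hashes: bool = False):
        return {"mm_hashes": {} if return_mm_hashes else None}


class ModelProcessorSketch(BaseProcessorSketch):

    def apply(self, prompt, mm_data, hf_processor_mm_kwargs,
              return_mm_hashes: bool = False):
        # Model-specific pre/post-processing would happen around this call;
        # the flag itself is passed straight through.
        return super().apply(prompt, mm_data, hf_processor_mm_kwargs,
                             return_mm_hashes)
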
@@ -780,6 +780,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         prompt: Union[str, List[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         supported_mm_modalities = self.info.get_supported_mm_modalities()
         if isinstance(prompt, list):
@@ -791,7 +792,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
                 [index for index, m in enumerate(matches) if m == modality])
             for modality in supported_mm_modalities
         }
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)
         # Exclude <image_id>x</image_id> from placeholders
         if "image" in result["mm_placeholders"] and \
             self.info.get_model_version() == (2, 6):
@@ -175,8 +175,10 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
-        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                                  return_mm_hashes)
 
         # Check that the number of image tokens in the decoder prompt matches
         # the number of images provided in mm_data
@@ -93,6 +93,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         mm_kwargs = {}
 
@@ -14,7 +14,6 @@ from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import assert_never
 
-import vllm.envs as envs
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
@@ -1435,6 +1434,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1451,11 +1451,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         """
         mm_items = self._to_mm_items(mm_data)
 
-        # Create MM hashes (only used in V1)
+        # Create MM hashes to be returned (only used in V1)
         # TODO: Use these hash keys for caching operations in apply_hf_processor
         # instead of rehashing.
 
-        if envs.VLLM_USE_V1:
+        if return_mm_hashes:
             model_id = self.info.model_id
             mm_hashes = {
                 modality: [
@@ -1554,6 +1554,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1567,6 +1568,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
             encoder_prompt,
             mm_data,
             hf_processor_mm_kwargs,
+            return_mm_hashes,
         )
 
         tokenizer = self.info.get_tokenizer()
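
The BaseMultiModalProcessor hunks above carry the substantive change: the `import vllm.envs as envs` line is dropped, hashes are computed only when `return_mm_hashes` is set, and EncDecMultiModalProcessor forwards the flag when delegating to the base class. A minimal sketch of the resulting gate; the hashing helper is a stand-in, not the hashing vLLM actually uses:

# Minimal sketch of the reworked gate in BaseMultiModalProcessor.apply().
# _item_hash is a stand-in for vLLM's real per-item hashing, and the dict
# layout only echoes the structure visible in the diff above.
import hashlib
from typing import Optional


def _item_hash(model_id: str, item: object) -> str:
    return hashlib.sha256(f"{model_id}:{item!r}".encode()).hexdigest()


def apply_sketch(model_id: str, mm_items: dict,
                 return_mm_hashes: bool = False) -> Optional[dict]:
    mm_hashes = None
    if return_mm_hashes:  # previously: if envs.VLLM_USE_V1:
        mm_hashes = {
            modality: [_item_hash(model_id, item) for item in items]
            for modality, items in mm_items.items()
        }
    return mm_hashes
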
@@ -131,6 +131,7 @@ class Processor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=self.use_hash,
         )
         eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
 
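
On the V1 engine side, `Processor` is the call site that actually enables hashing: it passes its own `use_hash` setting, so hashes are produced exactly when the V1 processor needs them rather than whenever the environment flag happens to be set. A caller-side sketch; how `use_hash` is derived here is an assumption for illustration, not the exact V1 logic:

# Caller-side sketch of the V1 Processor deciding once whether it needs
# multimodal hashes and threading that decision through preprocess().
# The derivation of use_hash below is assumed for illustration only.
class V1ProcessorSketch:

    def __init__(self, input_preprocessor, enable_mm_caching: bool):
        self.input_preprocessor = input_preprocessor
        self.use_hash = enable_mm_caching  # assumed derivation

    def process_inputs(self, prompt, request_id: str):
        return self.input_preprocessor.preprocess(
            prompt,
            request_id=request_id,
            return_mm_hashes=self.use_hash,
        )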