[Misc][V1] Avoid using envs.VLLM_USE_V1 in mm processing (#14256)

Signed-off-by: Roger Wang <ywang@roblox.com>
Author: Roger Wang
Date: 2025-03-04 23:37:16 -08:00 (committed by GitHub)
Parent: 32985bed7c
Commit: ec79b67c77
7 changed files with 38 additions and 8 deletions
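
This commit removes a hidden dependency on the VLLM_USE_V1 environment flag inside multimodal processing and replaces it with an explicit `return_mm_hashes` keyword argument, threaded from the engine front-end down to `BaseMultiModalProcessor.apply`. A minimal sketch of that refactor pattern, using toy names rather than vLLM's actual classes:

```python
import os


# Before: the processor decides by peeking at a process-wide env flag,
# so callers cannot control the behavior per call.
def apply_with_env_flag(prompt: str) -> dict:
    want_hashes = os.environ.get("VLLM_USE_V1") == "1"  # hidden dependency
    return {"prompt": prompt, "mm_hashes": {} if want_hashes else None}


# After: the caller states its intent explicitly; the default (False)
# preserves the old V0 behavior for callers that pass nothing.
def apply_with_param(prompt: str, return_mm_hashes: bool = False) -> dict:
    return {"prompt": prompt, "mm_hashes": {} if return_mm_hashes else None}


if __name__ == "__main__":
    assert apply_with_param("hi")["mm_hashes"] is None  # V0 default unchanged
    assert apply_with_param("hi", return_mm_hashes=True)["mm_hashes"] == {}
```

The explicit parameter keeps V0 behavior unchanged by default while letting the V1 engine opt in per call.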


@@ -254,6 +254,7 @@ class InputPreprocessor:
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -274,7 +275,8 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}

-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)

     async def _process_multimodal_async(
         self,
@@ -282,6 +284,7 @@ class InputPreprocessor:
         mm_data: MultiModalDataDict,
         mm_processor_kwargs: Optional[Mapping[str, object]],
         lora_request: Optional[LoRARequest],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """Async version of :meth:`_process_multimodal`."""
         # At the moment one model (PrithviGeoSpatialMAE) requires to be
@@ -299,13 +302,15 @@ class InputPreprocessor:
         if mm_processor_kwargs is None:
             mm_processor_kwargs = {}

-        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs)
+        return mm_processor.apply(prompt, mm_data, mm_processor_kwargs,
+                                  return_mm_hashes)

     def _prompt_to_llm_inputs(
         self,
         prompt: SingletonPrompt,
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
+        return_mm_hashes: bool = False,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -315,6 +320,7 @@ class InputPreprocessor:
         * request_id
         * prompt: single encoder or decoder input prompt
         * lora_request: this is only valid for decoder prompts
+        * return_mm_hashes: whether to return multimodal hashes

         Returns:
@@ -349,6 +355,7 @@ class InputPreprocessor:
                 multi_modal_data,
                 mm_processor_kwargs,
                 lora_request=lora_request,
+                return_mm_hashes=return_mm_hashes,
             )

         return token_inputs(
@@ -695,6 +702,7 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -706,6 +714,7 @@ class InputPreprocessor:
         * request_id
         * lora_request
         * prompt_adapter_request
+        * return_mm_hashes

         Returns:
@@ -729,6 +738,7 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> DecoderOnlyInputs:
         """Async version of :meth:`_process_decoder_only_prompt`."""
         prompt_comps = await self._prompt_to_llm_inputs_async(
@@ -748,9 +758,13 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be "
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return self._process_encoder_decoder_prompt(
@@ -768,6 +782,7 @@ class InputPreprocessor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )

     async def preprocess_async(
@@ -776,9 +791,13 @@ class InputPreprocessor:
         request_id: str,
         lora_request: Optional[LoRARequest] = None,
         prompt_adapter_request: Optional[PromptAdapterRequest] = None,
+        return_mm_hashes: bool = False,
     ) -> ProcessorInputs:
         """Async version of :meth:`preprocess`."""
         if self.model_config.is_encoder_decoder:
+            assert not return_mm_hashes, (
+                "Multimodal hashes for encoder-decoder models should not be "
+                "returned until they are supported on vLLM V1.")
             # Encoder-decoder model requires special mapping of
             # input prompts to encoder & decoder
             return await self._process_encoder_decoder_prompt_async(
@@ -796,4 +815,5 @@ class InputPreprocessor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=return_mm_hashes,
         )


@@ -767,6 +767,7 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         hf_config = self.info.get_hf_config()
         image_token_id = hf_config.image_token_index
@@ -777,7 +778,8 @@ class MantisMultiModalProcessor(LlavaMultiModalProcessor):
             image_height=-1,
         )

-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)

         mm_items = self._to_mm_items(mm_data)
         mm_item_counts = mm_items.get_all_counts()


@@ -780,6 +780,7 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
         prompt: Union[str, List[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         supported_mm_modalities = self.info.get_supported_mm_modalities()
         if isinstance(prompt, list):
@@ -791,7 +792,8 @@ class MiniCPMVMultiModalProcessor(BaseMultiModalProcessor[_I]):
                 [index for index, m in enumerate(matches) if m == modality])
             for modality in supported_mm_modalities
         }
-        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                               return_mm_hashes)

         # Exclude <image_id>x</image_id> from placeholders
         if "image" in result["mm_placeholders"] and \
                 self.info.get_model_version() == (2, 6):


@@ -175,8 +175,10 @@ class MllamaMultiModalProcessor(EncDecMultiModalProcessor[MllamaProcessingInfo]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
-        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs)
+        mm_inputs = super().apply(prompt, mm_data, hf_processor_mm_kwargs,
+                                  return_mm_hashes)

         # Check that the number of image tokens in the decoder prompt matches
         # the number of images provided in mm_data


@@ -93,6 +93,7 @@ class PrithviGeoSpatialMAEMultiModalProcessor(BaseMultiModalProcessor):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         mm_kwargs = {}


@@ -14,7 +14,6 @@ from typing import (TYPE_CHECKING, Generic, NamedTuple, Optional, Protocol,
 from transformers import BatchFeature, PretrainedConfig, ProcessorMixin
 from typing_extensions import assert_never

-import vllm.envs as envs
 from vllm.inputs import InputProcessingContext
 from vllm.logger import init_logger
 from vllm.transformers_utils.tokenizer import (AnyTokenizer, decode_tokens,
@@ -1435,6 +1434,7 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1451,11 +1451,11 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         """
         mm_items = self._to_mm_items(mm_data)

-        # Create MM hashes (only used in V1)
+        # Create MM hashes to be returned (only used in V1)
         # TODO: Use these hash keys for caching operations in apply_hf_processor
         # instead of rehashing.
-        if envs.VLLM_USE_V1:
+        if return_mm_hashes:
             model_id = self.info.model_id
             mm_hashes = {
                 modality: [
@@ -1554,6 +1554,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         prompt: Union[str, list[int]],
         mm_data: MultiModalDataDict,
         hf_processor_mm_kwargs: Mapping[str, object],
+        return_mm_hashes: bool = False,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -1567,6 +1568,7 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
             encoder_prompt,
             mm_data,
             hf_processor_mm_kwargs,
+            return_mm_hashes,
         )

         tokenizer = self.info.get_tokenizer()
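
For reference, the gating that the hunk above moves from `envs.VLLM_USE_V1` to the new parameter boils down to computing per-modality content hashes only when asked. A hedged sketch of that shape, with `hashlib` standing in for vLLM's actual hashing helper:

```python
import hashlib
from typing import Optional


def build_mm_hashes(mm_items: dict[str, list],
                    return_mm_hashes: bool) -> Optional[dict[str, list[str]]]:
    """Hash each multimodal item per modality, or skip entirely (V0 path)."""
    if not return_mm_hashes:
        return None
    return {
        modality: [
            hashlib.sha256(repr(item).encode()).hexdigest()
            for item in items
        ]
        for modality, items in mm_items.items()
    }
```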


@@ -131,6 +131,7 @@ class Processor:
             request_id=request_id,
             lora_request=lora_request,
             prompt_adapter_request=prompt_adapter_request,
+            return_mm_hashes=self.use_hash,
         )

         eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request)
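
With this change, the decision to request hashes lives in V1's `Processor` (via `self.use_hash`) rather than in an env check buried inside shared processing code. An illustrative sketch only; how `use_hash` is actually derived is an assumption here:

```python
class ToyV1Processor:
    """Toy stand-in for vLLM V1's Processor; names are illustrative."""

    def __init__(self, input_preprocessor, mm_cache_enabled: bool):
        self.input_preprocessor = input_preprocessor
        # Assumption: hashing is wanted whenever the V1 engine can reuse
        # cached multimodal inputs; the real derivation may differ.
        self.use_hash = mm_cache_enabled

    def process_inputs(self, prompt, request_id: str):
        # The V1 front-end opts in explicitly instead of the processor
        # consulting envs.VLLM_USE_V1 internally.
        return self.input_preprocessor.preprocess(
            prompt,
            request_id=request_id,
            return_mm_hashes=self.use_hash,
        )
```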