From 81b5ace12810dce1c61bd67636f38846fb9b9e90 Mon Sep 17 00:00:00 2001
From: bk-201
Date: Sun, 21 Dec 2025 04:14:11 +0000
Subject: [PATCH] revert lora_kwargs change

Signed-off-by: bk-201
---
 vllm/inputs/preprocess.py         | 12 ------------
 vllm/multimodal/processing.py     | 11 -----------
 vllm/v1/engine/input_processor.py | 14 --------------
 3 files changed, 37 deletions(-)

diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py
index 8707fc310033c..0372b06d0017f 100644
--- a/vllm/inputs/preprocess.py
+++ b/vllm/inputs/preprocess.py
@@ -246,7 +246,6 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> MultiModalInputs:
         """
         Apply the model's multi-modal processor to a multi-modal prompt,
@@ -263,7 +262,6 @@ class InputPreprocessor:
             hf_processor_mm_kwargs=mm_processor_kwargs,
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         mm_hashes = mm_input["mm_hashes"]
@@ -361,7 +359,6 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> TokenInputs | MultiModalInputs:
         prompt_text = parsed_content["prompt"]
 
@@ -373,7 +370,6 @@ class InputPreprocessor:
                 parsed_content.get("mm_processor_kwargs") or {},
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
-                lora_kwargs=lora_kwargs,
             )
         else:
             prompt_token_ids = self._tokenize_prompt(
@@ -393,7 +389,6 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> SingletonInputs:
         """
         Extract the singleton inputs from a prompt.
@@ -420,7 +415,6 @@ class InputPreprocessor:
                 parsed["content"],
                 tokenization_kwargs=tokenization_kwargs,
                 mm_uuids=mm_uuids,
-                lora_kwargs=lora_kwargs,
             )
         if parsed["type"] == "str":
             return self._process_text(
@@ -632,7 +626,6 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> DecoderOnlyInputs:
         """
         For decoder-only models:
@@ -652,7 +645,6 @@ class InputPreprocessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         return self._build_decoder_only_llm_inputs(prompt_comps)
@@ -663,7 +655,6 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> ProcessorInputs:
         if self.model_config.is_encoder_decoder:
             # Encoder-decoder model requires special mapping of
@@ -685,7 +676,6 @@ class InputPreprocessor:
             cast(SingletonPrompt, prompt),
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
     def preprocess(
@@ -694,14 +684,12 @@ class InputPreprocessor:
         tokenization_kwargs: dict[str, Any] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> ProcessorInputs:
         """Preprocess the input prompt."""
         res = self._preprocess(
             prompt,
             tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         if self.mm_processor_cache and self.mm_cache_stats is not None:
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index 39e476156542c..3bbdab3b393c5 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -1672,7 +1672,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         tokenization_kwargs: Mapping[str, object],
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> MultiModalHashes:
         """Create MM hashes to be returned.
 
@@ -1684,7 +1683,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         hashes: MultiModalHashes = {}
 
         mm_uuids = mm_uuids or {}
-        lora_kwargs = lora_kwargs or {}
 
         for modality, items in mm_items.items():
             if modality in mm_uuids:
@@ -1705,7 +1703,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                         item_uuid is None
                         or hf_processor_mm_kwargs
                         or tokenization_kwargs
-                        or lora_kwargs
                     ):
                         # NOTE: use provided hash string to hash with kwargs
                         # if available for better performance.
@@ -1716,7 +1713,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                                 **{modality: item},
                                 **hf_processor_mm_kwargs,
                                 **tokenization_kwargs,
-                                **lora_kwargs,
                             )
                         )
                     else:
@@ -1729,7 +1725,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
                         **{modality: item},
                         **hf_processor_mm_kwargs,
                         **tokenization_kwargs,
-                        **lora_kwargs,
                     )
                     for item in items
                 ]
@@ -1888,7 +1883,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         tokenization_kwargs: Mapping[str, object],
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> tuple[list[int], MultiModalProcessingInfo, bool]:
         """
         Apply the HF processor on the full prompt text,
@@ -1911,7 +1905,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             hf_processor_mm_kwargs,
             tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         mm_is_cached, mm_missing_data_items = self._get_cache_missing_items(
@@ -2122,7 +2115,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
         tokenization_kwargs: Mapping[str, object] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> MultiModalInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -2152,7 +2144,6 @@ class BaseMultiModalProcessor(ABC, Generic[_I]):
             hf_processor_mm_kwargs,
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         # NOTE: tokenization_kwargs are not required to init processor
@@ -2233,7 +2224,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
         tokenization_kwargs: Mapping[str, object] | None = None,
         *,
         mm_uuids: MultiModalUUIDDict | None = None,
-        lora_kwargs: dict[str, Any] | None = None,
     ) -> MultiModalEncDecInputs:
         """
         Process multi-modal inputs to be used in vLLM.
@@ -2249,7 +2239,6 @@ class EncDecMultiModalProcessor(BaseMultiModalProcessor[_I]):
             hf_processor_mm_kwargs,
             tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         return self._get_enc_dec_inputs(
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 01d0b7f50f45e..29293877cb69d 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -5,8 +5,6 @@ import time
 from collections.abc import Mapping
 from typing import Any, Literal, cast
 
-import msgspec
-
 from vllm.config import VllmConfig
 from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs
 from vllm.inputs.parse import split_enc_dec_inputs
@@ -460,17 +458,6 @@ class InputProcessor:
         else:
             mm_uuids = None
 
-        # When enable_tower_connector_lora is True, multi-modal embeddings
-        # vary depending on the LoRA request. Therefore, the mm_hash must be
-        # generated based on the LoRA request to prevent incorrect cache hits.
-        lora_config = self.lora_config
-        lora_kwargs = (
-            msgspec.structs.asdict(lora_request)
-            if lora_request and lora_config and lora_config.enable_tower_connector_lora
-            else {}
-        )
-        lora_kwargs = {k: v for k, v in lora_kwargs.items() if v is not None}
-
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
@@ -479,7 +466,6 @@ class InputProcessor:
             prompt,
             tokenization_kwargs=tokenization_kwargs,
             mm_uuids=mm_uuids,
-            lora_kwargs=lora_kwargs,
         )
 
         from vllm.platforms import current_platform