From f114b4e14346c1466531f7928be718735dd45eb7 Mon Sep 17 00:00:00 2001
From: bk-201 <joy25810@foxmail.com>
Date: Tue, 23 Dec 2025 01:47:18 +0000
Subject: [PATCH] disable mm cache when enable_tower_connector_lora

Signed-off-by: bk-201 <joy25810@foxmail.com>
---
 tests/lora/test_qwenvl.py         | 126 ++++++++++++++++++------------
 vllm/engine/arg_utils.py          |  13 +++
 vllm/v1/engine/input_processor.py |  41 +++++-----
 3 files changed, 106 insertions(+), 74 deletions(-)

diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py
index 4b3b92447789f..5f84bfee2c1e2 100644
--- a/tests/lora/test_qwenvl.py
+++ b/tests/lora/test_qwenvl.py
@@ -15,10 +15,11 @@ class TestConfig:
     max_num_seqs: int = 2
     max_loras: int = 2
     max_lora_rank: int = 32
-    enable_tower_connector_lora: bool = True
+    enable_tower_connector_lora: bool = False
     max_model_len: int = 8192
     gpu_memory_utilization: float = 0.85
     mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_cache_gb: float = 4
 
     def __post_init__(self):
         if self.mm_processor_kwargs is None:
@@ -54,6 +55,7 @@ class Qwen2VLTester:
             trust_remote_code=True,
             gpu_memory_utilization=self.config.gpu_memory_utilization,
             mm_processor_kwargs=self.config.mm_processor_kwargs,
+            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
             max_model_len=self.config.max_model_len,
         )
 
@@ -62,6 +64,7 @@ class Qwen2VLTester:
         images: list[ImageAsset],
         expected_outputs: list[str],
         lora_id: int | None = None,
+        lora_name: str | None = None,
         temperature: float = 0,
         max_tokens: int = 5,
     ):
@@ -77,7 +80,9 @@ class Qwen2VLTester:
             for asset in images
         ]
 
-        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
+        lora_request = LoRARequest(
+            lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
+        )
         outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
         generated_texts = [output.outputs[0].text.strip() for output in outputs]
         # Validate outputs
@@ -207,59 +212,15 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
         tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
 
 
-def test_qwen2vl_language_lora(qwen2vl_language_lora_files):
-    """
-    Test language-only LoRA adapter.
-    """
-    config = TestConfig(
-        model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_language_lora_files
-    )
-    tester = Qwen2VLTester(config)
-    for lora_id in [1, 2]:
-        tester.run_test(
-            TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_LANGUAGE, lora_id=lora_id
-        )
-
-
-def test_qwen2vl_vision_lora(qwen2vl_vision_tower_connector_lora_files):
-    """
-    Test vision tower + connector LoRA adapter.
-    """
-    config = TestConfig(
-        model_path=QWEN2VL_MODEL_PATH,
-        lora_path=qwen2vl_vision_tower_connector_lora_files,
-    )
-    tester = Qwen2VLTester(config)
-    for lora_id in [1, 2]:
-        tester.run_test(
-            TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS_VISION, lora_id=lora_id
-        )
-
-
-def test_qwen2vl_vision_no_connector_lora(
-    qwen2vl_vision_tower_lora_files,
-):
-    """
-    Test vision tower only LoRA adapter.
-
-    """
-    config = TestConfig(
-        model_path=QWEN2VL_MODEL_PATH,
-        lora_path=qwen2vl_vision_tower_lora_files,
-    )
-    tester = Qwen2VLTester(config)
-    for lora_id in [1, 2]:
-        tester.run_test(
-            TEST_IMAGES,
-            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
-            lora_id=lora_id,
-        )
-
-
 def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
     config = TestConfig(
         model_path=QWEN25VL_MODEL_PATH,
         lora_path=qwen25vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
     )
     tester = Qwen2VLTester(config)
     for lora_id in [1, 2]:
@@ -274,6 +235,11 @@ def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
     config = TestConfig(
         model_path=QWEN3VL_MODEL_PATH,
         lora_path=qwen3vl_vision_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
     )
     tester = Qwen2VLTester(config)
     for lora_id in [1, 2]:
@@ -282,3 +248,61 @@ def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
             expected_outputs=EXPECTED_OUTPUTS_VISION_QWEN3_VL,
             lora_id=lora_id,
         )
+
+
+def test_qwen2vl_multiple_lora_types(
+    qwen2vl_language_lora_files,
+    qwen2vl_vision_tower_connector_lora_files,
+    qwen2vl_vision_tower_lora_files,
+):
+    """
+    Test multiple LoRA adapter types (language, vision tower + connector,
+    vision tower only) using the same LLM instance to verify mm_encoder_cache
+    behavior with different LoRA requests.
+
+    By reusing the same LLM instance across different LoRA requests, we ensure that
+    the multimodal encoder cache correctly manages state transitions between
+    language-only and vision-enabled LoRA adapters.
+    """
+    config = TestConfig(
+        model_path=QWEN2VL_MODEL_PATH,
+        # We'll override the lora_path for each specific test, but need to provide
+        # an initial path for initialization
+        lora_path=qwen2vl_language_lora_files,
+        # Currently, tower_connector_lora is incompatible with
+        # the multi-modal processor cache.
+        # TODO: Remove this restriction
+        mm_processor_cache_gb=0,
+        enable_tower_connector_lora=True,
+    )
+    tester = Qwen2VLTester(config)
+
+    # Test 1: Language-only LoRA adapter
+    tester.config.lora_path = qwen2vl_language_lora_files
+    for lora_id in [1, 2]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
+            lora_id=lora_id,
+            lora_name="language_only",
+        )
+
+    # Test 2: Vision tower + connector LoRA adapter
+    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
+    for lora_id in [3, 4]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION,
+            lora_id=lora_id,
+            lora_name="vision_tower_connector",
+        )
+
+    # Test 3: Vision tower only LoRA adapter (no connector)
+    tester.config.lora_path = qwen2vl_vision_tower_lora_files
+    for lora_id in [5, 6]:
+        tester.run_test(
+            TEST_IMAGES,
+            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
+            lora_id=lora_id,
+            lora_name="vision_tower",
+        )
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b222b63853c9f..24c4f1d91638e 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1647,6 +1647,19 @@ class EngineArgs:
             else None
         )
 
+        if (
+            lora_config is not None
+            and lora_config.enable_tower_connector_lora
+            and self.mm_processor_cache_gb != 0
+        ):
+            raise ValueError(
+                "Currently, enable_tower_connector_lora is "
+                "incompatible with the multi-modal processor cache. "
+                "When enable_tower_connector_lora is set, "
+                "mm_processor_cache_gb must be 0, got %s",
+                self.mm_processor_cache_gb,
+            )
+
         if (
             lora_config is not None
             and speculative_config is not None
diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py
index 3bccebe612571..8717e7e24d7ae 100644
--- a/vllm/v1/engine/input_processor.py
+++ b/vllm/v1/engine/input_processor.py
@@ -406,6 +406,20 @@ class InputProcessor:
             mm_uuids[modality] = [f"{request_id}-{modality}-{i}" for i in range(n)]
         return mm_uuids
 
+    def _get_mm_identifier(
+        self,
+        mm_hash: str,
+        lora_request: LoRARequest | None,
+    ) -> str:
+        """
+        When enable_tower_connector_lora is True, multi-modal embeddings
+        vary depending on the LoRA request. Therefore, the mm_hash must be
+        generated based on the LoRA request to prevent incorrect cache hits.
+        """
+        if lora_request is None or not self.lora_config.enable_tower_connector_lora:
+            return mm_hash
+        return f"{lora_request.lora_name}:{mm_hash}"
+
     def process_inputs(
         self,
         request_id: str,
@@ -458,28 +472,6 @@ class InputProcessor:
             else:
                 mm_uuids = None
 
-        # When enable_tower_connector_lora is True, multi-modal embeddings
-        # vary depending on the LoRA request. Therefore, the mm_hash must be
-        # generated based on the LoRA request to prevent incorrect cache hits.
-        lora_config = self.lora_config
-        if (
-            mm_uuids
-            and lora_request
-            and lora_config
-            and lora_config.enable_tower_connector_lora
-        ):
-
-            def add_mm_lora_prefix(val):
-                if isinstance(val, list):
-                    return [
-                        f"{lora_request.lora_name}:{v}" if v is not None else None
-                        for v in val
-                    ]
-                else:
-                    return f"{lora_request.lora_name}:{val}"
-
-            mm_uuids = {k: add_mm_lora_prefix(v) for k, v in mm_uuids.items()}
-
         # Process inputs, which includes:
         # 1. Tokenize text prompt, with LoRA request if one exists.
         # 2. For multimodal models with a merged preprocessor, preprocess
@@ -548,7 +540,10 @@ class InputProcessor:
                     MultiModalFeatureSpec(
                         data=decoder_mm_inputs[modality][idx],
                         modality=modality,
-                        identifier=decoder_mm_hashes[modality][idx],
+                        identifier=self._get_mm_identifier(
+                            decoder_mm_hashes[modality][idx],
+                            lora_request,
+                        ),
                         mm_position=decoder_mm_positions[modality][idx],
                     )
                 )