From cb1a6f074a9dba3876c7c72850dbdfef8534f8d8 Mon Sep 17 00:00:00 2001
From: bk-201 <joy25810@foxmail.com>
Date: Mon, 13 Oct 2025 02:14:36 +0000
Subject: [PATCH] update

Signed-off-by: bk-201 <joy25810@foxmail.com>
---
 requirements/test.txt                     |  41 +------
 vllm/lora/layers/__init__.py              |   3 +-
 vllm/lora/layers/base_linear.py           |   7 ++
 vllm/lora/layers/utils.py                 |   1 +
 vllm/lora/models.py                       | 127 ++++++++++++----------
 vllm/lora/worker_manager.py               |   4 +-
 vllm/v1/worker/gpu_model_runner.py        |  31 ++++--
 vllm/v1/worker/lora_model_runner_mixin.py |  36 ++++--
 8 files changed, 132 insertions(+), 118 deletions(-)

diff --git a/requirements/test.txt b/requirements/test.txt
index ba72502ff43d1..01a501badb1fc 100644
--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -17,7 +17,6 @@ aiohttp==3.13.0
     #   aiohttp-cors
     #   datasets
     #   fsspec
-    #   gpt-oss
     #   lm-eval
     #   ray
 aiohttp-cors==0.8.1
@@ -45,9 +44,7 @@ argcomplete==3.5.1
 arrow==1.3.0
     # via isoduration
 async-timeout==5.0.1
-    # via
-    #   aiohttp
-    #   redis
+    # via redis
 attrs==24.2.0
     # via
     #   aiohttp
@@ -108,8 +105,6 @@ chardet==5.2.0
     # via mbstrdecoder
 charset-normalizer==3.4.0
     # via requests
-chz==0.3.0
-    # via gpt-oss
 click==8.1.7
     # via
     #   black
@@ -180,9 +175,7 @@ distlib==0.3.9
 dnspython==2.7.0
     # via email-validator
 docker==7.1.0
-    # via
-    #   gpt-oss
-    #   mlflow
+    # via mlflow
 docopt==0.6.2
     # via num2words
 docstring-parser==0.17.0
@@ -208,9 +201,7 @@ eval-type-backport==0.2.2
 evaluate==0.4.3
     # via lm-eval
 fastapi==0.116.1
-    # via
-    #   gpt-oss
-    #   mlflow-skinny
+    # via mlflow-skinny
 fastparquet==2024.11.0
     # via genai-perf
 fastrlock==0.8.2
@@ -285,8 +276,6 @@ google-resumable-media==2.7.2
     # via google-cloud-storage
 googleapis-common-protos==1.70.0
     # via google-api-core
-gpt-oss==0.0.8
-    # via -r requirements/test.in
 graphene==3.4.3
     # via mlflow
 graphql-core==3.2.6
@@ -314,8 +303,6 @@ hf-xet==1.1.7
     # via huggingface-hub
 hiredis==3.0.0
     # via tensorizer
-html2text==2025.4.15
-    # via gpt-oss
 httpcore==1.0.6
     # via httpx
 httpx==0.27.2
@@ -450,7 +437,6 @@ lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b772215
 lxml==5.3.0
     # via
     #   blobfile
-    #   gpt-oss
     #   sacrebleu
 mako==1.3.10
     # via alembic
@@ -620,8 +606,6 @@ omegaconf==2.3.0
     #   lightning
 open-clip-torch==2.32.0
     # via -r requirements/test.in
-openai-harmony==0.0.4
-    # via gpt-oss
 opencensus==0.11.4
     # via ray
 opencensus-context==0.1.3
@@ -793,12 +777,10 @@ pydantic==2.12.0
     #   albumentations
     #   datamodel-code-generator
     #   fastapi
-    #   gpt-oss
     #   lightly
     #   mistral-common
     #   mlflow-skinny
     #   mteb
-    #   openai-harmony
     #   pydantic-extra-types
     #   ray
 pydantic-core==2.41.1
@@ -929,7 +911,6 @@ requests==2.32.3
     #   evaluate
     #   google-api-core
     #   google-cloud-storage
-    #   gpt-oss
     #   huggingface-hub
     #   lightly
     #   lm-eval
@@ -1072,8 +1053,6 @@ starlette-testclient==0.4.1
     # via schemathesis
 statsmodels==0.14.4
     # via genai-perf
-structlog==25.4.0
-    # via gpt-oss
 sympy==1.13.3
     # via
     #   einx
@@ -1088,15 +1067,12 @@ tcolorpy==0.1.6
     # via pytablewriter
 tenacity==9.1.2
     # via
-    #   gpt-oss
     #   lm-eval
     #   plotly
 tensorboardx==2.6.4
     # via lightning
 tensorizer==2.10.1
     # via -r requirements/test.in
-termcolor==3.1.0
-    # via gpt-oss
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
     # via -r requirements/test.in
 threadpoolctl==3.5.0
@@ -1107,7 +1083,6 @@ tifffile==2025.3.30
     #   terratorch
 tiktoken==0.12.0
     # via
-    #   gpt-oss
     #   lm-eval
     #   mistral-common
 timm==1.0.17
@@ -1121,12 +1096,9 @@ tokenizers==0.22.0
     # via
     #   -r requirements/test.in
     #   transformers
-toml==0.10.2
-    # via datamodel-code-generator
 tomli==2.2.1
     # via
-    #   black
-    #   pytest
+    #   coverage
     #   schemathesis
 tomli-w==1.2.0
     # via schemathesis
@@ -1235,7 +1207,6 @@ typing-extensions==4.15.0
     #   aiosignal
     #   albumentations
     #   alembic
-    #   chz
     #   fastapi
     #   graphene
     #   huggingface-hub
@@ -1275,9 +1246,7 @@ urllib3==2.2.3
     #   responses
     #   tritonclient
 uvicorn==0.35.0
-    # via
-    #   gpt-oss
-    #   mlflow-skinny
+    # via mlflow-skinny
 vector-quantize-pytorch==1.21.2
     # via -r requirements/test.in
 virtualenv==20.31.2
diff --git a/vllm/lora/layers/__init__.py b/vllm/lora/layers/__init__.py
index cfbf51922505f..4915ef85f4f73 100644
--- a/vllm/lora/layers/__init__.py
+++ b/vllm/lora/layers/__init__.py
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.lora.layers.base import BaseLayerWithLoRA, PunicaWrapperBase
+from vllm.lora.layers.base import BaseLayerWithLoRA
 from vllm.lora.layers.column_parallel_linear import (
     ColumnParallelLinearWithLoRA,
     ColumnParallelLinearWithShardedLoRA,
@@ -36,5 +36,4 @@ __all__ = [
     "RowParallelLinearWithShardedLoRA",
     "ReplicatedLinearWithLoRA",
     "LoRAMapping",
-    "PunicaWrapperBase",
 ]
diff --git a/vllm/lora/layers/base_linear.py b/vllm/lora/layers/base_linear.py
index da053f0923aba..e2b7a247f9d36 100644
--- a/vllm/lora/layers/base_linear.py
+++ b/vllm/lora/layers/base_linear.py
@@ -124,6 +124,9 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
     ) -> torch.Tensor:
         output = self.base_layer.quant_method.apply(self.base_layer, x, bias)
 
+        # Store original shape for later reshaping
+        original_shape = output.shape if output.ndim == 3 else None
+
         # In transformers backend, x and output have extra batch dimension like
         # (1, seq_len, hidden_dim), while punica expects (seq_len, hidden_dim),
         # therefore we need to flatten the batch dimensions.
@@ -137,6 +140,10 @@ class BaseLinearLayerWithLoRA(BaseLayerWithLoRA):
         if not current_platform.can_update_inplace():
             output = lora_output
 
+        # Restore original shape if it was flattened
+        if original_shape is not None:
+            output = output.reshape(original_shape)
+
         return output
 
     @property
diff --git a/vllm/lora/layers/utils.py b/vllm/lora/layers/utils.py
index 2da90f180ee74..002dc934636b9 100644
--- a/vllm/lora/layers/utils.py
+++ b/vllm/lora/layers/utils.py
@@ -12,6 +12,7 @@ class LoRAMapping:
     index_mapping: tuple[int, ...]
     prompt_mapping: tuple[int, ...]
     is_prefill: bool = False
+    is_mm_input: bool = False
 
     def __post_init__(self):
         self.index_mapping = tuple(self.index_mapping)
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index b78a7d8332fcd..2b27a67680624 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -12,10 +12,10 @@ from torch import nn
 
 from vllm.config.lora import LoRAConfig, ModelConfig
 from vllm.logger import init_logger
-from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping, PunicaWrapperBase
+from vllm.lora.layers import BaseLayerWithLoRA, LoRAMapping
 from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
 from vllm.lora.peft_helper import PEFTHelper
-from vllm.lora.punica_wrapper import get_punica_wrapper
+from vllm.lora.punica_wrapper import PunicaWrapperBase, get_punica_wrapper
 from vllm.lora.utils import (
     from_layer,
     from_layer_logits_processor,
@@ -30,8 +30,8 @@ from vllm.model_executor.models import SupportsLoRA, supports_multimodal
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.model_executor.utils import get_packed_modules_mapping
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.utils import is_pin_memory_available
 from vllm.utils.cache import LRUCache
 
@@ -378,17 +378,18 @@ class LoRAModelManager:
             supports_multimodal(self.model)
             # In case the model only supports LoRA for
             # text modules (e.g. ChatGLM)
-            and hasattr(self.model, "get_mm_mapping"))
+            and hasattr(self.model, "get_mm_mapping")
+        )
         # For v0 compatibility
         if model_config is not None:
             self.mm_registry = MULTIMODAL_REGISTRY
-            self.info = self.mm_registry.create_processor(
-                model_config, disable_cache=True).info
+            self.info = self.mm_registry.create_processor(model_config).info
             self.supports_mm_lora = self.supports_mm and hasattr(
-                self.info, "get_num_mm_encoder_tokens")
+                self.info, "get_num_mm_encoder_tokens"
+            )
         else:
             self.supports_mm_lora = False
-        if self.supports_mm_lora: # 从init传进来就可以了，不需要model_config了
+        if self.supports_mm_lora:  # 从init传进来就可以了，不需要model_config了
             self.mm_mapping: MultiModelKeys = self.model.get_mm_mapping()
             self.mm_config = model_config.multimodal_config
             # limit_per_prompt: int = max(
@@ -399,10 +400,8 @@ class LoRAModelManager:
             # max_num_batched_tokens = encoder_budget
             # max_batches = max_batches * limit_per_prompt
             self.mm_punica_wrapper_mapping = {
-                name:
-                get_punica_wrapper(
-                    self.info.get_num_mm_encoder_tokens(
-                        max_num_batched_tokens),
+                name: get_punica_wrapper(
+                    self.info.get_num_mm_encoder_tokens(max_num_batched_tokens),
                     max_batches=self.max_num_seqs * limit_per_prompt,
                     device=self.device,
                     max_loras=self.lora_config.max_loras,
@@ -411,16 +410,11 @@ class LoRAModelManager:
             }
             # For language model
             self.mm_punica_wrapper_mapping.update(
-                {
-                    self.mm_mapping.language_model[0]: self.punica_wrapper
-                }
+                {self.mm_mapping.language_model[0]: self.punica_wrapper}
             )
             # TODO Connector is not supported at the moment.
             self.mm_punica_wrapper_mapping.update(
-                {
-                    name: None
-                    for name in self.mm_mapping.connector
-                }
+                {name: None for name in self.mm_mapping.connector}
             )
 
         self.is_pooling_model = is_pooling_model(self.model)
@@ -512,28 +506,27 @@ class LoRAModelManager:
                 self.lora_slots + 1,
                 self.vocab_size,
                 self.lora_config.lora_extra_vocab_size,
-                self.long_lora_context,
             )
         elif mapping.is_mm_input:
             self.mm_punica_wrapper_mapping[
-                self.mm_mapping.tower_model[0]].update_metadata(
-                    mapping,
-                    self.lora_index_to_id,
-                    self.lora_slots + 1,
-                    self.vocab_size,
-                    self.lora_config.lora_extra_vocab_size,
-                    self.long_lora_context,
-                )
+                self.mm_mapping.tower_model[0]
+            ].update_metadata(
+                mapping,
+                self.lora_index_to_id,
+                self.lora_slots + 1,
+                self.vocab_size,
+                self.lora_config.lora_extra_vocab_size,
+            )
         else:
             self.mm_punica_wrapper_mapping[
-                self.mm_mapping.language_model[0]].update_metadata(
-                    mapping,
-                    self.lora_index_to_id,
-                    self.lora_slots + 1,
-                    self.vocab_size,
-                    self.lora_config.lora_extra_vocab_size,
-                    self.long_lora_context,
-                )
+                self.mm_mapping.language_model[0]
+            ].update_metadata(
+                mapping,
+                self.lora_index_to_id,
+                self.lora_slots + 1,
+                self.vocab_size,
+                self.lora_config.lora_extra_vocab_size,
+            )
 
     def remove_all_adapters(self):
         """Remove all LoRAModels from the manager."""
@@ -613,8 +606,7 @@ class LoRAModelManager:
             self._register_packed_modules(module_name)
             # All lora layers share the same punica_wrapper based on reference.
             if self.supports_mm_lora:
-                new_module.set_mapping(
-                    self._get_mm_punica_wrapper(module_name))
+                new_module.set_mapping(self._get_mm_punica_wrapper(module_name))
             else:
                 new_module.set_mapping(self.punica_wrapper)
 
@@ -711,22 +703,23 @@ class LoRAModelManager:
         if self.supports_mm:
             prefix_lst = self.mm_mapping.connector + self.mm_mapping.tower_model
             if self.supports_mm_lora:
-
                 return self._get_mm_punica_wrapper(module_name) is None
             else:
-                return any(
-                    [module_name.startswith(prefix) for prefix in prefix_lst])
+                return any([module_name.startswith(prefix) for prefix in prefix_lst])
         return False
 
-    def _get_mm_punica_wrapper(self, module_name: str) -> PunicaWrapperBase:
+    def _get_mm_punica_wrapper(self, module_name: str) -> Optional[PunicaWrapperBase]:
         """
-        Match the corresponding punica_wrapper based on module_name, 
+        Match the corresponding punica_wrapper based on module_name,
         and return None if lora is not supported for this module.
         """
         if self.supports_mm_lora:
             # Ensure matching by the longest prefix.
-            sorted_prefixes = sorted(self.mm_punica_wrapper_mapping.keys(), 
-                                     key=lambda x: len(x), reverse=True)
+            sorted_prefixes = sorted(
+                self.mm_punica_wrapper_mapping.keys(),
+                key=lambda x: len(x),
+                reverse=True,
+            )
 
             for prefix in sorted_prefixes:
                 if module_name.startswith(prefix):
@@ -834,12 +827,25 @@ class LoRALRUCache(AdapterLRUCache[LoRAModel]):
 class LRUCacheLoRAModelManager(LoRAModelManager):
     """A model manager that manages multiple LoRAs with LRU cache."""
 
-    def __init__(self, model: nn.Module, max_num_seqs: int,
-                 max_num_batched_tokens: int, vocab_size: int,
-                 lora_config: LoRAConfig, model_config: ModelConfig,
-                 device: torch.device):
-        super().__init__(model, max_num_seqs, max_num_batched_tokens,
-                         vocab_size, lora_config, model_config, device)
+    def __init__(
+        self,
+        model: nn.Module,
+        max_num_seqs: int,
+        max_num_batched_tokens: int,
+        vocab_size: int,
+        lora_config: LoRAConfig,
+        model_config: ModelConfig,
+        device: torch.device,
+    ):
+        super().__init__(
+            model,
+            max_num_seqs,
+            max_num_batched_tokens,
+            vocab_size,
+            lora_config,
+            model_config,
+            device,
+        )
         self._registered_adapters: LoRALRUCache = LoRALRUCache(
             self.capacity, self.deactivate_adapter
         )
@@ -906,15 +912,16 @@ class LRUCacheLoRAModelManager(LoRAModelManager):
 
 
 def create_lora_manager(
-        model: nn.Module,
-        max_num_seqs: int,
-        max_num_batched_tokens: int,
-        vocab_size: int,
-        lora_config: LoRAConfig,
-        model_config: ModelConfig,
-        device: torch.device,
-        lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
-        **kwargs) -> LoRAModelManager:
+    model: nn.Module,
+    max_num_seqs: int,
+    max_num_batched_tokens: int,
+    vocab_size: int,
+    lora_config: LoRAConfig,
+    model_config: ModelConfig,
+    device: torch.device,
+    lora_manager_cls: type[LoRAModelManager] = LoRAModelManager,
+    **kwargs,
+) -> LoRAModelManager:
     """Create a LoRA adapter for a given model."""
     if not isinstance(model, SupportsLoRA):
         raise ValueError(f"Model {type(model)} is not supported for LoRA.")
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 4a4772037ee72..d717e81792fed 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -6,7 +6,7 @@ from typing import Any, Literal, Optional, Union
 
 import torch
 
-from vllm.config import VllmConfig, ModelConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.logger import init_logger
 from vllm.lora.models import (
     LoRAModel,
@@ -71,6 +71,7 @@ class WorkerLoRAManager:
     def create_lora_manager(
         self,
         model: torch.nn.Module,
+        model_config: Optional[ModelConfig] = None,
     ) -> Any:
         lora_manager = create_lora_manager(
             model,
@@ -80,6 +81,7 @@ class WorkerLoRAManager:
             lora_config=self.lora_config,
             device=self.device,
             lora_manager_cls=self._manager_cls,
+            model_config=model_config,
         )
         self._adapter_manager = lora_manager
         return lora_manager.model
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index c270fd9bce23a..c446103aac629 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -512,6 +512,13 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             pin_memory=self.pin_memory,
         )
 
+        # Multimodal LoRA support
+        if self.supports_mm_inputs:
+            self.info = self.mm_registry.create_processor(self.model_config).info
+            self.supports_mm_lora = hasattr(self.info, "get_num_mm_encoder_tokens")
+        else:
+            self.supports_mm_lora = False
+
     def reset_mm_cache(self) -> None:
         if self.mm_budget:
             self.mm_budget.reset_cache()
@@ -571,15 +578,6 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         )
         return model_kwargs
 
-        # Multimodal LoRA support
-        if self.is_multimodal_model:
-            self.info = self.mm_registry.create_processor(
-                self.model_config, disable_cache=True).info
-            self.supports_mm_lora = hasattr(self.info,
-                                            "get_num_mm_encoder_tokens")
-        else:
-            self.supports_mm_lora = False
-
     def _may_reorder_batch(self, scheduler_output: "SchedulerOutput") -> None:
         """
         Update the order of requests in the batch based on the attention
@@ -1751,6 +1749,19 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         # encoder outputs.
         model = cast(SupportsMultiModal, self.model)
         encoder_outputs = []
+
+        if self.lora_config and self.supports_mm_lora:
+            mm_tokens = [
+                self.info.get_num_mm_encoder_tokens(pos_info.length)
+                for _, pos_info in mm_hashes_pos
+            ]
+            num_scheduled_tokens = np.array(mm_tokens, dtype=np.int32)
+            self.set_active_loras(
+                self.input_batch,
+                num_scheduled_tokens,
+                is_mm_input=True,
+            )
+
         for modality, num_items, mm_kwargs_group in group_mm_kwargs_by_modality(
             mm_kwargs,
             device=self.device,
@@ -2903,7 +2914,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
             )
             if self.lora_config:
                 self.model = self.load_lora_model(
-                    self.model, self.vllm_config, self.device
+                    self.model, self.vllm_config, self.device, self.model_config
                 )
             if hasattr(self, "drafter"):
                 logger.info("Loading drafter model...")
diff --git a/vllm/v1/worker/lora_model_runner_mixin.py b/vllm/v1/worker/lora_model_runner_mixin.py
index 36a2a0124fbfc..48ae88a9850b7 100644
--- a/vllm/v1/worker/lora_model_runner_mixin.py
+++ b/vllm/v1/worker/lora_model_runner_mixin.py
@@ -11,7 +11,7 @@ import numpy as np
 import torch
 import torch.nn as nn
 
-from vllm.config import VllmConfig
+from vllm.config import ModelConfig, VllmConfig
 from vllm.config.lora import LoRAConfig
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
@@ -29,7 +29,11 @@ logger = init_logger(__name__)
 # Defined as a mixin for GPUModelRunner
 class LoRAModelRunnerMixin:
     def load_lora_model(
-        self, model: nn.Module, vllm_config: VllmConfig, device: torch.device
+        self,
+        model: nn.Module,
+        vllm_config: VllmConfig,
+        device: torch.device,
+        model_config: ModelConfig = None,
     ) -> nn.Module:
         if not supports_lora(model):
             raise ValueError(f"{model.__class__.__name__} does not support LoRA yet.")
@@ -54,7 +58,7 @@ class LoRAModelRunnerMixin:
         prompt_lora_mapping: tuple[int, ...],
         token_lora_mapping: tuple[int, ...],
         lora_requests: set[LoRARequest],
-        is_mm_input: bool = False
+        is_mm_input: bool = False,
     ) -> None:
         self._ensure_lora_enabled()
 
@@ -63,7 +67,10 @@ class LoRAModelRunnerMixin:
         # On cuda platforms we use the same kernels for prefill and
         # decode and this flag is generally ignored.
         lora_mapping = LoRAMapping(
-            token_lora_mapping, prompt_lora_mapping, is_prefill=True, is_mm_input=is_mm_input
+            token_lora_mapping,
+            prompt_lora_mapping,
+            is_prefill=True,
+            is_mm_input=is_mm_input,
         )
         self.lora_manager.set_active_adapters(lora_requests, lora_mapping)
 
@@ -72,7 +79,10 @@ class LoRAModelRunnerMixin:
             raise RuntimeError("LoRA is not enabled. Use --enable-lora to enable LoRA.")
 
     def set_active_loras(
-        self, input_batch: InputBatch, num_scheduled_tokens: np.ndarray, is_mm_input: bool = False
+        self,
+        input_batch: InputBatch,
+        num_scheduled_tokens: np.ndarray,
+        is_mm_input: bool = False,
     ) -> None:
         prompt_lora_mapping: tuple[int, ...]  # of size input_batch.num_reqs
         token_lora_mapping: tuple[int, ...]  # of size np.sum(num_scheduled_tokens)
@@ -122,7 +132,10 @@ class LoRAModelRunnerMixin:
 
     @contextmanager
     def maybe_select_dummy_loras(
-        self, lora_config: Optional[LoRAConfig], num_scheduled_tokens: np.ndarray, is_mm_input: bool = False
+        self,
+        lora_config: Optional[LoRAConfig],
+        num_scheduled_tokens: np.ndarray,
+        is_mm_input: bool = False,
     ):
         if lora_config is None:
             yield
@@ -151,7 +164,10 @@ class LoRAModelRunnerMixin:
             }
 
             self._set_active_loras(
-                tuple(prompt_lora_mapping), tuple(token_lora_mapping), lora_requests, is_mm_input
+                tuple(prompt_lora_mapping),
+                tuple(token_lora_mapping),
+                lora_requests,
+                is_mm_input,
             )
 
             yield
@@ -162,11 +178,13 @@ class LoRAModelRunnerMixin:
         lora_config: Optional[LoRAConfig],
         num_scheduled_tokens: np.ndarray,
         remove_lora: bool = True,
-        is_mm_input: bool = False
+        is_mm_input: bool = False,
     ):
         with (
             self.maybe_setup_dummy_loras(lora_config, remove_lora),
-            self.maybe_select_dummy_loras(lora_config, num_scheduled_tokens, is_mm_input),
+            self.maybe_select_dummy_loras(
+                lora_config, num_scheduled_tokens, is_mm_input
+            ),
         ):
             yield